# 이상치 유지(제거 X) 버전

# 1. 데이터 불러오기

In [31]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Image
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import mean_absolute_error, mean_squared_error

from model import *
from sklearn.tree import plot_tree

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings(action='ignore')

In [32]:
# Load the three source tables: PADUA scores, RENAL scores and treatment data.
# NOTE(review): the first two file names spell 'kideny' while the third spells
# 'kidney' - confirm against the actual files on disk before "fixing" either.
padua_data = pd.read_csv(filepath_or_buffer='../data/kideny_PADUA_upload.csv')
renal_data = pd.read_csv(filepath_or_buffer='../data/kideny_RENAL_upload.csv')
treat_data = pd.read_csv(filepath_or_buffer='../data/kidney_treat_upload.csv')

# 2. 데이터 정리
중복 컬럼 삭제(종양의 크기, 외성장 비율) 및 데이터 합치기

In [33]:
# Join the PADUA and RENAL tables on the shared 'serial' key (default inner join).
merged_data = padua_data.merge(renal_data, on='serial')
# Drop the PADUA copies of the duplicated columns (tumor size, exophytic rate).
merged_data = merged_data.drop(columns=['Tumor_PADUA', 'Exophytic_PADUA'])
# Attach the treatment table; the left join keeps every row of merged_data.
merged_result = merged_data.merge(treat_data, on='serial', how='left')
# Use 'serial' as the row index.
merged_result = merged_result.set_index('serial')

In [34]:
# Mapping from the raw PADUA/RENAL column names to the short feature names
# used in the rest of the notebook.
column_aliases = {
    'Radius_RENAL': 'T_size',
    'Exophytic_RENAL': 'T_exophytic',
    'Rim_PADUA': 'T_surface_loc',
    'Polarlocation_PADUA': 'T_height_loc',
    'Collectingsystem_PADUA': 'T_GM_over',
    'Sinus_PADUA': 'T_Sinus_dt',
    'Nearness_RENAL': 'T_near_Ureter',
    'APX_RENAL': 'T_APX',
    'Location_RENAL': 'T_rel_Ureter',
    'RAIV_1': 'RAIV',
}
merged_result = merged_result.rename(columns=column_aliases)

In [35]:
# Discard every row that has a missing value in any column.
merged_result = merged_result.dropna(axis=0, how='any')

In [36]:
# Row-wise sum of the two scoring-system totals.
total_score = merged_result[['PADUA_total', 'RENAL_total']].sum(axis='columns')
# Replace the two source columns with the combined 'Total' column (appended last).
merged_result = (
    merged_result
    .drop(columns=['PADUA_total', 'RENAL_total'])
    .assign(Total=total_score)
)

In [37]:
# New column: T_size_exo = T_size + T_exophytic (element-wise).
merged_result = merged_result.assign(
    T_size_exo=merged_result['T_size'].add(merged_result['T_exophytic'])
)

In [38]:
# Columns to keep, in display order; RAIV (the value modelled later) goes last.
ordered_columns = [
    'T_surface_loc', 'T_Sinus_dt', 'T_height_loc', 'T_GM_over',
    'T_size', 'T_exophytic', 'T_near_Ureter', 'T_rel_Ureter', 'T_APX',
    'Total', 'T_size_exo', 'RAIV',
]
merged_result = merged_result.loc[:, ordered_columns]
# Bare expression so the notebook renders the resulting frame.
merged_result

Unnamed: 0_level_0,T_surface_loc,T_Sinus_dt,T_height_loc,T_GM_over,T_size,T_exophytic,T_near_Ureter,T_rel_Ureter,T_APX,Total,T_size_exo,RAIV
serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,12,3.0,51.1
2,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,13,3.0,46.1
3,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,10,2.0,14.8
4,1.0,1.0,2.0,2.0,1.0,1.0,3.0,3.0,2.0,13,2.0,19.7
5,1.0,1.0,2.0,1.0,1.0,2.0,1.0,3.0,1.0,12,3.0,14.8
...,...,...,...,...,...,...,...,...,...,...,...,...
408,1.0,1.0,2.0,1.0,1.0,2.0,1.0,3.0,1.0,12,3.0,21.3
409,1.0,1.0,2.0,1.0,1.0,2.0,1.0,3.0,1.0,12,3.0,30.5
410,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,11,3.0,70.7
411,2.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,15,3.0,12.6


In [39]:
# Print the column dtypes and non-null counts of the cleaned frame.
merged_result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 389 entries, 1 to 412
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   T_surface_loc  389 non-null    float64
 1   T_Sinus_dt     389 non-null    float64
 2   T_height_loc   389 non-null    float64
 3   T_GM_over      389 non-null    float64
 4   T_size         389 non-null    float64
 5   T_exophytic    389 non-null    float64
 6   T_near_Ureter  389 non-null    float64
 7   T_rel_Ureter   389 non-null    float64
 8   T_APX          389 non-null    float64
 9   Total          389 non-null    int64  
 10  T_size_exo     389 non-null    float64
 11  RAIV           389 non-null    float64
dtypes: float64(11), int64(1)
memory usage: 39.5 KB


RAIV가 80 이상인 데이터를 제외한 데이터셋을 새로 만드는 단계이나, 이 이상치 유지 버전에서는 수행하지 않음
(merged_data는 그대로 살려둠)

# 3. 모델링

## 01. K-means 군집화(merged_result(RAIV 컬럼 포함))

In [40]:
# Candidate cluster counts to compare.
cluster_range = range(2, 11)

# NOTE(review): merged_result still contains the RAIV column and is not scaled
# here - confirm that is intended, since the per-cluster models below predict RAIV.
silhouette_by_k = {}
for n_clusters in cluster_range:
    # Fit K-means with a fixed seed, then label the same rows it was fitted on.
    kmeans = KMeans(n_clusters=n_clusters, random_state=13).fit(merged_result)
    clusters = kmeans.predict(merged_result)

    # Mean silhouette coefficient for this cluster count.
    silhouette_avg = silhouette_score(merged_result, clusters)
    silhouette_by_k[n_clusters] = silhouette_avg

# max() returns the first key with the highest score, so ties resolve to the
# smallest cluster count.
optimal_clusters = max(silhouette_by_k, key=silhouette_by_k.get)
max_silhouette_score = silhouette_by_k[optimal_clusters]

# Print the optimal number of clusters
print("Optimal number of clusters:", optimal_clusters)

Optimal number of clusters: 2


In [41]:
# Run the final clustering with two clusters and the same seed as the search.
# NOTE(review): the cluster count is hard-coded to 2 rather than read from
# optimal_clusters; the two-way split below relies on exactly two labels.
kmeans = KMeans(n_clusters=2, random_state=13).fit(merged_result)

# Cluster label assigned to each row of merged_result.
labels = kmeans.labels_

# Split the frame by label (all columns are kept, including RAIV).
re_cluster_0 = merged_result.loc[labels == 0]   # rows labelled 0
re_cluster_1 = merged_result.loc[labels == 1]   # rows labelled 1

In [42]:
# Columns plotted on the x-axis, one figure per column. 'RAIV' is kept in the
# list, so the last figure plots RAIV against itself.
variable_names = ['T_surface_loc', 'T_Sinus_dt', 'T_height_loc', 'T_GM_over', 'T_size', 'T_exophytic', 'T_near_Ureter', 'T_rel_Ureter', 'T_APX', 'Total', 'T_size_exo','RAIV']

# (legend label, frame) pairs drawn in this order in every figure. Each label
# matches its frame's variable name; the cluster-1 trace was previously
# labelled 'mer_cluster_1', which did not match 're_cluster_0'.
cluster_frames = [
    ('re_cluster_0', re_cluster_0),
    ('re_cluster_1', re_cluster_1),
]

for variable in variable_names:
    # One marker trace per cluster: the current column on x, RAIV on y.
    traces = [
        go.Scatter(
            x=frame[variable],
            y=frame['RAIV'],
            mode='markers',
            name=label,
        )
        for label, frame in cluster_frames
    ]

    # Title and axis labels for this column.
    layout = go.Layout(
        title="Scatter Plot of {} vs RAIV".format(variable),
        xaxis=dict(title=variable),
        yaxis=dict(title='RAIV')
    )

    # Build the figure from both traces and render it.
    fig = go.Figure(data=traces, layout=layout)
    fig.show()

In [43]:
# Print the row count and column dtypes of the rows labelled 0.
re_cluster_0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 307 entries, 3 to 412
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   T_surface_loc  307 non-null    float64
 1   T_Sinus_dt     307 non-null    float64
 2   T_height_loc   307 non-null    float64
 3   T_GM_over      307 non-null    float64
 4   T_size         307 non-null    float64
 5   T_exophytic    307 non-null    float64
 6   T_near_Ureter  307 non-null    float64
 7   T_rel_Ureter   307 non-null    float64
 8   T_APX          307 non-null    float64
 9   Total          307 non-null    int64  
 10  T_size_exo     307 non-null    float64
 11  RAIV           307 non-null    float64
dtypes: float64(11), int64(1)
memory usage: 31.2 KB


In [44]:
# Print the row count and column dtypes of the rows labelled 1.
re_cluster_1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82 entries, 1 to 410
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   T_surface_loc  82 non-null     float64
 1   T_Sinus_dt     82 non-null     float64
 2   T_height_loc   82 non-null     float64
 3   T_GM_over      82 non-null     float64
 4   T_size         82 non-null     float64
 5   T_exophytic    82 non-null     float64
 6   T_near_Ureter  82 non-null     float64
 7   T_rel_Ureter   82 non-null     float64
 8   T_APX          82 non-null     float64
 9   Total          82 non-null     int64  
 10  T_size_exo     82 non-null     float64
 11  RAIV           82 non-null     float64
dtypes: float64(11), int64(1)
memory usage: 8.3 KB


## 데이터비율
v1 이상치를 제거하지 않은 데이터 : 군집0 > 군집1 (307:82)

## 02. 군집 0 모델링

In [45]:
from sklearn.model_selection import train_test_split

# RAIV is the regression target; every other column of cluster 0 is a feature.
y = re_cluster_0['RAIV']
X = re_cluster_0.drop(columns=['RAIV'])

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

- DecisionTreeRegressor

In [46]:
# Use grid search to find the best hyperparameters.
# (grid_search, tr_dt and evaluate come from the star-imported `model` module.)
best_model_dt, best_params_dt = grid_search(X_train, y_train, "DecisionTreeRegressor")

# Train a model with the best hyperparameters.
best_dt = tr_dt(X_train, y_train, **best_params_dt)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_dt, X_test, y_test)

MAE: 7.514935799255276
RMSE: 8.470621379888627


- RandomForestRegressor

In [47]:
# Use grid search to find the best hyperparameters.
best_model_rf, best_params_rf = grid_search(X_train, y_train, "RandomForestRegressor")

# Train a model with the best hyperparameters.
best_rf = tr_rf(X_train, y_train, **best_params_rf)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_rf, X_test, y_test)

MAE: 7.391764776608082
RMSE: 8.34565855867337


- KNeighborsRegressor

In [48]:
# Use grid search to find the best hyperparameters.
best_model_knn, best_params_knn = grid_search(X_train, y_train, "KNeighborsRegressor")

# Train a model with the best hyperparameters.
best_knn = tr_knn(X_train, y_train, **best_params_knn)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_knn, X_test, y_test)

MAE: 7.342688172043011
RMSE: 8.349147875735346


- XGBRegressor

In [49]:
# Use grid search to find the best hyperparameters.
best_model_xgb, best_params_xgb = grid_search(X_train, y_train, "XGBoostRegressor")

# Train a model with the best hyperparameters.
best_xgb = tr_xgb(X_train, y_train, **best_params_xgb)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_xgb, X_test, y_test)

MAE: 7.06217490780738
RMSE: 8.128736826249625


- AdaBoostRegressor

In [50]:
# Use grid search to find the best hyperparameters.
best_model_ada, best_params_ada = grid_search(X_train, y_train, "AdaBoostRegressor")

# Train a model with the best hyperparameters.
best_ada = tr_adaboost(X_train, y_train, **best_params_ada)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_ada, X_test, y_test)

MAE: 7.428050601896336
RMSE: 8.331973321521623


- LGBMRegressor

In [51]:
# Use grid search to find the best hyperparameters.
# (The captured output of this cell is dominated by LightGBM [Info] log lines.)
best_model_lgb, best_params_lgb = grid_search(X_train, y_train, "LightGBM")

# Train a model with the best hyperparameters.
best_lgb = tr_lgb(X_train, y_train, **best_params_lgb)

# Evaluate the tuned model on the test split.
evaluate(best_lgb, X_test, y_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 11
[LightGBM] [Info] Start training from score 17.188265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 11
[LightGBM] [Info] Start training from score 17.767857
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 50
[LightGBM] [Info] Number of data points in the train set: 196, n

- GradientBoostingRegressor

In [52]:
# Use grid search to find the best hyperparameters.
best_model_gbm, best_params_gbm = grid_search(X_train, y_train, "GradientBoostingRegressor")

# Train a model with the best hyperparameters.
best_gbm = tr_gbm(X_train, y_train, **best_params_gbm)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_gbm, X_test, y_test)

MAE: 7.107870631480871
RMSE: 8.154410190351088


## 03. 군집 1 모델링

In [53]:
from sklearn.model_selection import train_test_split

# RAIV is the regression target; every other column of cluster 1 is a feature.
y = re_cluster_1['RAIV']
X = re_cluster_1.drop(columns=['RAIV'])

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

- DecisionTreeRegressor

In [54]:
# Use grid search to find the best hyperparameters.
# (grid_search, tr_dt and evaluate come from the star-imported `model` module.)
best_model_dt, best_params_dt = grid_search(X_train, y_train, "DecisionTreeRegressor")

# Train a model with the best hyperparameters.
best_dt = tr_dt(X_train, y_train, **best_params_dt)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_dt, X_test, y_test)

MAE: 21.971266968325796
RMSE: 31.78767978448787


- RandomForestRegressor

In [55]:
# Use grid search to find the best hyperparameters.
best_model_rf, best_params_rf = grid_search(X_train, y_train, "RandomForestRegressor")

# Train a model with the best hyperparameters.
best_rf = tr_rf(X_train, y_train, **best_params_rf)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_rf, X_test, y_test)

MAE: 22.636070541161956
RMSE: 33.76234673462514


- KNeighborsRegressor

In [56]:
# Use grid search to find the best hyperparameters.
best_model_knn, best_params_knn = grid_search(X_train, y_train, "KNeighborsRegressor")

# Train a model with the best hyperparameters.
best_knn = tr_knn(X_train, y_train, **best_params_knn)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_knn, X_test, y_test)

MAE: 22.78941176470588
RMSE: 32.454396964863506


- XGBRegressor

In [57]:
# Use grid search to find the best hyperparameters.
best_model_xgb, best_params_xgb = grid_search(X_train, y_train, "XGBoostRegressor")

# Train a model with the best hyperparameters.
best_xgb = tr_xgb(X_train, y_train, **best_params_xgb)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_xgb, X_test, y_test)

MAE: 21.565481971291934
RMSE: 31.617604590424214


- AdaBoostRegressor

In [58]:
# Use grid search to find the best hyperparameters.
best_model_ada, best_params_ada = grid_search(X_train, y_train, "AdaBoostRegressor")

# Train a model with the best hyperparameters.
best_ada = tr_adaboost(X_train, y_train, **best_params_ada)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_ada, X_test, y_test)

MAE: 22.501377589076604
RMSE: 33.454947976756245


- LGBMRegressor

In [59]:
# Use grid search to find the best hyperparameters.
# (The captured output of this cell is dominated by LightGBM [Info] log lines.)
best_model_lgb, best_params_lgb = grid_search(X_train, y_train, "LightGBM")

# Train a model with the best hyperparameters.
best_lgb = tr_lgb(X_train, y_train, **best_params_lgb)

# Evaluate the tuned model on the test split.
evaluate(best_lgb, X_test, y_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24
[LightGBM] [Info] Number of data points in the train set: 52, number of used features: 5
[LightGBM] [Info] Start training from score 63.876922
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000073 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 52, number of used features: 5
[LightGBM] [Info] Start training from score 62.684615
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23
[LightGBM] [Info] Number of data points in the train set: 52, number of used features: 5
[LightGBM] [Info] Start training from score

- GradientBoostingRegressor

In [60]:
# Use grid search to find the best hyperparameters.
best_model_gbm, best_params_gbm = grid_search(X_train, y_train, "GradientBoostingRegressor")

# Train a model with the best hyperparameters.
best_gbm = tr_gbm(X_train, y_train, **best_params_gbm)

# Evaluate the tuned model on the test split (the cell output shows MAE and RMSE).
evaluate(best_gbm, X_test, y_test)

MAE: 20.67060114102536
RMSE: 31.321967722606292
