# 이상치 유지(제거 X) 버전

# 1. 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.display import Image

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error

from model import *
from sklearn.tree import plot_tree

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
padua_data = pd.read_csv('../data/kideny_PADUA_upload.csv')
renal_data = pd.read_csv('../data/kideny_RENAL_upload.csv')
treat_data = pd.read_csv('../data/kidney_treat_upload.csv')

# 2. 데이터 정리
중복 컬럼 삭제(종양의 크기, 외성장 비율) 및 데이터 합치기

In [3]:
# padua_data, renal_data 데이터 merge
merged_data = pd.merge(padua_data, renal_data, on='serial')
# padua 에서 중복 컬럼 삭제
merged_data = merged_data.drop(['Tumor_PADUA', 'Exophytic_PADUA'], axis=1)
# treat 데이터 merge
merged_result = pd.merge(merged_data, treat_data, on='serial', how='left')
# serial을 인덱스로 설정
merged_result.set_index('serial', inplace=True)

In [4]:
merged_result = merged_result.rename(columns={'Radius_RENAL': 'T_size', 'Exophytic_RENAL':'T_exophytic',
                            'Rim_PADUA':'T_surface_loc','Polarlocation_PADUA':'T_height_loc',
                            'Collectingsystem_PADUA':'T_GM_over','Sinus_PADUA':'T_Sinus_dt',
                            'Nearness_RENAL':'T_near_Ureter','APX_RENAL':'T_APX','Location_RENAL':'T_rel_Ureter','RAIV_1': 'RAIV'
                            })

In [5]:
merged_result = merged_result.dropna()

In [6]:
merged_result['Total'] = merged_result[['PADUA_total', 'RENAL_total']].sum(axis=1)
merged_result.drop(columns=['PADUA_total', 'RENAL_total'], inplace=True)

In [7]:
# 새로운 컬럼(T_size + T_Exophytic = T_size_exo )
merged_result['T_size_exo'] = merged_result['T_size'] + merged_result['T_exophytic']

In [8]:
merged_result = merged_result[['T_surface_loc', 'T_Sinus_dt', 'T_height_loc', 'T_GM_over', 'T_size', 'T_exophytic', 'T_near_Ureter', 'T_rel_Ureter', 'T_APX', 'Total', 'T_size_exo', 'RAIV']]
merged_result

Unnamed: 0_level_0,T_surface_loc,T_Sinus_dt,T_height_loc,T_GM_over,T_size,T_exophytic,T_near_Ureter,T_rel_Ureter,T_APX,Total,T_size_exo,RAIV
serial,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,12,3.0,51.1
2,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,13,3.0,46.1
3,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,10,2.0,14.8
4,1.0,1.0,2.0,2.0,1.0,1.0,3.0,3.0,2.0,13,2.0,19.7
5,1.0,1.0,2.0,1.0,1.0,2.0,1.0,3.0,1.0,12,3.0,14.8
...,...,...,...,...,...,...,...,...,...,...,...,...
408,1.0,1.0,2.0,1.0,1.0,2.0,1.0,3.0,1.0,12,3.0,21.3
409,1.0,1.0,2.0,1.0,1.0,2.0,1.0,3.0,1.0,12,3.0,30.5
410,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,11,3.0,70.7
411,2.0,2.0,1.0,2.0,1.0,2.0,2.0,1.0,2.0,15,3.0,12.6


In [9]:
merged_result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 389 entries, 1 to 412
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   T_surface_loc  389 non-null    float64
 1   T_Sinus_dt     389 non-null    float64
 2   T_height_loc   389 non-null    float64
 3   T_GM_over      389 non-null    float64
 4   T_size         389 non-null    float64
 5   T_exophytic    389 non-null    float64
 6   T_near_Ureter  389 non-null    float64
 7   T_rel_Ureter   389 non-null    float64
 8   T_APX          389 non-null    float64
 9   Total          389 non-null    int64  
 10  T_size_exo     389 non-null    float64
 11  RAIV           389 non-null    float64
dtypes: float64(11), int64(1)
memory usage: 39.5 KB


RAIV80이상인 데이터없는 데이터 새로 만들어줌
(merged_data는 살려둠)

# 3. 모델링

## 01. 데이터 나누기(train_test_split)

In [10]:
X = merged_result.drop(['RAIV'], axis=1)  # 'RAIV_1' 열을 제외한 모든 열 선택
y = merged_result['RAIV']

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

## X 데이터(RAIV X)로 군집 0 과 1로 나눔 (이상치가 포함됨)

In [31]:
cluster_range = range(2, 11)

optimal_clusters = 0
max_silhouette_score = -1

for n_clusters in cluster_range:
    
    kmeans = KMeans(n_clusters=n_clusters, random_state=13)
    
    kmeans.fit(X)
    
    clusters = kmeans.predict(X)
    
    silhouette_avg = silhouette_score(X, clusters)
    
    if silhouette_avg > max_silhouette_score:
        max_silhouette_score = silhouette_avg
        optimal_clusters = n_clusters

# Print the optimal number of clusters
print("Optimal number of clusters:", optimal_clusters)

Optimal number of clusters: 2


In [32]:
kmeans = KMeans(n_clusters=2, random_state=13)
kmeans.fit(X)

cluster = kmeans.predict(X)          # X데이터 군집화

print('cluster_labels :', cluster)

cluster_labels : [0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0
 1 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1
 0 1 1 0 1 1 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 0 1 0 1 1 1 1 0 1
 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1
 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 0 1 1 0 1 1 1 0 0 1 0 1 1
 0 1 1 1 1 1 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 0 0 0
 0 1 1 0 0 0 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 0 1
 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1 0 1
 1 0 1 1 1 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0
 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0]


In [33]:
# 군집 레이블 생성
labels = kmeans.labels_

In [34]:
# 군집0
X_0 = X[labels == 0]
y_0 = y[labels == 0]

#군집1
X_1 = X[labels == 1]
y_1 = y[labels == 1]

In [35]:
y_0

serial
1      51.1
2      46.1
3      14.8
4      19.7
5      14.8
       ... 
407    24.6
408    21.3
409    30.5
410    70.7
412    11.8
Name: RAIV, Length: 205, dtype: float64

## 군집 0 데이터 나누기

In [36]:
from sklearn.model_selection import train_test_split

X_tr_0, X_test_0, y_tr_0, y_test_0 = train_test_split(X_0, y_0, test_size=0.2, random_state=13)

In [37]:
tr_result_mae = []
test_result_mae = []
names = []

for name, model in models:
    # 모델 훈련
    model.fit(X_tr_0, y_tr_0)
    names.append(name)

    # 예측
    pred_tr = model.predict(X_tr_0)
    pred_test = model.predict(X_test_0)

    # MAE 계산
    mae_tr = mean_absolute_error(y_tr_0, pred_tr)
    mae_test = mean_absolute_error(y_test_0, pred_test)

    # 결과를 리스트에 추가
    tr_result_mae.append(mae_tr)
    test_result_mae.append(mae_test)

# 결과를 DataFrame으로 정리
result_pd_0 = pd.DataFrame({
    'Model': names,
    'train MAE_0': tr_result_mae,
    'test MAE_0': test_result_mae
})

result_pd_0

NameError: name 'models' is not defined

## 군집 1 데이터 나누기

In [None]:
from sklearn.model_selection import train_test_split

X_tr_1, X_test_1, y_tr_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=13)

In [None]:
tr_result_mae = []
test_result_mae = []
names = []

for name, model in models:
    # 모델 훈련
    model.fit(X_tr_1, y_tr_1)
    names.append(name)

    # 예측
    pred_tr = model.predict(X_tr_1)
    pred_test = model.predict(X_test_1)

    # MAE 계산
    mae_tr = mean_absolute_error(y_tr_1, pred_tr)
    mae_test = mean_absolute_error(y_test_1, pred_test)

    # 결과를 리스트에 추가
    tr_result_mae.append(mae_tr)
    test_result_mae.append(mae_test)

# 결과를 DataFrame으로 정리
result_pd_1 = pd.DataFrame({
    'Model': names,
    'train MAE_1': tr_result_mae,
    'test MAE_1': test_result_mae
})

# 출력 형식을 변경하여 지수 표기법 대신 소수점 형태로 표시
pd.options.display.float_format = '{:.5f}'.format

result_pd_1

Unnamed: 0,Model,train MAE_1,test MAE_1
0,LinearRegression,16.02185,19570010619446.332
1,DecisionTreeRegressor,14.76804,16.06774
2,RandomForestRegressor,9.19249,17.65304


In [None]:
result_data = pd.merge(result_pd, result_pd_0, on='Model')
merged_result_data = pd.merge(result_data, result_pd_1, on='Model')
merged_result_data

Unnamed: 0,Model,train MAE,test MAE,train MAE_0,test MAE_0,train MAE_1,test MAE_1
0,LinearRegression,15.89753,18.38438,16.47123,16.20082,16.02185,19570010619446.332
1,DecisionTreeRegressor,15.24716,19.10187,15.856,14.57521,14.76804,16.06774
2,RandomForestRegressor,9.52291,18.98277,11.43087,12.58443,9.19249,17.65304
