### Test_01 로지스틱 회귀분석

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

- train.csv [파일]
```
ID : 샘플별 고유 ID
Age : 환자의 나이
Gender : 성별
Country : 국적
Race : 인종
Family_Background : 가족력 여부
Radiation_History : 방사선 노출 이력
Iodine_Deficiency : 요오드 결핍 여부
Smoke : 흡연 여부
Weight_Risk : 체중 관련 위험도
Diabetes : 당뇨병 여부
Nodule_Size : 갑상선 결절 크기
TSH_Result : TSH 호르몬 검사 결과
T4_Result : T4 호르몬 검사 결과
T3_Result : T3 호르몬 검사 결과
Cancer : 갑상선암 여부 (0: 양성, 1: 악성)
```

### 범주형 데이터인 변수
```
Gender : 성별
Country : 국적
Race : 인종
Family_Background : 가족력 여부
Radiation_History : 방사선 노출 이력
Iodine_Deficiency : 요오드 결핍 여부
Smoke : 흡연 여부
Weight_Risk : 체중 관련 위험도
Diabetes : 당뇨병 여부
```

In [5]:
# 학습 데이터
train = pd.read_csv('open/train.csv')
# 검증 데이터
test = pd.read_csv('open/test.csv')

In [6]:
train.isnull().sum()
train.isna().sum()
# 결측치, null 값 둘 다 없음

ID                   0
Age                  0
Gender               0
Country              0
Race                 0
Family_Background    0
Radiation_History    0
Iodine_Deficiency    0
Smoke                0
Weight_Risk          0
Diabetes             0
Nodule_Size          0
TSH_Result           0
T4_Result            0
T3_Result            0
Cancer               0
dtype: int64

In [7]:
train.head()

Unnamed: 0,ID,Age,Gender,Country,Race,Family_Background,Radiation_History,Iodine_Deficiency,Smoke,Weight_Risk,Diabetes,Nodule_Size,TSH_Result,T4_Result,T3_Result,Cancer
0,TRAIN_00000,80,M,CHN,ASN,Positive,Exposed,Sufficient,Non-Smoker,Not Obese,No,0.650355,2.784735,6.744603,2.57582,1
1,TRAIN_00001,37,M,NGA,ASN,Positive,Unexposed,Sufficient,Smoker,Obese,No,2.95043,0.911624,7.303305,2.505317,1
2,TRAIN_00002,71,M,CHN,MDE,Positive,Unexposed,Sufficient,Non-Smoker,Not Obese,Yes,2.200023,0.717754,11.137459,2.38108,0
3,TRAIN_00003,40,F,IND,HSP,Negative,Unexposed,Sufficient,Non-Smoker,Obese,No,3.370796,6.84638,10.175254,0.753023,0
4,TRAIN_00004,53,F,CHN,CAU,Negative,Unexposed,Sufficient,Non-Smoker,Not Obese,No,4.230048,0.439519,7.19445,0.569356,1


In [8]:
train['Cancer'].value_counts(normalize=True)
# 0 : 양성, 1: 악성

Cancer
0    0.880001
1    0.119999
Name: proportion, dtype: float64

In [9]:
print('갑상선암 진단 학습 데이터셋 크기 : ', train.shape)
train.info()

갑상선암 진단 학습 데이터셋 크기 :  (87159, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87159 entries, 0 to 87158
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ID                 87159 non-null  object 
 1   Age                87159 non-null  int64  
 2   Gender             87159 non-null  object 
 3   Country            87159 non-null  object 
 4   Race               87159 non-null  object 
 5   Family_Background  87159 non-null  object 
 6   Radiation_History  87159 non-null  object 
 7   Iodine_Deficiency  87159 non-null  object 
 8   Smoke              87159 non-null  object 
 9   Weight_Risk        87159 non-null  object 
 10  Diabetes           87159 non-null  object 
 11  Nodule_Size        87159 non-null  float64
 12  TSH_Result         87159 non-null  float64
 13  T4_Result          87159 non-null  float64
 14  T3_Result          87159 non-null  float64
 15  Cancer             87159 non-null  i

### 데이터 라벨링

In [11]:
all_data = pd.concat([train.drop(columns=['Cancer']), test], axis=0)
label_cols = ['Gender', 'Family_Background', 'Radiation_History',
              'Iodine_Deficiency', 'Smoke', 'Diabetes', 'Weight_Risk']

# 1. Label Encoding 적용할 컬럼
le = LabelEncoder()
for col in label_cols:
    all_data[col] = le.fit_transform(all_data[col].astype(str))

# 2. One-Hot Encoding 적용할 컬럼
one_hot_cols = ['Country', 'Race']

all_data = pd.get_dummies(all_data, columns=one_hot_cols)

# 다시 train/test로 분할
train_encoded = all_data.iloc[:len(train), :]
train_encoded['Cancer'] = train['Cancer'].values
test_encoded = all_data.iloc[len(train):, :]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_encoded['Cancer'] = train['Cancer'].values


In [69]:
train_encoded.iloc[['07654']]

Unnamed: 0,ID,Age,Gender,Family_Background,Radiation_History,Iodine_Deficiency,Smoke,Weight_Risk,Diabetes,Nodule_Size,...,Country_KOR,Country_NGA,Country_RUS,Country_USA,Race_AFR,Race_ASN,Race_CAU,Race_HSP,Race_MDE,Cancer
7654,TRAIN_07654,72,1,0,1,1,0,0,1,0.020164,...,False,False,False,False,True,False,False,False,False,0


### 로지스틱 회귀 모델링

In [71]:
# X, Y 결정하기
Y = train_encoded['Cancer']
X = train_encoded.copy()
X = X.drop(['ID', 'Cancer'], axis  = 1)

In [73]:
# 훈련  데이터, 평가 데이터 분할
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [75]:
# 로지스틱 회귀 분석 : 모델 훈련
md = LogisticRegression()

md.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [85]:
# 로지스틱 회귀 분석 : 예측 결과로 Y_predict 구하기
pred = md.predict(X_test)

### 성능 평가하기

In [88]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [90]:
# 오차 행렬
confusion_matrix(Y_test, pred)

array([[22987,    46],
       [ 3065,    50]])

In [100]:
accuracy = accuracy_score(Y_test, pred)
precision = precision_score(Y_test, pred)
recall = recall_score(Y_test, pred)
f1 = f1_score(Y_test, pred)
roc_auc = roc_auc_score(Y_test, pred)

In [104]:
print(f'정확도 : {accuracy}, 정밀도 : {precision}, 재현율 : {recall}, f1스코어 : {f1}, roc_auc : {roc_auc}')

정확도 : 0.8810234052317577, 정밀도 : 0.5208333333333334, 재현율 : 0.016051364365971106, f1스코어 : 0.031142946122703206, roc_auc : 0.5070271149099426


```

-----------------------------------------------------------------------------------------------


```

 # Test_02 클래스 불균형 처리 추가
### Oversampling + 클래스 가중치 부여

In [195]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X_train, Y_train)

md = LogisticRegression(class_weight='balanced')

md.fit(X_res, Y_res)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [191]:
pred = md.predict(X_test)

In [193]:
f1 = f1_score(Y_test, pred)
print(f'f1 score : {f1}')

f1 score : 0.2205017033137194
