# 4주차 미션 - 새로운 데이터셋에 분류 모델 적용하기

----

## 📌 주제1 - Mobile Price Classification(Kaggle Data)
- 데이터 링크: https://www.kaggle.com/datasets/iabhishekofficial/mobile-price-classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### 미션 1) 데이터 로드와 피처엔지니어링 🔍📊
- 데이터셋을 탐색하여 기본적인 통계, 분포, 결측치 등을 확인합니다.
- 필요에 따라 결측치 처리, 이상치 제거, 데이터 형식 변환 등의 전처리 작업을 수행합니다.
- 피처 엔지니어링을 통해 데이터의 특성에 맞게 피처를 선택, 생성, 변환합니다. 예를 들어, 범주형 데이터의 경우 One-hot encoding을 적용할 수 있습니다.

|컬럼명|설명|컬럼명|설명|
|--|--|--|--|
|'battery_power'|Total energy a battery can store in one time measured in mAh|'px_height'|Pixel Resolution Height|
|'blue'|Has bluetooth or not|'px_width'|Pixel Resolution Width|
|'clock_speed'|speed at which microprocessor executes instructions|'ram'|Random Access Memory in Mega Bytes|
|'dual_sim'|Has dual sim support or not|'sc_h'|Screen Height of mobile in cm|
|'fc'|Front Camera mega pixels|'sc_w'|Screen Width of mobile in cm|
|'four_g'|Has 4G or not|'talk_time'|Longest time that a single battery charge will last when you are talking|
|'int_memory'|Internal Memory in Gigabytes|'three_g'|Has 3G or not|
|'m_dep'|Mobile Depth in cm|'touch_screen'|Has touch screen or not|
|'mobile_wt'|Weight of mobile phone|'wifi'|Has wifi or not|
|'n_cores'|Number of cores of processor|'price_range'|This is the target variable with value of 0(low cost), 1(medium cost), 2(high cost) and 3(very high cost).|
|'pc'|Primary Camera mega pixels| | |

#### 1. 데이터 불러오기

In [None]:
# Load the training CSV (path is relative to the notebook's working directory).
train_df = pd.read_csv("../data/archive/train.csv")
# Show (n_rows, n_columns) as the cell output.
train_df.shape

#### 2. 데이터 확인하기

In [None]:
# Inspect the column names
train_df.columns

In [None]:
# Check the data types and look for missing values
train_df.info()

In [None]:
# Summary statistics for each column
train_df.describe()

In [None]:
# Histogram per column
train_df.hist(figsize=(20,15))
plt.show()

In [None]:
# Count plot per column: every column except the last one is drawn
# into its own panel of a 10 x 2 grid.
plot_columns = train_df.columns[:-1]

plt.figure(figsize=(18,30))
for position, column in enumerate(plot_columns, start=1):
    ax = plt.subplot(10, 2, position)
    sns.countplot(x=column, data=train_df, palette='viridis', ax=ax)
    ax.set_title(column)
    ax.set_xlabel('')
    ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Correlation between columns, drawn as an annotated heatmap
plt.figure(figsize= (15, 8), facecolor='lightblue')
sns.heatmap(train_df.corr(), annot=True , fmt=".2f")

### 미션 2) 모델 선택 🧠
- 분류 문제에 적합한 머신러닝 알고리즘을 선택합니다. 예를 들어, 로지스틱 회귀, 결정 트리, 랜덤 포레스트, 서포트 벡터 머신(SVM), 그라디언트 부스팅 등이 있습니다. 베이스라인 모델 생성하여 성능의 기준점을 설정합니다.

### 미션 3) 모델 훈련 🤖
- 데이터셋을 훈련 세트와 테스트 세트로 분할합니다. 일반적으로 데이터의 70-80%를 훈련용으로, 나머지를 테스트용으로 사용합니다. 모델에 학습 데이터셋과 정답을 학습시키고 예측 데이터셋으로 예측을 진행합니다.

### 미션 4) 하이퍼파라미터 튜닝 ⚙️
- 그리드 서치, 랜덤 서치 등을 활용하여 모델의 하이퍼파라미터를 조정하여 모델의 성능을 최적화합니다. 이 때, 모델의 일반화 성능을 평가하기 위해 교차 검증을 수행합니다.
(참고) xgboost, lightGBM, catBoost는 하이퍼파라미터 튜닝 기능을 제공합니다.

### 미션 5) 모델 평가 및 선택 🎯
- 학습된 모델을 테스트 데이터 또는 새로운 데이터에 적용하여 성능을 평가합니다. 이때 사용할 평가 지표는 태스크에 따라 다를 수 있습니다. 정확도(Accuracy), 정밀도(Precision), 재현율(Recall), F1-score 등을 고려합니다.

In [None]:
# train, test 데이터셋 분리
from sklearn.model_selection import train_test_split

# 모델 선정
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# 스케일링
from sklearn.preprocessing import StandardScaler

# 하이퍼 파라미터 튜닝
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# 모델 평가
from sklearn.metrics import accuracy_score

#### 1.결정트리

##### 1.1 스케일링 x

In [None]:
# Decision tree on the raw (unscaled) features, tuned with both
# GridSearchCV and RandomizedSearchCV and scored on a held-out split.

# Split into features and target
X = train_df.drop("price_range", axis=1)
y = train_df["price_range"].values.ravel()

# Hold out 20% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator
clf = DecisionTreeClassifier(random_state=42)

# Grid-search space. The last max_features entry is the float 1.0
# ("use all features"); the int 1 would mean "exactly one feature per split".
param_grid = {"max_depth": range(3, 12),
              "max_features": [0.3, 0.5, 0.7, 0.9, 1.0]}

# Candidate pools for the randomized search, drawn from a seeded generator
# so that the pools (and therefore the search result) are reproducible.
rng = np.random.RandomState(42)
max_depth = rng.randint(3, 20, 10)
max_features = rng.uniform(0.7, 1.0, 100)
param_distributions = {"max_depth": max_depth,
                       "max_features": max_features,
                       "min_samples_split": list(range(2, 7))}

model = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0)

model2 = RandomizedSearchCV(clf,
                            param_distributions,
                            n_iter=1000,
                            scoring="accuracy",
                            n_jobs=-1,
                            cv=5,
                            random_state=42)

# Fit - model: GridSearchCV, model2: RandomizedSearchCV
model.fit(X_train, y_train)
model2.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model.predict(X_test)
y_pred2 = model2.predict(X_test)

# Evaluate
score = accuracy_score(y_test, y_pred)
score2 = accuracy_score(y_test, y_pred2)

print("< DecisionTreeClassifier> ")
print("---" * 50)
print(f"GridSearchCV 최적의 파라미터 : {model.best_params_}")
print(f"GridSearchCV 최고 정확도 : {round(model.best_score_, 3)}")
print(f"GridSearchCV accuracy_score : {round(score, 3)}")
print("---" * 50)
print(f"RandomSearchCV 최적의 파라미터 : {model2.best_params_}")
print(f"RandomSearchCV 최고 정확도 : {round(model2.best_score_, 3)}")
print(f"RandomSearchCV accuracy_score : {round(score2, 3)}")

##### 1.2 스케일링 o

In [None]:
# Decision tree on standardised features, tuned with both GridSearchCV and
# RandomizedSearchCV and scored on a held-out split.

# Split into features and target
X = train_df.drop("price_range", axis=1)
y = train_df["price_range"].values.ravel()

# Split BEFORE scaling so that the test rows never influence the scaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling: fit on the training rows only, then apply the same transform
# to the test rows (fitting on all of X leaks test statistics).
Scaler = StandardScaler()
X_train = Scaler.fit_transform(X_train)
X_test = Scaler.transform(X_test)
# Full feature matrix scaled with the training statistics; kept so the
# name X_norm stays available to any code that referenced it.
X_norm = Scaler.transform(X)

# Base estimator
clf_scal = DecisionTreeClassifier(random_state=42)

# Grid-search space. The last max_features entry is the float 1.0
# ("use all features"); the int 1 would mean "exactly one feature per split".
param_grid = {"max_depth": range(3, 12),
              "max_features": [0.3, 0.5, 0.7, 0.9, 1.0]}

# Candidate pools for the randomized search, drawn from a seeded generator
# so that the pools (and therefore the search result) are reproducible.
rng = np.random.RandomState(42)
max_depth = rng.randint(3, 20, 10)
max_features = rng.uniform(0.7, 1.0, 100)
param_distributions = {"max_depth": max_depth,
                       "max_features": max_features,
                       "min_samples_split": list(range(2, 7))}

model_clf = GridSearchCV(clf_scal, param_grid=param_grid, n_jobs=-1, cv=5, verbose=0)

model_clf2 = RandomizedSearchCV(clf_scal,
                                param_distributions,
                                n_iter=1000,
                                scoring="accuracy",
                                n_jobs=-1,
                                cv=5,
                                random_state=42)

# Fit - model_clf: GridSearchCV, model_clf2: RandomizedSearchCV
model_clf.fit(X_train, y_train)
model_clf2.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = model_clf.predict(X_test)
y_pred2 = model_clf2.predict(X_test)

# Evaluate
score_clf = accuracy_score(y_test, y_pred)
score_clf2 = accuracy_score(y_test, y_pred2)

print("< DecisionTreeClassifier > ")
print("---" * 50)
print(f"GridSearchCV 최적의 파라미터 : {model_clf.best_params_}")
print(f"GridSearchCV 최고 정확도 : {round(model_clf.best_score_, 3)}")
print(f"GridSearchCV accuracy_score : {round(score_clf, 3)}")
print("---" * 50)
print(f"RandomSearchCV 최적의 파라미터 : {model_clf2.best_params_}")
print(f"RandomSearchCV 최고 정확도 : {round(model_clf2.best_score_, 3)}")
print(f"RandomSearchCV accuracy_score : {round(score_clf2, 3)}")

#### 2.서포트 벡터 머신(SVM)

In [None]:
# Support vector machine tuned with GridSearchCV and RandomizedSearchCV.
# X_train / X_test / y_train / y_test are whatever the most recently
# executed split cell left in scope.

# Base estimator
clf_svm = SVC(random_state=42)

# Hyper-parameter space (4 * 4 * 3 = 48 combinations)
param_svm = {'C': [0.1, 1, 10, 100],
             'gamma': [0.01, 0.1, 1, 10],
             'kernel': ['linear', 'rbf', 'poly']}

model_svm = GridSearchCV(clf_svm, param_grid=param_svm, n_jobs=-1, cv=5, verbose=0)
# n_iter is kept below the 48 available combinations; asking for more than
# the grid holds only repeats the exhaustive search.
model_svm2 = RandomizedSearchCV(clf_svm,
                                param_distributions=param_svm,
                                n_iter=20,
                                scoring="accuracy",
                                n_jobs=-1,
                                cv=5,
                                random_state=42)

# Fit
model_svm.fit(X_train, y_train)
model_svm2.fit(X_train, y_train)

# Predict
y_pred_svm = model_svm.predict(X_test)
y_pred_svm2 = model_svm2.predict(X_test)

# Evaluate - each score uses the predictions of its own search object
score_svm = accuracy_score(y_test, y_pred_svm)
score_svm2 = accuracy_score(y_test, y_pred_svm2)

print("< Support Vector Machine (SVM) >")
print("---" * 50)
print(f"GridSearchCV 최적의 파라미터 : {model_svm.best_params_}")
print(f"GridSearchCV 최고 정확도 : {round(model_svm.best_score_, 3)}")
print(f"GridSearchCV accuracy_score : {round(score_svm, 3)}")
print("---" * 50)
print(f"RandomizedSearchCV 최적의 파라미터 : {model_svm2.best_params_}")
print(f"RandomizedSearchCV 최고 정확도 : {round(model_svm2.best_score_, 3)}")
print(f"RandomizedSearchCV accuracy_score : {round(score_svm2, 3)}")

#### 3.랜덤 포레스트

In [None]:
# Random forest tuned with GridSearchCV and RandomizedSearchCV.
# X_train / X_test / y_train / y_test are whatever the most recently
# executed split cell left in scope.

# Base estimator
clf_rf = RandomForestClassifier(random_state=42)

# Hyper-parameter space (3 * 3 * 5 = 45 combinations). The last max_features
# entry is the float 1.0 ("use all features"); the int 1 would mean
# "exactly one feature per split".
param_rf = {'n_estimators': [50, 100, 200],
            'max_depth': [3, 6, 9],
            'max_features': [0.3, 0.5, 0.7, 0.9, 1.0]}

model_rf = GridSearchCV(clf_rf, param_grid=param_rf, n_jobs=-1, cv=5, verbose=0)
# n_iter is kept below the 45 available combinations; asking for more than
# the grid holds only repeats the exhaustive search.
model_rf2 = RandomizedSearchCV(clf_rf,
                               param_distributions=param_rf,
                               n_iter=20,
                               scoring="accuracy",
                               n_jobs=-1,
                               cv=5,
                               random_state=42)

# Fit
model_rf.fit(X_train, y_train)
model_rf2.fit(X_train, y_train)

# Predict
y_pred_rf = model_rf.predict(X_test)
y_pred_rf2 = model_rf2.predict(X_test)

# Evaluate
score_rf = accuracy_score(y_test, y_pred_rf)
score_rf2 = accuracy_score(y_test, y_pred_rf2)


print("< Random Forest >")
print("---" * 50)
print(f"GridSearchCV 최적의 파라미터 : {model_rf.best_params_}")
print(f"GridSearchCV 최고 정확도 : {round(model_rf.best_score_, 3)}")
print(f"GridSearchCV accuracy_score : {round(score_rf, 3)}")
print("---" * 50)
print(f"RandomizedSearchCV 최적의 파라미터 : {model_rf2.best_params_}")
print(f"RandomizedSearchCV 최고 정확도 : {round(model_rf2.best_score_, 3)}")
print(f"RandomizedSearchCV accuracy_score : {round(score_rf2, 3)}")

#### 4.로지스틱 회귀

In [None]:
# Logistic regression tuned with GridSearchCV and RandomizedSearchCV.
# X_train / X_test / y_train / y_test are whatever the most recently
# executed split cell left in scope.

# Base estimator. The search space contains both 'l1' and 'l2' penalties, so
# a solver that supports both is required ('saga'); the default solver
# rejects 'l1', which makes every l1 candidate fail to fit.
# max_iter is raised to give saga room to converge.
clf_lr = LogisticRegression(solver="saga", max_iter=5000, random_state=42)

# Hyper-parameter space (4 * 2 = 8 combinations)
param_lr = {'C': [0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']}

model_lr = GridSearchCV(clf_lr, param_grid=param_lr, n_jobs=-1, cv=5, verbose=0)
# n_iter is kept below the 8 available combinations; asking for more than
# the grid holds only repeats the exhaustive search.
model_lr2 = RandomizedSearchCV(clf_lr,
                               param_distributions=param_lr,
                               n_iter=5,
                               scoring="accuracy",
                               n_jobs=-1,
                               cv=5,
                               random_state=42)

# Fit
model_lr.fit(X_train, y_train)
model_lr2.fit(X_train, y_train)

# Predict
y_pred_lr = model_lr.predict(X_test)
y_pred_lr2 = model_lr2.predict(X_test)

# Evaluate
score_lr = accuracy_score(y_test, y_pred_lr)
score_lr2 = accuracy_score(y_test, y_pred_lr2)

print("< Logistic Regression >")
print("---" * 50)
print(f"GridSearchCV 최적의 파라미터 : {model_lr.best_params_}")
print(f"GridSearchCV 최고 정확도 : {round(model_lr.best_score_, 3)}")
print(f"GridSearchCV accuracy_score : {round(score_lr, 3)}")
print("---" * 50)
print(f"RandomizedSearchCV 최적의 파라미터 : {model_lr2.best_params_}")
print(f"RandomizedSearchCV 최고 정확도 : {round(model_lr2.best_score_, 3)}")
print(f"RandomizedSearchCV accuracy_score : {round(score_lr2, 3)}")

#### 5.결론

In [None]:
# Summary table: one row per model, one column per reported metric.
idx = ["DecisionTreeClassifier Scaler X",
       "DecisionTreeClassifier Scaler O",
       "Support Vector Machine",
       "Random Forest",
       "Logistic Regression"]

# Every list below follows the row order of `idx`. The parameter columns
# store the best_params_ dicts, not the search objects themselves.
point = {
    "GridSearchCV 최적의 파라미터" : [model.best_params_, model_clf.best_params_, model_svm.best_params_, model_rf.best_params_, model_lr.best_params_],
    "GridSearchCV 최고 정확도" : [round(model.best_score_, 3), round(model_clf.best_score_, 3), round(model_svm.best_score_, 3), round(model_rf.best_score_, 3), round(model_lr.best_score_, 3)],
    "GridSearchCV accuracy_score" : [round(score, 3), round(score_clf, 3), round(score_svm, 3), round(score_rf, 3), round(score_lr, 3)],
    "RandomizedSearchCV 최적의 파라미터" : [model2.best_params_, model_clf2.best_params_, model_svm2.best_params_, model_rf2.best_params_, model_lr2.best_params_],
    "RandomizedSearchCV 최고 정확도" : [round(model2.best_score_, 3), round(model_clf2.best_score_, 3), round(model_svm2.best_score_, 3), round(model_rf2.best_score_, 3), round(model_lr2.best_score_, 3)],
    "RandomizedSearchCV accuracy_score" : [round(score2, 3), round(score_clf2, 3), round(score_svm2, 3), round(score_rf2, 3), round(score_lr2, 3)],
}

df = pd.DataFrame(data=point, index=idx)
df

------

----

## 📌 주제 2 - 

### 미션 1) 데이터 로드와 피처엔지니어링

In [None]:
# Load the credit-score CSV from an absolute path; this rebinds `df`.
df =pd.read_csv('/content/drive/MyDrive/archive/credit_score.csv')

In [None]:
# Inspect the column names
df.columns

In [None]:
# Dtypes and non-null counts per column
df.info()

In [None]:
# Distinct values present in CAT_GAMBLING (before encoding)
set(df["CAT_GAMBLING"])

In [None]:
# Label-encode the three-valued categorical column
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# Overwrite the column in place with its integer codes
df["CAT_GAMBLING"] = label_encoder.fit_transform(df["CAT_GAMBLING"])
# Distinct values after encoding
set(df["CAT_GAMBLING"])

In [None]:
# Every column except the last one
feature_list=df.columns[0:-1].tolist()
# Same list without its first entry
cols=feature_list[1:]

In [None]:
# Full column index (the slice [0::] selects everything)
df.columns[0::]

In [None]:
# Same columns as a plain Python list
list(df.columns[0::])

In [None]:
# Class balance of the DEFAULT column
df['DEFAULT'].value_counts()

In [None]:
# Frequency of each CREDIT_SCORE value
df['CREDIT_SCORE'].value_counts()

In [None]:
# Mean of DEFAULT within each CREDIT_SCORE value
df.groupby(['CREDIT_SCORE'])['DEFAULT'].mean()

In [None]:
# Count of rows per CREDIT_SCORE value, split by DEFAULT
sns.countplot(data=df,x='CREDIT_SCORE',hue='DEFAULT')

In [None]:
# DEFAULT == 0 means the debt obligation was fulfilled

In [None]:
# Scatter plot for every pair of variables, coloured by DEFAULT.
# NOTE(review): one panel is drawn per variable pair, so this is likely very
# slow on a wide table - consider passing a small `vars=` subset.
g=sns.PairGrid(df,hue='DEFAULT')
g.map(plt.scatter)

In [None]:
# Model features: drop the first column and the last two columns
feature_list=df.columns[1:-2].tolist()
feature_list

In [None]:
# Positional 80/20 split without shuffling: the leading 80% of the rows
# become `train`, the remaining rows become `test`.
split_count = int(len(df) * 0.8)
train = df.iloc[:split_count].copy()
test = df.iloc[split_count:].copy()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Baseline: fit an untuned decision tree on the `train` slice and chart
# how much each feature contributes to its splits.
x_train, y_train = train[feature_list], train['DEFAULT']
print(y_train.shape)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

plt.figure(figsize=(20, 20))
sns.barplot(x=model.feature_importances_, y=feature_list)


In [None]:
# Distribution of R_GROCERIES_INCOME.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases;
# confirm against the installed version and consider histplot/displot.
sns.distplot(df['R_GROCERIES_INCOME'])

In [None]:
# Log-transform the column. log1p(x) equals log(1 + x) but stays accurate
# when x is close to zero.
df['R_GROCERIES_INCOME_log']=np.log1p(df['R_GROCERIES_INCOME'])
# Distribution after the transform
sns.distplot(df['R_GROCERIES_INCOME_log'])

In [None]:
# Compute the correlation coefficients of the feature columns and
# visualise them as a heatmap.
df_subset = df.loc[:, feature_list]
correlation_matrix = df_subset.corr()

plt.figure(figsize=(80, 80))
sns.heatmap(
    data=correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.show()

In [None]:
# Correlation of every feature with R_GROCERIES_INCOME, strongest positive first
correlation = correlation_matrix['R_GROCERIES_INCOME'].sort_values(ascending=False)

In [None]:
from itertools import combinations

# Rank every pair of numeric columns by the strength of their correlation
# and print the strongly correlated pairs.
numeric_columns = df.select_dtypes(include='number').columns

# One vectorised call instead of one Series.corr call per pair
corr_matrix = df[numeric_columns].corr()

# All combinations of numeric variables
variable_combinations = list(combinations(numeric_columns, 2))

# Collect the correlations; NaN results are skipped because they cannot be
# ordered meaningfully by the sort below.
correlation_list = []
for var1, var2 in variable_combinations:
    pair_corr = corr_matrix.loc[var1, var2]
    if pd.notna(pair_corr):
        correlation_list.append(((var1, var2), pair_corr))

# Strongest relationships first, regardless of sign
correlation_list.sort(key=lambda x: abs(x[1]), reverse=True)

# Print the results. The threshold is applied to the absolute value so that
# strong negative correlations are reported too, matching the sort key.
for (var1, var2), pair_corr in correlation_list:
    if abs(pair_corr) > 0.7:
        print(f"({var1}, {var2}): {pair_corr}")

In [None]:
# Feature matrix and target (DEFAULT) for the credit dataset
X=df[feature_list]
y=df['DEFAULT']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 미션 2) 모델 선택

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

# Candidate estimators compared in the search loops; all share the same seed
models=[DecisionTreeClassifier(random_state=42),
       RandomForestClassifier(random_state=42),
       GradientBoostingClassifier(random_state=42)]

### 미션 3) 모델 훈련

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search for every candidate model.
# The candidate pools are defined here, from a seeded generator, so this
# cell no longer depends on variables left over from an unrelated earlier
# cell and gives the same result on every run.
rng = np.random.RandomState(42)
max_depth = rng.randint(2, 20, 10)
max_features = rng.uniform(0.3, 1.0, 10)

result = []
for model in models:
    # Fresh dict per model so that keys added for one estimator do not
    # carry over to the next one.
    param_distributions = {'max_depth': max_depth, 'max_features': max_features}
    if model.__class__.__name__ != 'DecisionTreeClassifier':
        # Only the ensemble models take n_estimators
        param_distributions["n_estimators"] = rng.randint(100, 1000, 10)

    clf = RandomizedSearchCV(model,
                             param_distributions,
                             n_iter=10,
                             scoring='accuracy',
                             n_jobs=-1,
                             cv=5,
                             verbose=2,
                             random_state=42)
    clf.fit(X_train, y_train)
    # Row layout: name, best params, best CV score, test accuracy, full CV log
    result.append([model.__class__.__name__, clf.best_params_,
                   clf.best_score_, clf.score(X_test, y_test), clf.cv_results_])

In [None]:
# Search results as a table (positional column labels)
pd.DataFrame(result)

In [None]:
# Scores before preprocessing; max_features=np.random.uniform(0.3,1.0,10)
# NOTE(review): the "train_score" column is filled from clf.best_score_, which
# per the scikit-learn docs is the best mean cross-validation score rather
# than a training-set score - consider renaming.
pd.DataFrame(result, columns=["model", "best_params", "train_score", "test_score", "cv_result"])

### 미션 4) 하이퍼파라미터 튜닝

In [None]:
# New max_features candidate pool: 10 draws from uniform(0.5, 1.0) (unseeded)
max_features=np.random.uniform(0.5,1.0,10)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search with a larger budget (n_iter=50).
# max_features is the pool defined in the cell above. max_depth is defined
# here, from a seeded generator, so this cell does not depend on a variable
# left over from an unrelated earlier cell.
rng = np.random.RandomState(42)
max_depth = rng.randint(2, 20, 10)

result = []
for model in models:
    # Fresh dict per model so that keys added for one estimator do not
    # carry over to the next one.
    param_distributions = {'max_depth': max_depth, 'max_features': max_features}
    if model.__class__.__name__ != 'DecisionTreeClassifier':
        # Only the ensemble models take n_estimators
        param_distributions["n_estimators"] = rng.randint(100, 1000, 10)

    clf = RandomizedSearchCV(model,
                             param_distributions,
                             n_iter=50,
                             scoring='accuracy',
                             n_jobs=-1,
                             cv=5,
                             verbose=2,
                             random_state=42)
    clf.fit(X_train, y_train)
    # Row layout: name, best params, best CV score, test accuracy, full CV log
    result.append([model.__class__.__name__, clf.best_params_,
                   clf.best_score_, clf.score(X_test, y_test), clf.cv_results_])


In [None]:
# Search results as a labelled table.
# NOTE(review): "train_score" is filled from clf.best_score_ (the search's
# best cross-validation score per the scikit-learn docs), not a training score.
pd.DataFrame(result, columns=["model", "best_params", "train_score", "test_score", "cv_result"])

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

# Rebuild the positional 80/20 split and refit the untuned baseline tree,
# then chart the importance the tree assigns to each feature.
model = DecisionTreeClassifier()

split_count = int(len(df) * 0.8)
train = df.iloc[:split_count].copy()
test = df.iloc[split_count:].copy()

x_train, y_train = train[feature_list], train['DEFAULT']
print(y_train.shape)
model.fit(x_train, y_train)

plt.figure(figsize=(20, 20))
sns.barplot(x=model.feature_importances_, y=feature_list)


In [None]:
# Count of rows per R_DEBT_INCOME value, split by DEFAULT.
# NOTE(review): countplot draws one bar group per distinct value; if
# R_DEBT_INCOME is a continuous ratio a histogram may be more readable - confirm.
plt.figure(figsize=(20,20))
sns.countplot(data=df,x='R_DEBT_INCOME',hue='DEFAULT')

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted tree currently bound to `model`, with nodes labelled by feature name
plt.figure(figsize=(20,20))
tree=plot_tree(model,feature_names=feature_list,filled=True,fontsize=10)

In [None]:

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

# Principal Component Analysis (PCA)
# Fit on the training features and project them onto 84 components.
# NOTE(review): 84 is hard-coded; if it equals the number of input features
# this performs no dimensionality reduction - confirm the intended value.
pca = PCA(n_components=84)
X_pca = pca.fit_transform(X_train)

# NOTE(review): X_pca holds transformed component scores, not a selection of
# the original features, despite the label printed below.
print("\nSelected features using PCA:")
print(X_pca)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search on the PCA-projected features.
# Candidate pools come from a seeded generator for reproducibility.
rng = np.random.RandomState(42)
max_depth = rng.randint(2, 20, 10)
max_features = rng.uniform(0.3, 1.0, 10)

# The models are fitted in PCA space, so the test rows must be projected
# with the same fitted `pca` before scoring; scoring on the raw X_test
# would feed the models features from a different space.
X_test_pca = pca.transform(X_test)

result = []
for model in models:
    # Fresh dict per model so that keys added for one estimator do not
    # carry over to the next one.
    param_distributions = {'max_depth': max_depth, 'max_features': max_features}
    if model.__class__.__name__ != 'DecisionTreeClassifier':
        # Only the ensemble models take n_estimators
        param_distributions["n_estimators"] = rng.randint(100, 1000, 10)

    clf = RandomizedSearchCV(model,
                             param_distributions,
                             n_iter=50,
                             scoring='accuracy',
                             n_jobs=-1,
                             cv=5,
                             verbose=2,
                             random_state=42)
    clf.fit(X_pca, y_train)
    # Row layout: name, best params, best CV score, test accuracy, full CV log
    result.append([model.__class__.__name__, clf.best_params_,
                   clf.best_score_, clf.score(X_test_pca, y_test), clf.cv_results_])

pd.DataFrame(result, columns=["model", "best_params", "train_score", "test_score", "cv_result"])

### 미션 5) 모델 평가 및 선택

GradientBoostingClassifier	accuracy 0.73
피처 엔지니어링에 좀 더 시간을 할애하지 못한 점이 아쉽습니다. 시간도 시간이지만, 변수를 어떤 방식으로 가공할지에 대한 고민을 충분히 하지 못했습니다.

데이터 처리에 따른 모델 성능을 비교하는 데 필요한 기준 지표를 확보하기 위해,
먼저 raw 데이터셋을 decision tree 모델이 학습할 수 있을 정도로만 최소한으로 전처리한 상태에서 학습을 진행하였습니다. 다만 데이터셋의 변수 종류가 많고, 하이퍼파라미터 세팅만 1000 iter로 늘려 Gridsearch를 코랩의 한정된 자원으로만 학습하다 보니 많은 시간이 소요되었습니다.

----

---- 

## 📌 주제 3 - 

### 미션 1) 데이터셋 탐색 및 통계, 분포, 결측치 등 확인

In [None]:
# Load the topic-3 training CSV from an absolute path; this rebinds `df`.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv")
# Show (n_rows, n_columns) as the cell output.
df.shape

In [None]:
# Preview the first rows
df.head()

In [None]:
# Dtypes and non-null counts per column
df.info()

In [None]:
# Horizontal bar chart of the number of missing values per column
df.isnull().sum().plot.barh(figsize=(10, 9))
# Name, Monthly_Inhand_Salary, Type_of_Loan, Num_of_Delayed_Payment, Num_Credit_Inquiries, Credit_History_Age, Amount_invested_monthly and Monthly_Balance contain missing values

In [None]:
# Look at the columns that contain missing values
df[["Name","Monthly_Inhand_Salary","Type_of_Loan","Num_of_Delayed_Payment","Num_Credit_Inquiries","Credit_History_Age","Amount_invested_monthly","Monthly_Balance"]]

In [None]:
# Convert Num_of_Delayed_Payment to a numeric dtype;
# errors="coerce" turns values that cannot be parsed into NaN.
df["Num_of_Delayed_Payment"] = pd.to_numeric(df["Num_of_Delayed_Payment"], errors="coerce")

In [None]:
# Missing-value handling.
# Compute the mean and median of each column per Credit_Score class, to be
# used as replacement values for the missing entries.
# The columns are selected with a list (double brackets): selecting several
# columns with a bare tuple of names is rejected by current pandas.
df.groupby(["Credit_Score"])[["Monthly_Inhand_Salary","Num_of_Delayed_Payment","Num_Credit_Inquiries"]].agg(["mean", "median"])

In [None]:
# Fill missing Monthly_Inhand_Salary values with the mean of the row's
# Credit_Score class. The means are computed from the data instead of being
# copied by hand from an earlier cell's output, so they stay correct if the
# data or the set of classes changes.
# NOTE(review): imputing from the target class leaks label information into
# the features - confirm this is acceptable for the evaluation.
salary_class_mean = df.groupby("Credit_Score")["Monthly_Inhand_Salary"].transform("mean")
df["Monthly_Inhand_Salary"] = df["Monthly_Inhand_Salary"].fillna(salary_class_mean)

In [None]:
# Fill missing Num_of_Delayed_Payment values with the mean of the row's
# Credit_Score class. The means are computed from the data instead of being
# copied by hand from an earlier cell's output.
# NOTE(review): imputing from the target class leaks label information into
# the features - confirm this is acceptable for the evaluation.
delayed_class_mean = df.groupby("Credit_Score")["Num_of_Delayed_Payment"].transform("mean")
df["Num_of_Delayed_Payment"] = df["Num_of_Delayed_Payment"].fillna(delayed_class_mean)

In [None]:
# Fill missing Num_Credit_Inquiries values with the mean of the row's
# Credit_Score class. The means are computed from the data instead of being
# copied by hand from an earlier cell's output.
# NOTE(review): imputing from the target class leaks label information into
# the features - confirm this is acceptable for the evaluation.
inquiries_class_mean = df.groupby("Credit_Score")["Num_Credit_Inquiries"].transform("mean")
df["Num_Credit_Inquiries"] = df["Num_Credit_Inquiries"].fillna(inquiries_class_mean)

In [None]:
# Summary statistics for the numeric columns
df.describe()

In [None]:
# Summary statistics for the object-typed columns
df.describe(include="object")

In [None]:
# Every column except the last one
feature_columns = df.columns[0:-1].tolist()
feature_columns

In [None]:
# Class balance of the target column
df["Credit_Score"].value_counts()

In [None]:
# Histogram per numeric column
h = df.hist(figsize=(12, 12))

In [None]:
# Inspect the correlation coefficients.
# df_corr was never defined, so it is computed here; only numeric columns
# are used because correlation is undefined for the object-typed ones.
df_corr = df.select_dtypes(include="number").corr()
df_corr.style.background_gradient()

In [None]:
# Correlation heatmap of the numeric columns.
# The matrix is computed in this cell so the cell does not fail with a
# NameError when df_corr has not been created elsewhere.
df_corr = df.select_dtypes(include="number").corr()
plt.figure(figsize=(15, 8))
sns.heatmap(df_corr, annot=True, vmax=1, vmin=-1, cmap="coolwarm")

### 미션 2) 모델 선택

In [None]:
# Inspect the column names before choosing the features
df.columns

In [None]:
# Feature matrix: 15 hand-picked columns
X = df[['Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Amount_invested_monthly', 'Monthly_Balance']]
X.shape

In [None]:
# Target column
y = df['Credit_Score']
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Shapes of the training features and target
X_train.shape, y_train.shape

In [None]:
# Shapes of the test features and target
X_test.shape, y_test.shape

In [None]:
# Convert object-typed feature columns to numbers; values that cannot be
# parsed become NaN (errors='coerce').
# Work on copies so the assignments do not write into a slice of X, and
# apply the same conversion to X_test so both splits keep matching dtypes.
# NOTE(review): a genuinely categorical column (Credit_Mix looks like one)
# would be turned entirely into NaN here - confirm it should be encoded instead.
X_train = X_train.copy()
X_test = X_test.copy()
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
        X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to depth 11, with a fixed seed; not fitted in this cell
model = DecisionTreeClassifier(max_depth=11, random_state=42)
model

### 미션 3) 모델 훈련

### 미션 4) 하이퍼파라미터 튜닝

### 미션 5) 모델 평가 및 선택

-----

----- 