In [46]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import missingno as msno    # 누락값 표시
import warnings
warnings.filterwarnings('ignore')  # 워닝 무시

# 새로운 창 안 뜨고 노트북에 바로 뜸
%matplotlib inline 

In [47]:
from sklearn.datasets import load_iris

# Load the iris dataset through scikit-learn's loader
iris = load_iris()

# Display the field names exposed by the returned object
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [48]:
from sklearn.preprocessing import MinMaxScaler

# Pull the feature matrix, column labels and class labels out of the dataset
iris_data, iris_target = iris.data, iris.target
iris_col = iris.feature_names  # sepal: calyx, petal: flower petal

# Min-max scale every feature column; the fitted scaler is kept in `scaler`
scaler = MinMaxScaler()
scaler.fit(iris_data)
normalized_data = scaler.transform(iris_data)

In [49]:
from sklearn.preprocessing import MinMaxScaler

# Wrap the raw and the scaled feature matrices in DataFrames that share
# the iris feature names as column labels
df_unnormal = pd.DataFrame(iris_data, columns=iris_col)
df_normal = pd.DataFrame(normalized_data, columns=iris_col)

# Summary statistics of the raw (unscaled) features
df_unnormal.describe()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [51]:
# Compare prediction scores on normalized vs. unnormalized data

# Both splits are drawn with random_state=0
X_train1, X_test1, y_train1, y_test1 = train_test_split(df_unnormal, iris_target, random_state=0)
X_train2, X_test2, y_train2, y_test2 = train_test_split(df_normal, iris_target, random_state=0)

# One 3-nearest-neighbour classifier per feature set
clf_unnormal = KNeighborsClassifier(n_neighbors=3)
clf_unnormal.fit(X_train1, y_train1)
clf_normal = KNeighborsClassifier(n_neighbors=3)
clf_normal.fit(X_train2, y_train2)

# Accuracy on the raw features
raw_train_acc = clf_unnormal.score(X_train1, y_train1)
print('Train data unnormalized: {:.3f}'.format(raw_train_acc))
raw_test_acc = clf_unnormal.score(X_test1, y_test1)
print('Test data unnormalized: {:.3f} \n'.format(raw_test_acc))

# Accuracy on the min-max scaled features
scaled_train_acc = clf_normal.score(X_train2, y_train2)
print('Train data normalized: {:.3f}'.format(scaled_train_acc))
scaled_test_acc = clf_normal.score(X_test2, y_test2)
print('Test data normalized: {:.3f}'.format(scaled_test_acc))

Train data unnormalized: 0.964
Test data unnormalized: 0.974 

Train data normalized: 0.973
Test data normalized: 0.974


In [52]:
# Search for the best LogisticRegression hyperparameters with GridSearchCV

from sklearn.model_selection import GridSearchCV

# Base estimator for the search. It is created here so the cell runs on a
# fresh kernel (no other cell defines `logreg`). The 'saga' solver is used
# because the grid contains penalty='l1', which the default 'lbfgs' solver
# does not support; with 'lbfgs' every 'l1' candidate fails to fit.
# max_iter is raised so 'saga' has room to converge; random_state makes
# the stochastic solver repeatable.
logreg = LogisticRegression(solver='saga', max_iter=5000, random_state=0)

# Define the parameter grid
param_grid = {
    'C': [0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2']
}

# Perform grid search
# NOTE(review): the search is fitted on the full dataset, including rows
# that the next cell holds out as its test split - confirm this is intended.
grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5) # cv = 5 => 5-fold
grid_search.fit(df_normal, iris_target)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score: {:.3f}".format(best_score))


Best Parameters: {'C': 10.0, 'penalty': 'l2'}
Best Score: 0.953


In [53]:
X_train, X_test, y_train, y_test = train_test_split(df_normal, iris_target, random_state=0)

# Baseline: LogisticRegression with default hyperparameters
prev_logreg = LogisticRegression().fit(X_train, y_train)

# Labels now name the split that is actually being scored
print('Train score with default params : {:.3f}'.format(prev_logreg.score(X_train, y_train)))
print('Test score with default params : {:.3f} \n'.format(prev_logreg.score(X_test, y_test)))

# Model using the C / penalty values reported as best by the grid search
grid_logreg = LogisticRegression(C = 10.0, penalty = 'l2')
grid_logreg.fit(X_train, y_train)

print('Train score with searched params :{:.3f}'.format(grid_logreg.score(X_train, y_train)))
print('Test score with searched params :{:.3f}'.format(grid_logreg.score(X_test, y_test)))

Train score with default params : 0.929
Train score with default params : 0.895 

Test score with searched params :0.964
Test score with searched params :0.974


In [54]:
X_train, X_test, y_train, y_test = train_test_split(df_normal, iris_target, random_state=0)

# Decision tree with default settings, fitted on the normalized training split
deTree = DecisionTreeClassifier()
deTree.fit(X_train, y_train)

# Print accuracy on the training split first, then on the test split
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print('{:.3f}'.format(deTree.score(split_features, split_labels)))

1.000
0.974


In [55]:
X_train, X_test, y_train, y_test = train_test_split(df_normal, iris_target, random_state=0)

# Random forest with default hyperparameters. random_state is fixed so the
# randomized tree construction, and therefore the printed scores, are the
# same on every run.
forest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Accuracy on the training split, then on the test split
print('{:.3f}'.format(forest.score(X_train, y_train)))
print('{:.3f}'.format(forest.score(X_test, y_test)))

1.000
0.974


In [56]:
# Search for the best RandomForestClassifier hyperparameters with GridSearchCV

from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use a fresh, seeded estimator rather than the already-fitted `forest`:
# without a fixed random_state the forests built during the search differ
# between runs, so the "best" parameters would not be reproducible.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    cv=10,  # 10-fold cross-validation
)
grid_search.fit(df_normal, iris_target)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:{:.3f}".format(best_score))


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 10}
Best Score:0.973


In [57]:
X_train, X_test, y_train, y_test = train_test_split(df_normal, iris_target, random_state=0)

# Baseline: random forest with default hyperparameters (seeded so the
# printed scores are repeatable)
prev_forest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Labels now name the split that is actually being scored
print('Train score with default params : {:.3f}'.format(prev_forest.score(X_train, y_train)))
print('Test score with default params : {:.3f} \n'.format(prev_forest.score(X_test, y_test)))

# Take the winning hyperparameters directly from the grid search instead of
# retyping them, so this model always matches what the search selected.
grid_forest = RandomForestClassifier(**grid_search.best_params_, random_state=0)
grid_forest.fit(X_train, y_train)

print('Train score with searched params :{:.3f}'.format(grid_forest.score(X_train, y_train)))
print('Test score with searched params :{:.3f}'.format(grid_forest.score(X_test, y_test)))

Train score with default params : 1.000
Train score with default params : 0.974 

Test score with searched params :0.982
Test score with searched params :0.974
