# 사이킷런 라이브러리
25.07.16 오전수업

```
conda create -n hi_ml_env python=3.10
conda activate hi_ml_env
conda config --add channels conda-forge
conda config --set channel_priority strict
conda config --show channels

conda install -c conda-forge numpy pandas scikit-learn=1.4.2 xgboost=1.6.2 lightgbm=3.3.2 imbalanced-learn=0.11 jupyter matplotlib seaborn hyperopt=0.2.7

```

In [None]:
# Load the diabetes dataset from sklearn.datasets.
from sklearn.datasets import load_diabetes
# NOTE: this rebinds `load_diabetes` from the loader function to the object it
# returns; later cells read `.data` / `.feature_names` through this name.
load_diabetes = load_diabetes()
load_diabetes.keys()

In [None]:
load_diabetes

In [None]:
import pandas as pd

In [None]:
# Wrap the diabetes feature matrix in a DataFrame labelled with its feature names.
df = pd.DataFrame(load_diabetes.data, columns=load_diabetes.feature_names)
df.head()

# 머신러닝 프로세스
1. 데이터 수집
2. 데이터 전처리
3. 데이터 탐색

4. 모델 선택
5. 모델 학습
6. 모델 평가
7. 모델 개선
8. 모델 배포

## 데이터수집

In [None]:
# Load the iris dataset from sklearn.datasets.
from sklearn.datasets import load_iris
# NOTE: rebinding `load_iris` to the returned dataset object hides the loader
# function under this name until it is imported again.
load_iris = load_iris()
iris_data = load_iris.data
iris_data  # X (feature matrix)

In [None]:
load_iris.target_names, load_iris.feature_names

In [None]:
iris_label = load_iris.target #y
iris_label

## 데이터분할

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state pins the split so it
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2, random_state=42)


## 모델 생성

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf

## 모델 학습

In [None]:
# 모델 학습
dt_clf.fit(X_train, y_train)

## 모델 예측

In [None]:
pred = dt_clf.predict(X_test)
pred

## 평가

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

# 의사결정 나무 시각화

In [None]:
import graphviz
from sklearn.tree import export_graphviz

In [None]:
# Write the fitted tree to a Graphviz DOT file, labelling classes by name.
export_graphviz(dt_clf, out_file="tree.dot", 
                class_names=load_iris.target_names,)
# Read the DOT text back and hand it to graphviz for rendering.
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

# 교차검증

In [None]:
from sklearn.model_selection import KFold
# 5-fold splitter; the shuffle option is left at its default.
kfold = KFold(n_splits=5)
# Classifier that the cross-validation loop below refits on every fold.
iris_clf = DecisionTreeClassifier(random_state=156)
iris_data, iris_label

In [None]:
# Manual K-fold cross-validation: refit the classifier on each training fold
# and print its accuracy on the matching held-out fold.
for train_index, test_index in kfold.split(iris_data):
    # The index arrays select the rows that belong to this fold.
    X_train, X_test = iris_data[train_index], iris_data[test_index]
    y_train, y_test = iris_label[train_index], iris_label[test_index]
    
    iris_clf.fit(X_train, y_train)
    pred = iris_clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred)
    print(f"Accuracy: {accuracy:.4f}")

# stratified split

In [None]:
# Re-import the loader (the name was rebound to a dataset object earlier).
from sklearn.datasets import load_iris
iris = load_iris()
# Put features and label in one DataFrame so the class balance can be inspected.
iris_df = pd.DataFrame(data = iris.data, columns=iris.feature_names )
iris_df['label']  = iris.target
# Number of samples per class.
iris_df['label'].value_counts()

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)
# The label column is passed as the stratification target; each iteration
# prints the per-class counts of the train and test folds for comparison.
for train_index, test_index in skf.split(iris_df, iris_df['label']):
  y_train = iris_df['label'].iloc[train_index]
  y_test =  iris_df['label'].iloc[test_index]
  print(f'train 분포: {y_train.value_counts()}')
  print(f'test 분포: {y_test.value_counts()}')

In [None]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5)

# Stratified 5-fold cross-validation done by hand on the iris DataFrame.
for train_index, test_index in skf.split(iris_df, iris_df['label']):
  # Positional row selection for this fold.
  train_data = iris_df.iloc[train_index]
  test_data = iris_df.iloc[test_index]
  
  # Feature columns only (the 'label' column is excluded).
  X_train = train_data[iris.feature_names]
  X_test = test_data[iris.feature_names]
  
  y_train = train_data['label']
  y_test =  test_data['label']
  
  # A fresh classifier is created for every fold.
  model = DecisionTreeClassifier(random_state=156)
  model.fit(X_train, y_train)
  pred = model.predict(X_test)
  acc = accuracy_score(y_test, pred)
  print(f'acc : {acc}')  

# cross_val_score()의 적용

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

In [None]:
X = iris.data
y = iris.target

# Score the tree with 5-fold cross-validation, using accuracy as the metric.
model = DecisionTreeClassifier(random_state=156)
cross_val_score(model,X,y,cv=5, scoring='accuracy' )

In [None]:
skf = StratifiedKFold(n_splits=5)
cross_val_score(model,X,y,cv=skf, scoring='accuracy')

In [None]:
results = cross_validate(model, X,y, cv=skf, scoring='accuracy')
results['test_score']

# GridSearchCV - 하이퍼파라미터 + 교차검증

In [None]:
X_train, X_test, y_train, y_test = train_test_split(iris_data, 
                                                  iris_label,
                                                  test_size=0.2,
                                                  random_state=121)

In [None]:
from sklearn.model_selection import GridSearchCV
dt_clf2 = DecisionTreeClassifier(random_state=121)
# Search grid: 3 depths x 2 split thresholds = 6 parameter combinations.
params = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}
# Each combination is scored with 3-fold CV; refit=True retrains the best one
# on the full training split afterwards.
grid_dtree = GridSearchCV(dt_clf2, param_grid=params, cv=3, refit=True)
grid_dtree.fit(X_train, y_train)

In [None]:
pd.DataFrame(grid_dtree.cv_results_)

In [None]:
# Evaluate the refit best estimator from the grid search on the test split.
b_model = grid_dtree.best_estimator_
pred = b_model.predict(X_test)
accuracy_score(y_test, pred)

In [None]:
grid_dtree.best_params_

# 의사결정 나무 시각화

In [None]:
import graphviz
from sklearn.tree import export_graphviz

In [None]:
# Export the fitted tree to a Graphviz DOT file and render it.
# `load_iris` was re-imported as the loader *function* in an earlier cell, so
# `load_iris.target_names` no longer exists here; the class names are read
# from the loaded dataset object `iris` instead.
export_graphviz(dt_clf, out_file="tree.dot",
                class_names=iris.target_names)
with open("tree.dot") as f:
    dot_graph = f.read()
graphviz.Source(dot_graph)

# 25.07.17(목) 오전수업

# 피처 스케일링

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df.mean()

In [None]:
iris_df.var()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature, then report mean and variance per column
# (axis=0) so they can be compared feature by feature with the statistics of
# the raw DataFrame; without an axis the whole array collapses to one number.
scaler = StandardScaler()
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)
iris_scaled.mean(axis=0), iris_scaled.var(axis=0)

# LogisticRegression 모델
- 스케일링 안한 데이터
- 스케일링 한 데이터

In [None]:
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(iris.data, 
                                                    iris.target, 
                                                    test_size=0.2, 
                                                    random_state=142 )

In [None]:
no_scaling_model = LogisticRegression()
no_scaling_model.fit(X_train, y_train)
pred_1 = no_scaling_model.predict(X_test)
acc_1 = accuracy_score(y_test, pred_1)
acc_1

In [None]:
from sklearn.preprocessing import StandardScaler
scaler1 = StandardScaler()
# Fit on the training split only, then apply the same transform to both
# splits so the test data does not influence the scaling parameters.
scaler1.fit(X_train)
X_train_scaled = scaler1.transform(X_train)
X_test_scaled = scaler1.transform(X_test)

In [None]:
scaling_model = LogisticRegression()
scaling_model.fit(X_train_scaled, y_train)
pred_2 = scaling_model.predict(X_test_scaled)
acc_2 = accuracy_score(y_test, pred_2)
acc_2, acc_1

아이리스 데이터는 네 개의 피처가 모두 cm 단위로 측정되어 값의 범위가 이미 서로 비슷함.
따라서 표준화(StandardScaler)를 추가로 적용해도 성능 향상을 기대하기 어렵고,
오히려 피처 간 크기 차이가 담고 있던 정보가 줄어들어 정확도가 더 낮게 나올 수 있음.


# 타이타닉 titanic dataset 으로 분류 모델 만들기

In [None]:
# Load the Titanic CSV from the local data directory.
df = pd.read_csv('./data/titanic.csv')
df.head()

df.info()

# Use two columns as features and 'Survived' as the target.
X = df[['Pclass', 'SibSp']]
y = df['Survived']

In [None]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,                 # DataFrame / Series inputs
    test_size=0.2,        # test set fraction (20%)
    random_state=42       # keep the split reproducible
)


In [None]:
# Create the model
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
# Train the model
dt_clf.fit(X_train, y_train)
# Predict on the test split
pred = dt_clf.predict(X_test)

# Evaluate the model
accuracy_score(y_test, pred)

# 타이타닉 생존여부 분류 모델

titanic dataset 으로 분류 모델 만들기

In [None]:
df = pd.read_csv('./data/titanic.csv')
df.head()

In [None]:
df.info()

In [None]:
# Use two columns as features and 'Survived' as the target.
X = df[['Pclass', 'SibSp']]
y = df['Survived']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,                 # DataFrame / Series inputs
    test_size=0.2,        # test set fraction (20%)
    random_state=42       # keep the split reproducible
)

In [None]:
# Create the model
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
# Train the model
dt_clf.fit(X_train, y_train)
# Predict on the test split
pred = dt_clf.predict(X_test)

In [None]:
# Evaluate the model
accuracy_score(y_test, pred)

# 와인데이터로 스케일링하기

https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-dataset

In [None]:
from sklearn.datasets import load_wine

In [None]:
wine = load_wine()
wine_df = pd.DataFrame(data=wine.data, 
                       columns=wine.feature_names)
wine_df.head(2)

In [None]:
wine_df['target'] = wine.target
wine_df.info()

In [None]:
# Distinct target values and their sample counts, ordered by class index
wine_df['target'].value_counts().sort_index()

In [None]:
X = wine.data
y = wine.target

In [None]:
# Logistic regression, 80/20 train/test split, random_state=11

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,
                                                    random_state=11 )
lr_clf = LogisticRegression(max_iter=3000) # raised iteration cap so the solver can converge
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
accuracy_score(y_test, lr_pred)

# StandardScaling 후 성능

In [None]:
# Fit the scaler on the training split only and reuse it for the test split.
scaler= StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Same logistic regression setup, trained and evaluated on the scaled features.
lr_clf = LogisticRegression(max_iter=3000) # raised iteration cap so the solver can converge
lr_clf.fit(X_train_scaled, y_train)
lr_pred_scaled = lr_clf.predict(X_test_scaled)
accuracy_score(y_test, lr_pred_scaled)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# 알고리즘에 따른 스케일링의 효과 확인

In [None]:
# Train three classifiers on the unscaled features and print test accuracy.
# RandomForest is seeded so repeated runs of this cell report the same
# accuracy instead of a value that drifts with random tree construction.
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'RF': RandomForestClassifier(random_state=42),
    'LR': LogisticRegression(max_iter=3000),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name} 모델의 정확도: {acc:.5f}')

# 스케일링 이후 데이터

In [None]:
# Train three classifiers on the standardized features and print test accuracy.
# RandomForest is seeded so repeated runs of this cell report the same
# accuracy instead of a value that drifts with random tree construction.
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'RF': RandomForestClassifier(random_state=42),
    'LR': LogisticRegression(max_iter=3000),
}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f'{name} 모델의 정확도: {acc:.5f}')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Null handling
def fillna(df):
    """Fill missing values in the Age, Cabin, Embarked and Fare columns.

    Age is filled with the column mean, Cabin and Embarked with the
    placeholder 'N', and Fare with 0. The frame is updated in place and
    the same object is returned.
    """
    # Assign the filled column back rather than calling
    # df[col].fillna(..., inplace=True): that chained in-place form is
    # deprecated in pandas 2.x and leaves `df` unchanged under copy-on-write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Drop features the learning algorithm does not need
def drop_features(df):
    """Remove the PassengerId, Name and Ticket columns in place; return ``df``."""
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Label-encode the categorical columns
def format_features(df):
    """Reduce Cabin to its first character, then label-encode Cabin, Sex and Embarked.

    Each column gets its own LabelEncoder fitted on that column's values.
    The frame is updated in place and returned.
    """
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        values = df[column]
        encoder = LabelEncoder().fit(values)
        df[column] = encoder.transform(values)
    return df

# Run the preprocessing steps in order
def transform_features(df):
    """Apply fillna, drop_features and format_features to ``df``, in that order."""
    return format_features(drop_features(fillna(df)))