2023-1 ML 교육세션 실습 자료를 참고했습니다.

In [1]:
# 데이터셋 불러오기
# spaceship_tt.csv 데이터프레임을 이용해, "Transported"값을 분류 대상으로 삼는 decision tree classifier 만들기
import numpy as np
import pandas as pd
df = pd.read_csv('spaceship_tt.csv')

In [2]:
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

PassengerId : 승객 ID

HomePlanet : 출발 행성(거주지)

CryoSleep : 취침 방식 여부

Cabin : 객실 종류 및 번호 (port : 좌현, starboard : 우현)

Destination : 목적지

Age : 승객의 나이

VIP : 승객의 VIP 서비스 유무

RoomService, FoodCourt, ShoppingMall, Spa, VRDeck : 승객이 해당 서비스에 대해 지불한 금액

Name : 이름

Transported : 도착 여부

In [5]:
# 모델 라이브러리 import
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [6]:
# base dataframe
df_base=df[['CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported']]

In [7]:
df_base.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_base.dropna(inplace=True)


In [8]:
df_base

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...
8688,False,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,True,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,False,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,False,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [9]:
# Model build
dt_clf = DecisionTreeClassifier(random_state=111)

# Train/test split 진행 (7:3)
X_train, X_test, y_train, y_test = train_test_split(df_base.drop('Transported',axis=1),
                                                    df_base['Transported'], # target variable = Transported
                                                    test_size=0.3, random_state=111)
# 모델 훈련
dt_clf = dt_clf.fit(X_train,y_train)

In [10]:
X_train

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
5664,False,10.0,False,0.0,0.0,0.0,0.0,0.0
6303,False,28.0,False,32.0,0.0,304.0,291.0,2.0
963,False,0.0,False,0.0,0.0,0.0,0.0,0.0
6375,False,21.0,False,1244.0,0.0,1119.0,0.0,3.0
8299,False,34.0,False,0.0,3126.0,0.0,112.0,15.0
...,...,...,...,...,...,...,...,...
5017,True,38.0,False,0.0,0.0,0.0,0.0,0.0
5787,False,33.0,True,0.0,3095.0,0.0,197.0,40.0
2429,True,34.0,False,0.0,0.0,0.0,0.0,0.0
8421,False,39.0,False,2101.0,41.0,0.0,525.0,19.0


In [11]:
# 평가지표 출력
from sklearn.metrics import accuracy_score
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print(accuracy)

0.7601102941176471


# 다양한 모델을 통한 평가

In [13]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# 랜덤 포레스트, GBM, XGBoost, LightGBM model 별로 평가 수행
# n_estimators를 변경 가능
# 모델별로 추가 parameter 튜닝 가능
rf_reg = RandomForestRegressor(n_estimators=200)
gbm_reg = GradientBoostingRegressor(n_estimators=200)
xgb_reg = XGBRegressor(n_estimators=200)
lgbm_reg = LGBMRegressor(n_estimators=200)

In [14]:
model = rf_reg  # random forest


# 모델과 학습/테스트 데이터 셋을 입력하면 성능 평가 수치를 반환하는 함수 정의
from sklearn.metrics import mean_squared_error # regressor를 위한 평가 지표
def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    if is_expm1:
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)

    mse = mean_squared_error(y_test, pred)

    print('###', model.__class__.__name__, '###')
    print('Mean Squared Error:', mse)


In [15]:
get_model_predict(model,X_train, X_test, y_train, y_test, is_expm1=True)

### RandomForestRegressor ###
Mean Squared Error: 0.46792006260498314


In [16]:
# 모델과 학습/테스트 데이터 셋을 입력하면 성능 평가 수치를 반환
def get_two_models_predict(model1, model2, X_train, X_test, y_train, y_test, is_expm1=False):
    model1.fit(X_train, y_train)
    model1_pred = model1.predict(X_test)
    model2.fit(X_train, y_train)
    model2_pred = model2.predict(X_test)

    pred = 0.4*model1_pred + 0.6*model2_pred # 각 모델의 반영 가중치 설정

    if is_expm1 :
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)
    mse = mean_squared_error(y_test, pred)

    print('### Two Models ###')
    print('Mean Squared Error:', mse)

In [17]:
model1 = rf_reg
model2 = gbm_reg  # GBM Model (Boosting)

get_two_models_predict(model1, model2, X_train, X_test, y_train, y_test, is_expm1=True)


### Two Models ###
Mean Squared Error: 0.45119427362026243


## 두 과제를 "ML_과제_(이름).ipynb" 의 파일명으로 제출해 주세요!


### 과제1
비지도 학습의 목적인 Grouping에 대해 자세히 설명해 주세요! Dataset을 Grouping한다는 것이 무슨 의미인지 간단히 서술해 주시고,

세션에서 언급되었던 Grouping을 위한 두 수단에 대해서 예시 모델을 제시하여 설명해 주시면 됩니다! 예시 모델의 종류에 대해서는 구글링을 통해 조사할 수 있겠습니다.


Grouping is the prime feature of unsupervised learning, discovering hidden patterns in unlabelled datasets by identifying similarities between unlabelled datapoints and tying them up into a group. This grouping is an essential step to dividing a humongous piece of unlabelled dataset into digestible chunks for a machine learning system. There are two main grouping methods by which data is reduced: clustering and dimensionality reduction. In simple terms, clustering reduces the number of examples and dimensionality reduction reduces the number of dimensions of a dataset for machine learning. 

Given a set of examples, clustering divides this set into subsets of examples based on their similarities. K-means clustering is a key similarity measurement technique where random unlabelled samples are chosen as initial means and the K-means algorithm will iterate over certain calculations to create centroids until their convergence. The algorithm will then identify the k number of centroids to allocate datapoints to the nearest cluster without unnecessarily raising the number of centroids. In this way, the K-means clustering algorithm groups datapoints with their minimum variation from the cluster's centroid. K-means clustering works particularly well for cases where clusters are spherical, sufficiently separated, and have similar volumes as well as similar numbers of points. 

Dimensionality reduction, however, convert the dimension of a set of datapoints from 'd' to 'r' given r < d with summary indices, that retain the original information. The new values created under a smaller set of these summary indices are called principal components. These components are variables that are branches of the original variables, newly transformed through linear combinations. Principal Component Analysis, often abbreviated as PCA, executes this dimensionality reduction by finding an r-dimension projection that best preserves the variance between datapoints. The first step of PCA is to compute the mean vector µ and covariance matrix Σ of original datapoints. Then the eigenvectors and eigenvalues of Σ will be computed. The top r eigenvectors will be selected, and the points will be projected onto the subspace spanned by these eigenvectors: y=A(x-µ) where y is the new point, x the original point, and rows of A represent the eigenvectors.


References: 
1) https://statisticsbyjim.com/basics/k-means-clustering/
2) https://statisticsbyjim.com/basics/principal-component-analysis/

### 과제2
드라이브에 첨부된 diabetes.csv 파일을 이용해, decision tree를 이용한 classifier model을 디자인해 주세요!

분류 문제에 맞는 (간단한 수준의) 데이터 전처리 및 Feature selection이 선행되어야 합니다.

모델의 성능은 상관 없지만, 모델 구동 결과 및 평가지표의 출력은 정상적으로 이루어져야 합니다! 평가지표의 종류 또한 상관 없습니다.

데이터셋에 대한 설명은 아래 캐클 링크를 참조해 주세요!
https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

과제 진행에 있어 어려움을 겪으신다면 아래 링크의 내용을 참조하시면 됩니다!
https://www.datacamp.com/tutorial/decision-tree-classification-python


In [87]:
# Target variable: Outcome
# Predictor variables include the number of pregnancies a patient has had and one's BMI, insulin level, age, and so on.
# Goal: Building a machine learning model that predicts the diabetes outcome of a patient based on their given health information

import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


col_names = ['pregnant', 'glucose',
             'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']

data = pd.read_csv("diabetes.csv", header=None, names=col_names, skiprows=1)
data.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [88]:
data.dropna(inplace=True)

In [89]:
data.isna().sum()

pregnant    0
glucose     0
bp          0
skin        0
insulin     0
bmi         0
pedigree    0
age         0
label       0
dtype: int64

In [90]:
feature_cols = ['pregnant', 'glucose','bp','skin','insulin','bmi','pedigree','age']
X = data[feature_cols]
Y = data.label

In [91]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=8) # 70% for training and 30 for testing

In [92]:
classifier_object = DecisionTreeClassifier()
classifier_object.fit(X_train, Y_test)
Y_pred = classifier_object.predict(X_test)

In [93]:
print(f"Accuracy:{metrics.accuracy_score(Y_test, Y_pred)}")

Accuracy:0.7142857142857143


In [94]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

rf_reg = RandomForestRegressor(n_estimators=20)
gbm_reg = GradientBoostingRegressor(n_estimators=20)
xgb_reg = XGBRegressor(n_estimators=20)
lgbm_reg = LGBMRegressor(n_estimators=20)

In [95]:
model = rf_reg  # Random forest

from sklearn.metrics import mean_squared_error # mean_squared_error for regressor
def get_model_predict(model, X_train, X_test, y_train, y_test, is_expm1=False):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    if is_expm1:
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)

    mse = mean_squared_error(y_test, pred)

    print('###', model.__class__.__name__, '###')
    print('Mean Squared Error:', mse)


In [98]:
get_model_predict(model,X_train, X_test, Y_train, Y_test, is_expm1=True)

### RandomForestRegressor ###
Mean Squared Error: 0.5099344425013486


In [101]:
def get_two_models_predict(model1, model2, X_train, X_test, y_train, y_test, is_expm1=False):
    model1.fit(X_train, y_train)
    model1_pred = model1.predict(X_test)
    model2.fit(X_train, y_train)
    model2_pred = model2.predict(X_test)

    pred = 0.5*model1_pred + 0.5*model2_pred 

    if is_expm1 :
        y_test = np.expm1(y_test)
        pred = np.expm1(pred)
    mse = mean_squared_error(y_test, pred)

    print('### Two Models ###')
    print('Mean Squared Error:', mse)

In [102]:
model1 = rf_reg
model2 = gbm_reg  # GBM Model (Boosting)

get_two_models_predict(model1, model2, X_train, X_test, Y_train, Y_test, is_expm1=True)

### Two Models ###
Mean Squared Error: 0.4870814595437499
