## Data Load

In [None]:
!pip install ucimlrepo



In [None]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Yeast" dataset (repository id 110).
yeast = fetch_ucirepo(id=110)

# Dataset-level metadata first, then the per-variable description table.
print(yeast.metadata)
print(yeast.variables)

# Feature and target tables (pandas DataFrames) used by the later cells.
X, y = yeast.data.features, yeast.data.targets

{'uci_id': 110, 'name': 'Yeast', 'repository_url': 'https://archive.ics.uci.edu/dataset/110/yeast', 'data_url': 'https://archive.ics.uci.edu/static/public/110/data.csv', 'abstract': 'Predicting the Cellular Localization Sites of Proteins', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1484, 'num_features': 8, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['localization_site'], 'index_col': ['Sequence_Name'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1991, 'last_updated': 'Sat Oct 28 2023', 'dataset_doi': '10.24432/C5KG68', 'creators': ['Kenta Nakai'], 'intro_paper': None, 'additional_info': {'summary': 'Predicted Attribute: Localization site of protein. ( non-numeric ).\r\n\r\nThe references below describe a predecessor to this dataset and its development. They also give results (not cross-validated) for classification by a rule-based expert system with that version of th

In [None]:
# Only the last expression of a cell is echoed, so the preview has to be
# displayed explicitly; a bare `X.head()` here is evaluated and thrown away.
display(X.head())
len(X)

1484

In [None]:
# Number of target rows; it should match len(X) so features and labels align.
len(y)

1484

## Data preprocessing

### 결측치 및 범주형 데이터 처리

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Count missing values per feature column.
X.isnull().sum()

Unnamed: 0,0
mcg,0
gvh,0
alm,0
mit,0
erl,0
pox,0
vac,0
nuc,0


In [None]:
# Inspect the target frame: column name, dtype and non-null count.
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1484 entries, 0 to 1483
Data columns (total 1 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   localization_site  1484 non-null   object
dtypes: object(1)
memory usage: 11.7+ KB


In [None]:
# Count missing values in the target column.
y.isnull().sum()
# No missing values.

Unnamed: 0,0
localization_site,0


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class names of the target as integer codes.
#
# The labels are read from `yeast.data.targets` rather than from `y`, because
# this cell overwrites `y`: re-running it on the already-encoded `y` would
# re-fit the encoder on integers and lose the original site names held in
# `label_encoder.classes_`. Reading from the untouched source keeps the cell
# idempotent.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(yeast.data.targets['localization_site'])

# The integer codes are nominal identifiers for the classes; they carry no
# numeric meaning of their own.
# Keep the encoded target as a single-column DataFrame, as the later cells expect.
y = pd.DataFrame(y_encoded, columns=['localization_site'])

In [None]:
# Distinct encoded class ids present in the target.
y['localization_site'].unique()

array([6, 7, 0, 3, 2, 4, 5, 9, 8, 1])

In [None]:
# Preview the encoded target.
y.head()

Unnamed: 0,localization_site
0,6
1,6
2,6
3,7
4,6


### 스케일링

In [None]:
# Preview the raw features before any scaling is applied.
X.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22
3,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22
4,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22


In [None]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler statistics are computed over every row of X,
# including rows that the later train/test split assigns to the test set.
from sklearn.preprocessing import StandardScaler

X_data = X  # alias of X, not a copy

scaler = StandardScaler().fit(X_data)
X_scaled = scaler.transform(X_data)

# Wrap the scaled array back into a DataFrame carrying the original column names.
X_SS = pd.DataFrame(data=X_scaled, columns=X_data.columns)

X_SS.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc
0,0.581981,0.888481,-0.346645,-0.957203,-0.09759,-0.099131,-0.344175,-0.527919
1,-0.510891,1.372811,-0.231226,0.064312,-0.09759,-0.099131,0.521219,-0.527919
2,1.01913,0.969203,-0.115808,-0.811272,-0.09759,-0.099131,0.521219,-0.527919
3,0.581981,-0.483786,0.807542,-0.957203,-0.09759,-0.099131,0.694298,-0.527919
4,-0.583749,-0.483786,-0.231226,2.034375,-0.09759,-0.099131,-0.344175,-0.527919


In [None]:
# Rescale every feature column with MinMaxScaler.
# NOTE(review): the column minima/maxima are taken over every row of X,
# including rows that the later train/test split assigns to the test set.
from sklearn.preprocessing import MinMaxScaler

X_data = X  # alias of X, not a copy

scaler = MinMaxScaler().fit(X_data)
X_scaled = scaler.transform(X_data)

# Wrap the scaled array back into a DataFrame carrying the original column names.
X_MN = pd.DataFrame(data=X_scaled, columns=X_data.columns)

X_MN.head()

Unnamed: 0,mcg,gvh,alm,mit,erl,pox,vac,nuc
0,0.52809,0.551724,0.329114,0.13,0.0,0.0,0.657534,0.22
1,0.359551,0.62069,0.341772,0.27,0.0,0.0,0.726027,0.22
2,0.595506,0.563218,0.35443,0.15,0.0,0.0,0.726027,0.22
3,0.52809,0.356322,0.455696,0.13,0.0,0.0,0.739726,0.22
4,0.348315,0.356322,0.341772,0.54,0.0,0.0,0.657534,0.22


## Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Split once. The three original calls used the same test_size and
# random_state on inputs of equal length, so they all selected the same rows;
# a single split keeps that row assignment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_SS_train, y_SS_test = y_train, y_test
y_MN_train, y_MN_test = y_train, y_test


def _scale_split(scaler, train_df, test_df):
    """Fit `scaler` on the training rows only, then transform both parts.

    Returns a `(scaled_train, scaled_test)` pair of DataFrames that keep the
    column names and row index of their inputs.
    """
    scaler.fit(train_df)
    scaled_train = pd.DataFrame(
        scaler.transform(train_df), columns=train_df.columns, index=train_df.index
    )
    scaled_test = pd.DataFrame(
        scaler.transform(test_df), columns=test_df.columns, index=test_df.index
    )
    return scaled_train, scaled_test


# The scaling statistics come from the training rows only, so nothing about
# the held-out rows leaks into the features the models are trained on. The
# full-data X_SS / X_MN frames built earlier are deliberately not split here,
# because their statistics include the test rows.
X_SS_train, X_SS_test = _scale_split(StandardScaler(), X_train, X_test)
X_MN_train, X_MN_test = _scale_split(MinMaxScaler(), X_train, X_test)

## Training

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyper-parameter grid for the random-forest search (5 * 4 * 4 * 3 = 240 candidates).
rf_param_grid = {
    'n_estimators': [100],  # number of trees, held fixed
    'max_depth': [10, 20, 30, 40, 50],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10, 15],  # minimum samples needed to split an internal node
    'min_samples_leaf': [1, 2, 4, 6],  # minimum samples required at a leaf
    # Features considered at each split. The former 'auto' value is rejected by
    # the installed scikit-learn (every fit using it failed with
    # InvalidParameterError); 1.0 means "all features", which is what 'auto'
    # meant for regressors.
    'max_features': [1.0, 'sqrt', 'log2'],
}

def train_and_evaluate_model_RF(X_train, y_train, X_test, y_test, param_grid):
    """Grid-search a RandomForestRegressor and score the best model on the test split.

    Args:
        X_train: Training features.
        y_train: Training target; flattened to 1-D before fitting.
        X_test: Held-out features.
        y_test: Held-out target.
        param_grid: Hyper-parameter grid handed to GridSearchCV.

    Returns:
        Tuple ``(mae, mse, r2)`` computed from the best estimator's predictions
        on ``X_test``.

    NOTE(review): in this notebook the target is a label-encoded class id and
    the dataset metadata lists the task as classification, so a regressor and
    MAE/MSE/R^2 treat arbitrary codes as ordered quantities. Consider a
    classifier with classification metrics instead.
    """
    # The forest expects a 1-D target; handing it a single-column frame makes
    # scikit-learn emit a DataConversionWarning and flatten it internally.
    y_train_1d = np.ravel(y_train)

    rf_model = RandomForestRegressor(random_state=42)

    # 5-fold search over the whole grid using all available cores.
    rf_grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
    rf_grid_search.fit(X_train, y_train_1d)

    print(f'최적 하이퍼파라미터: {rf_grid_search.best_params_}')

    # Evaluate the best configuration on the held-out split.
    rf_best_model = rf_grid_search.best_estimator_
    rf_y_pred = rf_best_model.predict(X_test)

    mae = mean_absolute_error(y_test, rf_y_pred)
    mse = mean_squared_error(y_test, rf_y_pred)
    r2 = r2_score(y_test, rf_y_pred)

    return mae, mse, r2

# 1. StandardScaler features
print("SS 스케일링 방식:")
mae_SS, mse_SS, r2_SS = train_and_evaluate_model_RF(
    X_train=X_SS_train, y_train=y_SS_train,
    X_test=X_SS_test, y_test=y_SS_test,
    param_grid=rf_param_grid,
)
print(f'MAE: {mae_SS}, MSE: {mse_SS}, R^2: {r2_SS}\n')

# 2. MinMaxScaler features
print("MN 스케일링 방식:")
mae_MN, mse_MN, r2_MN = train_and_evaluate_model_RF(
    X_train=X_MN_train, y_train=y_MN_train,
    X_test=X_MN_test, y_test=y_MN_test,
    param_grid=rf_param_grid,
)
print(f'MAE: {mae_MN}, MSE: {mse_MN}, R^2: {r2_MN}\n')

# 3. Original, unscaled features
print("원본 X 데이터 (스케일링 없이):")
mae_X, mse_X, r2_X = train_and_evaluate_model_RF(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    param_grid=rf_param_grid,
)
print(f'MAE: {mae_X}, MSE: {mse_X}, R^2: {r2_X}')

SS 스케일링 방식:
Fitting 5 folds for each of 240 candidates, totalling 1200 fits


400 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
155 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
s

최적 하이퍼파라미터: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
MAE: 2.3565339678696726, MSE: 8.02322159585802, R^2: 0.13763732124609473

MN 스케일링 방식:
Fitting 5 folds for each of 240 candidates, totalling 1200 fits


400 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
227 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
s

최적 하이퍼파라미터: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
MAE: 2.3561957319884543, MSE: 8.032521352523126, R^2: 0.1366377523106409

원본 X 데이터 (스케일링 없이):
Fitting 5 folds for each of 240 candidates, totalling 1200 fits


400 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
258 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
s

최적 하이퍼파라미터: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
MAE: 2.3574364109185804, MSE: 8.042374172613398, R^2: 0.1355787382693595


### Ridge Regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyper-parameter grid for Ridge: regularisation strength on a log scale.
# NOTE(review): the recorded StandardScaler run selected alpha=100.0, the upper
# edge of this grid, so a wider range may be worth trying.
ridge_param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
)

def train_and_evaluate_model(X_train, y_train, X_test, y_test, param_grid):
    """Grid-search a Ridge regressor and score the best model on the test split.

    Runs a 5-fold GridSearchCV over ``param_grid``, prints the winning
    hyper-parameters, and returns ``(mae, mse, r2)`` for the best estimator's
    predictions on ``X_test``.

    NOTE(review): in this notebook the target is a label-encoded class id, so
    these regression metrics treat arbitrary codes as ordered quantities.
    """
    search = GridSearchCV(
        estimator=Ridge(random_state=42),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    print(f'최적 하이퍼파라미터: {search.best_params_}')

    # Score the best configuration on the held-out split.
    predictions = search.best_estimator_.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# 1. StandardScaler features
print("SS 스케일링 방식:")
mae_SS, mse_SS, r2_SS = train_and_evaluate_model(
    X_train=X_SS_train, y_train=y_SS_train,
    X_test=X_SS_test, y_test=y_SS_test,
    param_grid=ridge_param_grid,
)
print(f'MAE: {mae_SS}, MSE: {mse_SS}, R^2: {r2_SS}\n')

# 2. MinMaxScaler features
print("MN 스케일링 방식:")
mae_MN, mse_MN, r2_MN = train_and_evaluate_model(
    X_train=X_MN_train, y_train=y_MN_train,
    X_test=X_MN_test, y_test=y_MN_test,
    param_grid=ridge_param_grid,
)
print(f'MAE: {mae_MN}, MSE: {mse_MN}, R^2: {r2_MN}\n')

# 3. Original, unscaled features
print("원본 X 데이터 (스케일링 없이):")
mae_X, mse_X, r2_X = train_and_evaluate_model(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    param_grid=ridge_param_grid,
)
print(f'MAE: {mae_X}, MSE: {mse_X}, R^2: {r2_X}')

SS 스케일링 방식:
Fitting 5 folds for each of 6 candidates, totalling 30 fits
최적 하이퍼파라미터: {'alpha': 100.0}
MAE: 2.5734441724591277, MSE: 8.536257436567238, R^2: 0.08249451400776997

MN 스케일링 방식:
Fitting 5 folds for each of 6 candidates, totalling 30 fits
최적 하이퍼파라미터: {'alpha': 1.0}
MAE: 2.5706196347757584, MSE: 8.529778131588099, R^2: 0.08319093136724531

원본 X 데이터 (스케일링 없이):
Fitting 5 folds for each of 6 candidates, totalling 30 fits
최적 하이퍼파라미터: {'alpha': 1.0}
MAE: 2.5731044296602144, MSE: 8.534681546692987, R^2: 0.0826638959196545


### XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hyper-parameter grid for the XGBoost search (1 * 5 * 5 * 4 * 4 = 400 candidates).
xgb_param_grid = dict(
    n_estimators=[100],                            # number of trees
    learning_rate=[0.001, 0.01, 0.1, 0.2, 0.5],    # learning rate
    max_depth=[1, 3, 6, 10, 15],                   # maximum tree depth
    subsample=[0.4, 0.6, 0.8, 1.0],                # sampling ratio
    colsample_bytree=[0.4, 0.6, 0.8, 1.0],         # feature sampling ratio per tree
)

def train_and_evaluate_model_XGBoost(X_train, y_train, X_test, y_test, param_grid):
    """Grid-search an XGBRegressor and score the best model on the test split.

    Runs a 5-fold GridSearchCV over ``param_grid``, prints the winning
    hyper-parameters, and returns ``(mae, mse, r2)`` for the best estimator's
    predictions on ``X_test``.

    NOTE(review): in this notebook the target is a label-encoded class id, so
    these regression metrics treat arbitrary codes as ordered quantities.
    """
    search = GridSearchCV(
        estimator=xgb.XGBRegressor(random_state=42),
        param_grid=param_grid,
        cv=5,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X_train, y_train)

    print(f'최적 하이퍼파라미터: {search.best_params_}')

    # Score the best configuration on the held-out split.
    predictions = search.best_estimator_.predict(X_test)
    return (
        mean_absolute_error(y_test, predictions),
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# NOTE: the recorded outputs of the three runs below are identical (same best
# parameters and same metrics), i.e. the choice of scaling made no difference here.

# 1. StandardScaler features
print("SS 스케일링 방식:")
mae_SS, mse_SS, r2_SS = train_and_evaluate_model_XGBoost(
    X_train=X_SS_train, y_train=y_SS_train,
    X_test=X_SS_test, y_test=y_SS_test,
    param_grid=xgb_param_grid,
)
print(f'MAE: {mae_SS}, MSE: {mse_SS}, R^2: {r2_SS}\n')

# 2. MinMaxScaler features
print("MN 스케일링 방식:")
mae_MN, mse_MN, r2_MN = train_and_evaluate_model_XGBoost(
    X_train=X_MN_train, y_train=y_MN_train,
    X_test=X_MN_test, y_test=y_MN_test,
    param_grid=xgb_param_grid,
)
print(f'MAE: {mae_MN}, MSE: {mse_MN}, R^2: {r2_MN}\n')

# 3. Original, unscaled features
print("원본 X 데이터 (스케일링 없이):")
mae_X, mse_X, r2_X = train_and_evaluate_model_XGBoost(
    X_train=X_train, y_train=y_train,
    X_test=X_test, y_test=y_test,
    param_grid=xgb_param_grid,
)
print(f'MAE: {mae_X}, MSE: {mse_X}, R^2: {r2_X}')

SS 스케일링 방식:
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
최적 하이퍼파라미터: {'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
MAE: 2.46169376373291, MSE: 8.241408348083496, R^2: 0.1141858696937561

MN 스케일링 방식:
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
최적 하이퍼파라미터: {'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
MAE: 2.46169376373291, MSE: 8.241408348083496, R^2: 0.1141858696937561

원본 X 데이터 (스케일링 없이):
Fitting 5 folds for each of 400 candidates, totalling 2000 fits
최적 하이퍼파라미터: {'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
MAE: 2.46169376373291, MSE: 8.241408348083496, R^2: 0.1141858696937561
