<a href="https://colab.research.google.com/github/Lee-Minsoo-97/Machine-Learning/blob/main/Hyperparameter_tuning_and_Stacking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#imports


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
import joblib  # for saving models if needed
import os      # for handling file paths


In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data import: locations of the train/test CSV files under the mounted drive

path_train = '/content/drive/MyDrive/Colab Notebooks/CIS508_Machine_Learning/Individual_Assignment/train.csv'
path_test = '/content/drive/MyDrive/Colab Notebooks/CIS508_Machine_Learning/Individual_Assignment/test.csv'

# Read both splits into DataFrames
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

In [None]:
# List the column names of the training frame
df_train.columns

Index(['Id', 'Open Date', 'City', 'City Group', 'Type', 'P1', 'P2', 'P3', 'P4',
       'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11', 'P12', 'P13', 'P14', 'P15',
       'P16', 'P17', 'P18', 'P19', 'P20', 'P21', 'P22', 'P23', 'P24', 'P25',
       'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35',
       'P36', 'P37', 'revenue'],
      dtype='object')

In [None]:
# Dimensions of the test frame as (rows, columns)
df_test.shape

(100000, 42)

In [None]:
# Check for missing values: total NaN count in each frame.
# A value of 0 means the frame has no missing entries.
pd.Series(
    {'train': df_train.isna().sum().sum(), 'test': df_test.isna().sum().sum()},
    name='missing values',
)

In [None]:
# Feature engineering: drop the 'Id' column and replace 'Open Date' with a
# numeric 'Restaurant Age' (in days).
from datetime import datetime

# Fixed reference date for the age computation. Using datetime.now() here made
# the feature, and every value displayed from it, change from one day to the
# next, so a re-run could never reproduce earlier results.
# NOTE(review): the specific date is an arbitrary snapshot - confirm or adjust.
REFERENCE_DATE = datetime(2024, 11, 23)


def add_restaurant_age(df, reference_date=REFERENCE_DATE):
    """Return a new frame without 'Id'/'Open Date' and with 'Restaurant Age'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain an 'Id' column and an 'Open Date' column.
    reference_date : datetime
        Date the age is measured against.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified. 'Restaurant Age' holds the whole
        number of days between 'Open Date' and ``reference_date``.

    Notes
    -----
    The function is idempotent: a frame that was already processed (no 'Id',
    no 'Open Date') is returned unchanged, so re-running the cell does not
    raise a KeyError.
    """
    out = df.drop(columns='Id', errors='ignore')
    if 'Open Date' in out.columns:
        age_days = (reference_date - pd.to_datetime(out['Open Date'])).dt.days
        # assign() appends the new column; 'Open Date' is no longer needed afterwards
        out = out.assign(**{'Restaurant Age': age_days}).drop(columns='Open Date')
    return out


df_train = add_restaurant_age(df_train)
df_test = add_restaurant_age(df_test)

In [None]:
# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,City,City Group,Type,P1,P2,P3,P4,P5,P6,P7,...,P30,P31,P32,P33,P34,P35,P36,P37,revenue,Restaurant Age
0,İstanbul,Big Cities,IL,4,5.0,4.0,4.0,2,2,5,...,5,3,4,5,5,4,3,4,5653753.0,9261
1,Ankara,Big Cities,FC,4,5.0,4.0,4.0,1,2,5,...,0,0,0,0,0,0,0,0,6923131.0,6127
2,Diyarbakır,Other,IL,2,4.0,2.0,5.0,2,3,5,...,0,0,0,0,0,0,0,0,2055379.0,4277
3,Tokat,Other,IL,6,4.5,6.0,6.0,4,4,10,...,25,12,10,6,18,12,12,6,2675511.0,4678
4,Gaziantep,Other,IL,3,4.0,3.0,4.0,2,2,5,...,5,1,3,2,3,4,3,3,4316715.0,5677


In [None]:
# Encode categorical variables
categorical_cols = ['City', 'City Group', 'Type']

# Learn the category levels from the training frame only and impose them on
# both frames. Encoding each frame independently lets drop_first=True discard a
# different baseline level in train and test, which silently mis-encodes test
# rows once the columns are aligned to the training layout.
category_dtypes = {
    col: pd.CategoricalDtype(df_train[col].astype('category').cat.categories)
    for col in categorical_cols
}

# Levels that occur only in the test frame become NaN under the training
# dtype and therefore encode as all-zero dummy columns.
df_train = pd.get_dummies(df_train.astype(category_dtypes), columns=categorical_cols, drop_first=True)
df_test = pd.get_dummies(df_test.astype(category_dtypes), columns=categorical_cols, drop_first=True)

In [None]:
# Align the test frame with the training feature layout (every training column
# except the 'revenue' target): columns absent from the test frame are created
# and filled with 0, and columns the training frame does not have are dropped.
df_test = df_test.reindex(
    columns=[col for col in df_train.columns if col != 'revenue'],
    fill_value=0,
)

In [None]:
# Separate the 'revenue' target from the predictor columns
y_train = df_train['revenue']
X_train = df_train.drop(columns='revenue')

# The test frame is used as the feature matrix as-is
X_test = df_test


In [None]:
# Standardise the features: the scaling statistics are learned from the
# training matrix only and then applied to both matrices.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# Random Forest Regressor

In [None]:
# Candidate values explored by the grid search
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator with a fixed seed
rf = RandomForestRegressor(random_state=42)

# Exhaustive search over the grid: 5-fold cross-validation scored by negative
# mean squared error, using all available cores
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search and report its parameters
rf_best = rf_grid.best_estimator_
print(f"Best Random Forest Parameters: {rf_grid.best_params_}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Random Forest Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# Build the submission file for the tuned Random Forest
rf_predictions = rf_best.predict(X_test_scaled)

# One row per prediction; the Id column is generated sequentially from 0
submission = pd.DataFrame({
    'Id': range(len(rf_predictions)),
    'Prediction': rf_predictions,
})

# Write the submission to CSV without the DataFrame index
submission.to_csv('rf_submission.csv', index=False)

print("Submission file 'rf_submission.csv' created successfully!")





Submission file 'rf_submission.csv' created successfully!


# Decision Tree Regressor

In [None]:
# Candidate values explored by the grid search
dt_params = dict(
    max_depth=[10, 15, 20, 25, 30],
    min_samples_split=[2, 3, 5, 7],
    min_samples_leaf=[1, 2, 3, 5],
    max_features=[None, 'sqrt', 'log2'],
    min_impurity_decrease=[0.0, 0.001, 0.005, 0.01],
    ccp_alpha=[0.0, 0.001, 0.005, 0.01],
)

# Base estimator with a fixed seed
dt = DecisionTreeRegressor(random_state=42)

# Exhaustive search over the grid: 5-fold cross-validation scored by negative
# mean squared error, using all available cores
dt_grid = GridSearchCV(
    estimator=dt,
    param_grid=dt_params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
dt_grid.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search and report its parameters
dt_best = dt_grid.best_estimator_
print(f"Best Decision Tree Parameters: {dt_grid.best_params_}")


Fitting 5 folds for each of 3840 candidates, totalling 19200 fits
Best Decision Tree Parameters: {'ccp_alpha': 0.0, 'max_depth': 10, 'max_features': 'sqrt', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 5, 'min_samples_split': 2}


In [None]:
# Predict the test set with the tuned Decision Tree
dt_predictions = dt_best.predict(X_test_scaled)

# Build the submission frame; the Id column is generated sequentially from 0
submission_dt = pd.DataFrame({
    'Id': range(len(dt_predictions)),
    'Prediction': dt_predictions,
})

# Write the submission to CSV without the DataFrame index
submission_dt.to_csv('dt_submission.csv', index=False)

print("Submission file 'dt_submission.csv' created successfully!")


Submission file 'dt_submission.csv' created successfully!


# MLP Regressor

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized search; single-valued entries are held fixed
mlp_params = dict(
    hidden_layer_sizes=[(50,), (100,), (100, 50)],
    activation=['relu'],
    solver=['adam'],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate=['adaptive'],
    batch_size=['auto'],
    learning_rate_init=[0.0001, 0.001],
    max_iter=[2000],
)

# Base estimator with a fixed seed
mlp = MLPRegressor(max_iter=2000, random_state=42)

# Evaluate 10 sampled parameter settings with 5-fold cross-validation, scored
# by negative mean squared error, using all available cores
mlp_random = RandomizedSearchCV(
    estimator=mlp,
    param_distributions=mlp_params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
mlp_random.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search and report its parameters
mlp_best = mlp_random.best_estimator_
print(f"Best MLP Parameters: {mlp_random.best_params_}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best MLP Parameters: {'solver': 'adam', 'max_iter': 2000, 'learning_rate_init': 0.001, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (100, 50), 'batch_size': 'auto', 'alpha': 0.0001, 'activation': 'relu'}




In [None]:
# Generate predictions for the test set.
# The MLP was fitted on the raw `y_train` values (no target normalization is
# applied anywhere before the fit), so its outputs are already on the revenue
# scale. The previous `* y_train.max()` "reverse normalization" inflated every
# prediction by the maximum training revenue and disagreed with how the same
# model is used for stacking.
mlp_predictions = mlp_best.predict(X_test_scaled)

# Build the submission frame; the Id column is generated sequentially from 0
submission_mlp = pd.DataFrame({
    'Id': range(0, len(mlp_predictions)),
    'Prediction': mlp_predictions
})

# Save to CSV without the DataFrame index
submission_mlp.to_csv('mlp_submission.csv', index=False)
print("Submission file 'mlp_submission.csv' created successfully!")

Submission file 'mlp_submission.csv' created successfully!


# Stacking

In [None]:
from sklearn.linear_model import LinearRegression

# Out-of-fold predictions for the meta-model's training features.
# Calling `.predict(X_train_scaled)` on the tuned models would score rows they
# were already fitted on, so the meta-model would learn from leaked,
# over-optimistic inputs. `cross_val_predict` instead clones each tuned
# estimator, fits the clone on the other folds and predicts the held-out fold;
# every training row is predicted by a model that never saw it, and the fitted
# `*_best` models themselves are left untouched.
train_preds_rf = cross_val_predict(rf_best, X_train_scaled, y_train, cv=5)
train_preds_dt = cross_val_predict(dt_best, X_train_scaled, y_train, cv=5)
train_preds_mlp = cross_val_predict(mlp_best, X_train_scaled, y_train, cv=5)

# Test-set features come from the tuned base models themselves
test_preds_rf = rf_best.predict(X_test_scaled)
test_preds_dt = dt_best.predict(X_test_scaled)
test_preds_mlp = mlp_best.predict(X_test_scaled)

# Combine predictions into new feature sets (one column per base model)
stack_train = np.column_stack((train_preds_rf, train_preds_dt, train_preds_mlp))
stack_test = np.column_stack((test_preds_rf, test_preds_dt, test_preds_mlp))

# Initialize meta-model
meta_model = LinearRegression()

# Train meta-model on stacked features
meta_model.fit(stack_train, y_train)

# Predict on test set
final_predictions = meta_model.predict(stack_test)


In [None]:
# Build the submission frame for the stacked model; the Id column is generated
# sequentially from 0
submission_stacked = pd.DataFrame({
    'Id': range(len(final_predictions)),
    'Prediction': final_predictions,
})

# Write the submission to CSV without the DataFrame index
submission_stacked.to_csv('stacked_submission.csv', index=False)

print("Submission file 'stacked_submission.csv' created successfully!")


Submission file 'stacked_submission.csv' created successfully!


# Hyperparameter tuning of the stacking meta-model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Extended hyperparameter grid for the Random Forest meta-model
meta_params = dict(
    n_estimators=[100, 200, 300],          # number of trees
    max_depth=[10, 20, None],              # maximum tree depth
    min_samples_split=[2, 5, 10],          # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],            # minimum samples required at a leaf
    max_features=['sqrt', 'log2', None],   # features considered at each split
    bootstrap=[True, False],               # whether to use bootstrap sampling
)

# Meta-model estimator with a fixed seed
meta_rf = RandomForestRegressor(random_state=42)

# Exhaustive search over the grid: 10-fold cross-validation scored by negative
# mean squared error, using all available cores
meta_grid = GridSearchCV(
    estimator=meta_rf,
    param_grid=meta_params,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

# Fit on the stacked training features
meta_grid.fit(stack_train, y_train)

# Keep the best meta-model found by the search and report its parameters
meta_best = meta_grid.best_estimator_
print(f"Best Meta-Model Parameters: {meta_grid.best_params_}")

# Final predictions on the stacked test features
final_predictions = meta_best.predict(stack_test)

# Build the submission frame; the Id column is generated sequentially from 0
submission_stacked = pd.DataFrame({
    'Id': range(len(final_predictions)),
    'Prediction': final_predictions,
})

# Write the submission to CSV without the DataFrame index.
# NOTE(review): this is the same filename the linear meta-model submission is
# written to, so that file is overwritten - confirm this is intended.
submission_stacked.to_csv('stacked_submission.csv', index=False)
print("Tuned Stacked Model submission file 'stacked_submission.csv' created!")



Fitting 10 folds for each of 486 candidates, totalling 4860 fits
Best Meta-Model Parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Tuned Stacked Model submission file 'stacked_submission.csv' created!
