In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [3]:
# Seed NumPy's legacy global RNG so that any later draw from np.random is repeatable.
np.random.seed(seed=42)

# 1. Data Processing and Model Definition

In [4]:
# Load the feature matrix and the sales target (as the file names suggest);
# in both files the first CSV column becomes the row index.
X_train, Y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train_X_cat.csv", "train_y_sales.csv")
)

In [5]:
# Base k-nearest-neighbours regressor built with library defaults; the grid search
# in the next section sets n_neighbors through the "RG__n_neighbors" key.
Model_1 = KNeighborsRegressor()

# 2. Data Pipelining with GridSearch 

In [6]:
# PCA is instantiated here but is not one of the pipeline steps below.
pca = PCA()

# Single-step pipeline: the step name "RG" is what gives the grid keys their "RG__" prefix.
RG = Model_1
pipe = Pipeline(steps=[("RG", RG)])

# Candidate neighbour counts to compare.
param_grid = {"RG__n_neighbors": [1, 4, 5, 6, 7, 8, 10]}

# Scores are negated MSE, so the candidate closest to zero ranks first.
score_metric = 'neg_mean_squared_error'
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=score_metric,
    cv=10,
    n_jobs=-1,
)

# Last expression of the cell: fits the search and displays the fitted object.
search.fit(X_train, Y_train)

GridSearchCV(cv=10, estimator=Pipeline(steps=[('RG', KNeighborsRegressor())]),
             n_jobs=-1, param_grid={'RG__n_neighbors': [1, 4, 5, 6, 7, 8, 10]},
             scoring='neg_mean_squared_error')

In [7]:
# Per-candidate cross-validation results as a table, best-ranked candidate first.
df = pd.DataFrame(search.cv_results_).sort_values(by="rank_test_score")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_RG__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
6,0.006704,0.000994,0.011087,0.002204,10,{'RG__n_neighbors': 10},-1070.806582,-372.236456,-1419.13538,-476.048291,-1999.222911,-1823.719177,-687.256242,-2072.327516,-361.609618,-426.097197,-1070.845937,667.882505,1
5,0.006279,0.000712,0.011264,0.001472,8,{'RG__n_neighbors': 8},-1056.140922,-453.571994,-1362.217761,-417.056468,-1818.810621,-1715.65002,-782.902966,-2361.130673,-410.587381,-419.291899,-1079.73607,669.045931,2
2,0.006506,0.000642,0.011102,0.001237,5,{'RG__n_neighbors': 5},-950.005063,-419.850886,-1314.197975,-352.891139,-1524.405823,-1623.160253,-971.01758,-2933.298854,-493.730955,-310.607134,-1089.316566,768.775241,3
3,0.006391,0.000979,0.010905,0.001683,6,{'RG__n_neighbors': 6},-971.448664,-401.724684,-1362.622011,-421.647328,-1794.525316,-1781.458861,-854.983723,-2616.507254,-416.080502,-326.076787,-1094.707513,735.271941,4
4,0.007042,0.002454,0.011228,0.002997,7,{'RG__n_neighbors': 7},-1048.046629,-483.566779,-1435.288943,-445.365022,-1973.082795,-1786.678378,-821.120239,-2320.273365,-409.864812,-407.985441,-1113.12724,683.940345,5
1,0.007601,0.002299,0.010207,0.001326,4,{'RG__n_neighbors': 4},-918.173655,-415.512658,-1316.109573,-298.140823,-1464.999604,-1633.364715,-1119.498806,-3266.542596,-411.047373,-452.388137,-1129.577794,847.853954,6
0,0.00685,0.001828,0.008346,0.001501,1,{'RG__n_neighbors': 1},-1546.012658,-934.367089,-1519.506329,-814.721519,-1858.050633,-1854.196203,-2006.10828,-2610.324841,-772.254777,-622.312102,-1453.785443,617.515846,7


# 3. Best Hyperparameter

In [8]:
# One-column frame holding each candidate's parameter dict; df is sorted by rank,
# so row 0 is the best-scoring setting.
parms = pd.DataFrame(df["params"])

# Print every entry of the top row (a single dict here), one per line.
print(*parms.iloc[0], sep="\n")

{'RG__n_neighbors': 10}


# 4. Evaluation of the Model with K-Fold for Original Data


In [9]:
# Cross-validated MSE of the tuned KNN regressor on the original sales target.
# (All imports live in the first cell of the notebook.)

# Take the neighbour count from the fitted grid search instead of repeating the
# literal, so this cell cannot drift out of sync if the grid or the data changes.
best_n = search.best_params_["RG__n_neighbors"]
Model_3 = KNeighborsRegressor(n_neighbors=best_n)

# Shuffled k-fold split; the fixed random_state keeps fold membership repeatable.
k = 10  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=132)

# One validation MSE per fold
mse_values = []

for train_index, test_index in kf.split(X_train):
    # Positional row selection for this fold's training and validation parts
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    Y_train_fold, Y_val_fold = Y_train.iloc[train_index], Y_train.iloc[test_index]

    # Fit on the training part only, then score on the held-out part
    Model_3.fit(X_train_fold, Y_train_fold)
    Y_val_fold_pred = Model_3.predict(X_val_fold)

    mse_fold = MSE(Y_val_fold, Y_val_fold_pred)
    mse_values.append(mse_fold)

# Unweighted mean of the per-fold MSE values
mean_mse = np.mean(mse_values)
print(f'Mean MSE across all folds: {mean_mse}')



Mean MSE across all folds: 1078.7775556720148


# 5. Evaluation of the Model with K-Fold for Log(Sales) Data



In [11]:
# Cross-validated MSE of the tuned KNN regressor trained on log(sales),
# reported on the log scale and on the back-transformed sales scale.
# (All imports live in the first cell of the notebook.)

# Load the log targets under their own name. Rebinding Y_train here would make
# the earlier grid-search and raw-sales cells silently train on log values if
# they were re-run after this cell.
Y_train_log = pd.read_csv("train_y_logsales.csv", index_col=0)
# X_train is the feature frame loaded earlier in the notebook.

# Best neighbour count straight from the fitted grid search. This avoids integer
# indexing into a label-indexed Series (parms.iloc[0][0]), which pandas 2.x deprecates.
best_n = search.best_params_["RG__n_neighbors"]

# Initialize the KNeighborsRegressor with the best number of neighbors
Model_2 = KNeighborsRegressor(n_neighbors=best_n)

# Shuffled k-fold split; the fixed random_state keeps fold membership repeatable.
k = 10  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=132)

# Per-fold MSE on the log scale and on the original sales scale
mse_values = []
mse_exp_values = []

for train_index, test_index in kf.split(X_train):
    # Positional row selection for this fold's training and validation parts
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    Y_train_fold, Y_val_fold = Y_train_log.iloc[train_index], Y_train_log.iloc[test_index]

    # Fit on the training part only, then score on the held-out part (log scale)
    Model_2.fit(X_train_fold, Y_train_fold)
    Y_val_fold_pred = Model_2.predict(X_val_fold)

    mse_fold = MSE(Y_val_fold, Y_val_fold_pred)
    mse_values.append(mse_fold)

    # Back-transform targets and predictions before computing MSE on the sales scale.
    # NOTE(review): np.exp assumes the file stores the natural log of sales;
    # if it was produced with log1p, this should be np.expm1 - confirm.
    result_val = pd.DataFrame(np.exp(Y_val_fold))
    result_val["Predicted_sales"] = np.exp(Y_val_fold_pred)
    mse_exp_fold = MSE(result_val[Y_val_fold.columns[0]], result_val["Predicted_sales"])
    mse_exp_values.append(mse_exp_fold)

# Unweighted means of the per-fold values
mean_mse = np.mean(mse_values)
mean_mse_exp = np.mean(mse_exp_values)
print(f'Mean MSE across all folds: {mean_mse}')
print(f'Mean MSE on original sales scale across all folds: {mean_mse_exp}')


Mean MSE across all folds: 0.3534385941204726
Mean MSE on original sales scale across all folds: 1102.4974981945268
