In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error as MSE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor


In [47]:
# Pin NumPy's legacy global random state so any NumPy-driven randomness in
# later cells is repeatable across fresh kernel runs.
np.random.seed(seed=42)

**1. Import the training datasets**

In [48]:
# Read the feature table and the target table (file name suggests log-sales);
# in both files the first CSV column is used as the row index.
X_train, Y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("Data-Train.csv", "train_y_logsales.csv")
)


**2. Set Up Pipeline**

In [49]:

# One-step pipeline: a k-nearest-neighbours regressor registered under the
# step name "RG". `Model_1` and `RG` are two names for the same estimator.
RG = Model_1 = KNeighborsRegressor()
pipe = Pipeline(steps=[("RG", RG)])

# Candidate neighbour counts for the grid search. The "RG__" prefix routes the
# parameter to the pipeline step named "RG".
param_grid = {"RG__n_neighbors": [1, 4, 5, 6, 7, 8, 10]}

**3. Set Up GridSearch**

In [50]:
# scikit-learn scorers follow "higher is better", so MSE is reported negated.
score_metric = 'neg_mean_squared_error'

# Exhaustive search over `param_grid` with 10-fold cross-validation, using all
# available CPU cores (n_jobs=-1).
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=score_metric,
    cv=10,
    n_jobs=-1,
)

In [51]:
# Run the cross-validated grid search on the full training data.
search.fit(X_train, y=Y_train)

In [52]:
# Report the best mean CV score (a negated MSE, hence the minus sign) followed
# by the parameter setting that achieved it, one item per line.
print(
    "Best parameter (CV score=%0.3f):" % search.best_score_,
    search.best_params_,
    sep="\n",
)

Best parameter (CV score=-0.325):
{'RG__n_neighbors': 5}


In [53]:
# Tabulate every grid-search candidate, best-ranked first (rank 1 = best mean
# test score), and display the table as the cell output.
df = pd.DataFrame(search.cv_results_).sort_values(by=['rank_test_score'])
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_RG__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
2,0.007839,0.003218,0.008346,0.00166,5,{'RG__n_neighbors': 5},-0.359581,-0.276294,-0.398794,-0.272798,-0.367577,-0.340862,-0.284651,-0.304544,-0.295445,-0.344715,-0.324526,0.041364,1
6,0.007244,0.003049,0.007754,0.002803,10,{'RG__n_neighbors': 10},-0.340461,-0.269577,-0.387311,-0.289259,-0.360343,-0.375798,-0.295314,-0.299461,-0.306538,-0.34263,-0.326669,0.038023,2
3,0.00563,0.004396,0.010254,0.002766,6,{'RG__n_neighbors': 6},-0.355138,-0.27877,-0.397528,-0.287252,-0.371143,-0.348285,-0.289857,-0.301141,-0.298837,-0.36413,-0.329208,0.040305,3
5,0.005958,0.003961,0.008938,0.003329,8,{'RG__n_neighbors': 8},-0.336635,-0.271887,-0.389757,-0.299724,-0.3632,-0.366281,-0.294045,-0.31238,-0.30088,-0.358952,-0.329374,0.036895,4
4,0.007571,0.00307,0.008176,0.002752,7,{'RG__n_neighbors': 7},-0.340935,-0.278295,-0.396146,-0.294465,-0.373714,-0.363962,-0.289176,-0.305318,-0.303578,-0.365185,-0.331077,0.039618,5
1,0.00756,0.002718,0.165943,0.129911,4,{'RG__n_neighbors': 4},-0.391376,-0.275143,-0.418187,-0.265753,-0.369271,-0.371801,-0.290169,-0.324366,-0.309921,-0.347936,-0.336392,0.048893,6
0,0.007561,0.002802,0.275663,0.010562,1,{'RG__n_neighbors': 1},-0.432622,-0.379055,-0.617325,-0.439959,-0.523675,-0.439675,-0.389265,-0.470989,-0.407696,-0.492347,-0.459261,0.067693,7


**4. Best Hyperparameter**

In [55]:
# Keep only the 'params' column of the ranked results; its first row holds the
# parameter dict of the top-ranked candidate, which is printed on its own line.
parms = pd.DataFrame(df['params'])
print(*parms.iloc[0], sep="\n")

{'RG__n_neighbors': 5}


**5. Evaluate Model Log(Sales)**

In [56]:
# Best neighbour count found by the grid search. It is read straight from the
# fitted search object: the previous `parms.iloc[0][0]` lookup used an integer
# key on a label-indexed Series, which pandas deprecates as a positional access.
best_n = search.best_params_['RG__n_neighbors']

# Instantiate the KNeighborsRegressor model with the best number of neighbors
Model_2 = KNeighborsRegressor(n_neighbors=best_n)

# Hold out 20% of the training data as a test set (fixed random_state so the
# split is reproducible)
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=132
)

# Fit the model on the training portion
RG1 = Model_2
RG1.fit(X_train1, Y_train1)

# Predict once on the held-out data and reuse the result below
Y_test1_pred = RG1.predict(X_test1)

# MSE on the scale the model was trained on. The metric is imported in this
# notebook under the alias `MSE`; the bare name `mean_squared_error` is never
# imported and raises NameError on a fresh kernel.
mse_value = MSE(Y_test1, Y_test1_pred)
print(f'Mean Squared Error scaled: {mse_value}')

# Prepare a DataFrame holding true and predicted sales. Both are exponentiated
# to undo the log transform (assumes the target file stores natural-log sales,
# as its name suggests).
result = pd.DataFrame()
result['True_sales'] = np.exp(Y_test1)
result['Predicted_sales'] = np.exp(Y_test1_pred)

# MSE between the back-transformed true and predicted sales
final_mse = MSE(result['True_sales'], result['Predicted_sales'])
print(f'Mean Squared Error between true and predicted sales: {final_mse}')

Mean Squared Error scaled: 0.3286005150772502
Mean Squared Error between true and predicted sales: 1275.155521716769


  best_n = parms.iloc[0][0]['RG__n_neighbors']


**5.1 Model using original sales data**

In [57]:

# Instantiate the model with the best number of neighbors found earlier
# (`best_n` is set in the preceding evaluation cell)
Model_3 = KNeighborsRegressor(n_neighbors=best_n)

# Load the target variable from a CSV file (file name suggests untransformed
# sales); the first CSV column is used as the row index
OY_train = pd.read_csv("train_y_sales.csv", index_col=0)

# Split the data into training and testing sets, with the same test_size and
# random_state as the log-sales evaluation
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    X_train, OY_train, test_size=0.2, random_state=132
)

# Fit the model on the training data
RG2 = Model_3
RG2.fit(X_train2, Y_train2)

# Predict on the test data
Y_test2_pred = RG2.predict(X_test2)

# Calculate the Mean Squared Error on the test data. The metric is imported in
# this notebook under the alias `MSE`; the bare name `mean_squared_error` is
# never imported and raises NameError on a fresh kernel.
mse_value = MSE(Y_test2, Y_test2_pred)
print(f'Mean Squared Error on test data: {mse_value}')

Mean Squared Error on test data: 1401.85835443038
