In [14]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression,Lasso, ElasticNet, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import lightgbm as lgb


TRAIN_PATH = 'data/train_modified.csv'
TEST_PATH = 'data/test_modified.csv'

In [41]:
data_train = pd.read_csv(TRAIN_PATH,index_col=0)
data_test = pd.read_csv(TEST_PATH,index_col=0)
df_test = pd.read_csv('data/test.csv')

In [7]:
def split_x_and_y(data):
    y = data['Item_Outlet_Sales']
    X = data.drop('Item_Outlet_Sales',axis=1)
    return X, y

In [8]:
X, y = split_x_and_y(data_train)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### 1. ML Models with train_test_split testing strategy

In [13]:
def run_train_test_strategy(X_train, y_train, X_test, y_test,model):
    model.fit(X_train, y_train)
    y_pred_lin = model.predict(X_test)
    print(np.sqrt(mean_squared_error(y_pred_lin,y_test)))

In [16]:
print("Linear Regression with train test split strategy")
run_train_test_strategy(X_train, y_train, X_test, y_test,LinearRegression())

print("DecisionTree Regression with train test split strategy")
run_train_test_strategy(X_train, y_train, X_test, y_test,DecisionTreeRegressor())

print("RF Regression with train test split strategy")
run_train_test_strategy(X_train, y_train, X_test, y_test,RandomForestRegressor())

Linear Regression with train test split strategy
1151.488338169082
DecisionTree Regression with train test split strategy
1604.6756264399035
RF Regression with train test split strategy
1193.7423095689046


### 2. ML Models with k fold validation strategy

In [18]:
def run_k_fold_strategy(X, y, cv, model):
    scores = cross_val_score(model, X, y, cv=5,scoring='neg_mean_squared_error')
    scores = scores * -1
    for i, score in enumerate(scores):
        print("CV No: ", i)
        print(np.sqrt(score))

In [20]:
print("Linear Regression with train test split strategy")
run_k_fold_strategy(X, y, 5, LinearRegression())


print("RF Regression with train test split strategy")
run_k_fold_strategy(X, y, 5, RandomForestRegressor())

Linear Regression with train test split strategy
CV No:  0
1151.2245864639694
CV No:  1
1125.1828511536687
CV No:  2
1116.038247055086
CV No:  3
1132.0195240632083
CV No:  4
1143.9074383182465
RF Regression with train test split strategy
CV No:  0
1185.6906358121576
CV No:  1
1203.8891436603037
CV No:  2
1189.4675023608359
CV No:  3
1182.2101774735338
CV No:  4
1177.8739141160202


In [21]:
def cross_val_testing(model, X, y, cv=5):
    kf = KFold(cv, shuffle=True, random_state=42).get_n_splits(X)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf))
    return rmse

In [43]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha = 0.0005, random_state=1))

In [23]:
cross_val_testing(lasso, X, y)



array([1151.22447498, 1125.18160288, 1116.03640569, 1132.01785519,
       1143.90703763])

In [24]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [25]:
cross_val_testing(ENet, X, y)



array([1151.2400293 , 1125.16970889, 1116.00377933, 1131.99357003,
       1143.90884957])

In [26]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [27]:
cross_val_testing(KRR, X, y)

array([1099.98704966, 1086.74714364, 1070.86742772, 1079.18035022,
       1089.18426198])

### 3. Solution Submissions

In [38]:
KRR.fit(X, y)
ypred = KRR.predict(data_test)
df_test['Item_Outlet_Sales'] = ypred
df_test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']].to_csv('out/submission_krr.csv',index=False)

In [40]:
ENet.fit(X, y)
ypred = ENet.predict(data_test)
df_test['Item_Outlet_Sales'] = ypred
df_test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']].to_csv('out/submission_enet.csv',index=False)



In [45]:
lasso.fit(X, y)
ypred = lasso.predict(data_test)
df_test['Item_Outlet_Sales'] = ypred
df_test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']].to_csv('out/submission_lasso.csv',index=False)

