In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression,Lasso, ElasticNet, BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import lightgbm as lgb


TRAIN_PATH = 'data/train_modified.csv'
TEST_PATH = 'data/test_modified.csv'

In [2]:
data_train = pd.read_csv(TRAIN_PATH,index_col=0)
data_test = pd.read_csv(TEST_PATH,index_col=0)
df_test = pd.read_csv('data/test.csv')

In [4]:
data_train.head()

Unnamed: 0,Item_MRP,Item_Outlet_Sales,Item_Visibility,Item_Weight,Outlet_Location_Type,Outlet_Size,Outlet_Age,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,...,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,1.752511,3735.138,-1.079161,-0.751014,0,1,14,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,-1.493696,443.4228,-1.014187,-1.477653,2,1,4,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,0.009874,2097.27,-1.064827,1.011839,0,1,14,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.661838,732.38,-1.043223,1.377308,2,2,15,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,-1.403623,994.7052,-1.205201,-0.830557,2,0,26,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
def split_x_and_y(data):
    y = data['Item_Outlet_Sales']
    X = data.drop('Item_Outlet_Sales',axis=1)
    return X, y

In [6]:
X, y = split_x_and_y(data_train)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

### 1. ML Models with train_test_split testing strategy

In [8]:
def run_train_test_strategy(X_train, y_train, X_test, y_test,model):
    model.fit(X_train, y_train)
    y_pred_lin = model.predict(X_test)
    print(np.sqrt(mean_squared_error(y_pred_lin,y_test)))

In [9]:
print("Linear Regression with train test split strategy")
run_train_test_strategy(X_train, y_train, X_test, y_test,LinearRegression())

print("DecisionTree Regression with train test split strategy")
run_train_test_strategy(X_train, y_train, X_test, y_test,DecisionTreeRegressor())

print("RF Regression with train test split strategy")
run_train_test_strategy(X_train, y_train, X_test, y_test,RandomForestRegressor())

Linear Regression with train test split strategy
1128.603221651405
DecisionTree Regression with train test split strategy
1494.3268352385323
RF Regression with train test split strategy
1142.6684761046963


### 2. ML Models with k fold validation strategy

In [10]:
def run_k_fold_strategy(X, y, cv, model):
    scores = cross_val_score(model, X, y, cv=5,scoring='neg_mean_squared_error')
    scores = scores * -1
    for i, score in enumerate(scores):
        print("CV No: ", i)
        print(np.sqrt(score))

In [11]:
print("Linear Regression with train test split strategy")
run_k_fold_strategy(X, y, 5, LinearRegression())


print("RF Regression with train test split strategy")
run_k_fold_strategy(X, y, 5, RandomForestRegressor())

Linear Regression with train test split strategy
CV No:  0
1152.9755207544936
CV No:  1
1123.1388747238962
CV No:  2
1115.3036683332425
CV No:  3
1129.8403462011713
CV No:  4
1143.09341096601
RF Regression with train test split strategy
CV No:  0
1164.6333006998457
CV No:  1
1191.851410880866
CV No:  2
1163.4863519200042
CV No:  3
1162.7729216556982
CV No:  4
1172.0700478007611


In [12]:
def cross_val_testing(model, X, y, cv=5):
    kf = KFold(cv, shuffle=True, random_state=42).get_n_splits(X)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf))
    return rmse

In [13]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha = 0.0005, random_state=1))

In [14]:
cross_val_testing(lasso, X, y)



array([1152.97457055, 1123.13782907, 1115.30252062, 1129.83912762,
       1143.09280738])

In [15]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [16]:
cross_val_testing(ENet, X, y)



array([1152.97757281, 1123.12850767, 1115.28299232, 1129.82678459,
       1143.08802658])

In [17]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [18]:
cross_val_testing(KRR, X, y)

array([1101.15310265, 1083.56270637, 1071.16282472, 1077.45539295,
       1087.26700641])

### 3. Solution Submissions

In [19]:
def predict_sales(X, y, X_test, model, filename):
    df_test = pd.read_csv('data/test.csv')
    model.fit(X, y)
    ypred = model.predict(X_test)
    df_test['Item_Outlet_Sales'] = ypred
    df_test[['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales']].to_csv('out/'+ filename +'.csv',index=False)

In [20]:
X_test = data_test.copy()

predict_sales(X, y,X_test, ENet, 'enet')
predict_sales(X, y,X_test, lasso, 'lasso')
predict_sales(X, y,X_test, KRR, 'krr')



### 4. Ensembling 

In [21]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [22]:
cross_val_testing(model_xgb, X, y)

array([1158.45493111, 1154.43489425, 1140.94498556, 1131.66905941,
       1144.23008648])

In [23]:
predict_sales(X, y,X_test, model_xgb, 'xgb')

In [27]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [28]:
cross_val_testing(model_lgb, X, y)

array([1110.00281096, 1085.94335838, 1080.53821066, 1080.45774021,
       1090.02085397])

In [29]:
predict_sales(X, y,X_test, model_lgb, 'lgb')