In [None]:
!pip install numpy pandas matplotlib seaborn --quiet

In [None]:
!pip install jovian opendatasets xgboost graphviz lightgbm scikit-learn xgboost lightgbm --upgrade --quiet

In [None]:
import os
import opendatasets as od
import pandas as pd
pd.set_option("display.max_columns",120)
pd.set_option("display.max_rows",120)

In [None]:
# Download the Rossmann competition files into ./rossmann-store-sales.
od.download('https://www.kaggle.com/c/rossmann-store-sales')

In [None]:
# List what was downloaded.
os.listdir('rossmann-store-sales')

In [None]:
# Load the training rows, the test rows and the per-store metadata.
ross_df = pd.read_csv('./rossmann-store-sales/train.csv',low_memory=False)
test_df = pd.read_csv('./rossmann-store-sales/test.csv')
store_df = pd.read_csv('./rossmann-store-sales/store.csv')

In [None]:
ross_df

In [None]:
test_df

In [None]:
store_df

In [None]:
# Left-join the store metadata onto every train/test row via the 'Store' key,
# so each row keeps its place even if a store had no metadata entry.
merged_df = ross_df.merge(store_df,how='left',on='Store')
merged_test_df = test_df.merge(store_df,how='left',on='Store')

In [None]:
merged_df

In [None]:
merged_test_df

In [None]:
# Column dtypes and non-null counts of the merged training frame.
merged_df.info()

In [None]:
def split_date(df):
    """Parse ``Date`` and add calendar-part columns to ``df`` in place.

    ``df['Date']`` is converted with ``pd.to_datetime``; the columns ``Year``,
    ``Month``, ``Day`` and ``WeekOfYear`` (ISO week number) are then added.
    The frame is modified in place and nothing is returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    stamps = df['Date'].dt
    calendar_parts = (
        ('Year', stamps.year),
        ('Month', stamps.month),
        ('Day', stamps.day),
        ('WeekOfYear', stamps.isocalendar().week),
    )
    for column, values in calendar_parts:
        df[column] = values

In [None]:
# Add Year/Month/Day/WeekOfYear to both frames (split_date modifies them in place).
split_date(merged_df)
split_date(merged_test_df)

In [None]:
merged_df

In [None]:
# Distribution of Sales on rows where the store is marked closed (Open == 0).
merged_df[merged_df.Open==0].Sales.value_counts()

In [None]:
def comp_months(df):
    """Add ``CompetitionOpen``: months since the competitor opened, floored at 0.

    Computed from ``Year``/``Month`` and ``CompetitionOpenSinceYear``/
    ``CompetitionOpenSinceMonth``. Negative values become 0 and missing values
    are filled with 0. The frame is modified in place; nothing is returned.
    """
    def _floor_at_zero(months):
        # A NaN compares False here and is passed through for fillna below.
        if months < 0:
            return 0
        return months

    years_elapsed = df['Year'] - df['CompetitionOpenSinceYear']
    months_elapsed = df['Month'] - df['CompetitionOpenSinceMonth']
    df['CompetitionOpen'] = 12 * years_elapsed + months_elapsed
    df['CompetitionOpen'] = df['CompetitionOpen'].map(_floor_at_zero).fillna(0)

In [None]:
# Add the CompetitionOpen column to both frames (comp_months modifies them in place).
comp_months(merged_df)
comp_months(merged_test_df)

In [None]:
merged_df

In [None]:
# Spot-check the derived column against its source columns on 20 random rows.
merged_df[['Date','CompetitionDistance','CompetitionOpenSinceYear','CompetitionOpenSinceMonth','CompetitionOpen']].sample(20)

In [None]:
def check_promo_month(df):
    """Return 1 when a row's month is one of its store's Promo2 start months, else 0.

    Intended to be applied row-wise, e.g.
    ``frame.apply(check_promo_month, axis=1)``. The parameter keeps the name
    ``df`` for backward compatibility, but it receives a single row (any
    object indexable by column name).

    Args:
        df: Row providing ``PromoInterval`` (comma-separated month
            abbreviations, or NaN when the store has none), ``Promo2Open``
            and ``Month`` (1-12).

    Returns:
        int: 1 if ``Promo2Open`` is non-zero and the abbreviation of ``Month``
        appears in ``PromoInterval``; otherwise 0.

    Raises:
        KeyError: if one of the required columns is missing from the row.
    """
    row = df
    # Abbreviations must match the spelling used in the PromoInterval column
    # of the Rossmann data (e.g. 'Mar,Jun,Sept,Dec'), hence 'Mar' and 'Sept'.
    month2str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}

    promo_interval = row['PromoInterval']
    promo_open = row['Promo2Open']

    # A missing interval arrives as NaN (a float), not as a string.
    if not isinstance(promo_interval, str):
        return 0
    # The store has not been in Promo2 for any time yet.
    if pd.isna(promo_open) or not promo_open:
        return 0

    months = [name.strip() for name in promo_interval.split(',')]
    return int(month2str.get(row['Month']) in months)
def promo_cols(df):
    """Add the Promo2-derived columns ``Promo2Open`` and ``IsPromo2Month`` in place.

    ``Promo2Open`` is the number of months since the store joined Promo2
    (whole years times 12, plus the week difference scaled by 7/30.5), with
    negative values set to 0 and missing values filled with 0.
    ``IsPromo2Month`` is the row-wise result of ``check_promo_month`` multiplied
    by the ``Promo2`` column. Nothing is returned.
    """
    def _floor_at_zero(months):
        # A NaN compares False here and is passed through for fillna below.
        if months < 0:
            return 0
        return months

    years_in_promo = df['Year'] - df['Promo2SinceYear']
    weeks_in_promo = df['WeekOfYear'] - df['Promo2SinceWeek']
    df['Promo2Open'] = 12 * years_in_promo + weeks_in_promo * 7 / 30.5
    df['Promo2Open'] = df['Promo2Open'].map(_floor_at_zero).fillna(0)

    # Promo2Open must already be on the frame: check_promo_month reads it per row.
    month_flags = df.apply(check_promo_month, axis=1)
    df['IsPromo2Month'] = month_flags * df['Promo2']

In [None]:
# Add Promo2Open and IsPromo2Month to both frames (promo_cols modifies them in place).
promo_cols(merged_df)
promo_cols(merged_test_df)

In [None]:
# Spot-check the derived promo columns against their source columns.
merged_df[['Date', 'Promo2', 'Promo2SinceYear', 'Promo2SinceWeek', 'PromoInterval', 'Promo2Open', 'IsPromo2Month']].sample(20)

In [None]:
merged_df.columns

In [None]:
# Columns used as model inputs, and the column to predict.
input_cols = ['Store', 'DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 
              'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpen', 
              'Day', 'Month', 'Year', 'WeekOfYear',  'Promo2', 
              'Promo2Open', 'IsPromo2Month']
target_col = 'Sales'

In [None]:
# Work on copies so the preprocessing below does not modify the merged frames.
inputs = merged_df[input_cols].copy()
targets = merged_df[target_col].copy()

In [None]:
test_inputs = merged_test_df[input_cols].copy()

In [None]:
# numeric_cols are min-max scaled below; categorical_cols are one-hot encoded.
numeric_cols = ['Store', 'Promo', 'SchoolHoliday', 
              'CompetitionDistance', 'CompetitionOpen', 'Promo2', 'Promo2Open', 'IsPromo2Month',
              'Day', 'Month', 'Year', 'WeekOfYear',  ]
categorical_cols = ['DayOfWeek', 'StateHoliday', 'StoreType', 'Assortment']

In [None]:
# Count missing values per numeric column before imputing.
inputs[numeric_cols].isna().sum()

In [None]:
test_inputs[numeric_cols].isna().sum()

In [None]:
# Largest competition distance seen in the training inputs; used as the fill value below.
max_distance = inputs.CompetitionDistance.max()

In [None]:
# Fill missing CompetitionDistance in both frames with the training maximum.
# NOTE(review): infer_objects(copy=False) is applied to the training frame only.
inputs['CompetitionDistance']=inputs['CompetitionDistance'].fillna(max_distance).infer_objects(copy=False)
test_inputs['CompetitionDistance']=test_inputs['CompetitionDistance'].fillna(max_distance)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Fit the scaler on the training inputs only.
scaler = MinMaxScaler().fit(inputs[numeric_cols])

In [None]:
# Apply the training-fitted scaling to both frames.
inputs[numeric_cols] = scaler.transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fit the encoder on the training inputs only; handle_unknown='ignore' lets
# transform() accept categories that were not present when fitting.
encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore').fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))

In [None]:
# Add the one-hot columns to both frames.
inputs[encoded_cols] = encoder.transform(inputs[categorical_cols])
test_inputs[encoded_cols] = encoder.transform(test_inputs[categorical_cols])

In [None]:
# Final model matrices: scaled numeric columns followed by the one-hot columns.
X = inputs[numeric_cols+encoded_cols]
X_test = test_inputs[numeric_cols+encoded_cols]

In [None]:
X_test.info()

In [None]:
from xgboost import XGBRegressor

In [None]:
# Small baseline: 20 trees of depth 4, fixed seed, n_jobs=-1.
model = XGBRegressor(random_state=42,n_jobs=-1,n_estimators=20,max_depth=4)

In [None]:
%%time
# Fit on the full training matrix (no hold-out at this point).
model.fit(X,targets)

In [None]:
# Predictions on the same rows the model was trained on.
preds = model.predict(X)

In [None]:
preds

In [None]:
import numpy as np

In [None]:
from sklearn.metrics import mean_squared_error
def rmse(a,b):
    """Return the root-mean-squared error between ``a`` and ``b``.

    Both arguments are passed straight to ``mean_squared_error``; the square
    root of its result is returned.
    """
    return np.sqrt(mean_squared_error(a, b))

In [None]:
# RMSE of the predictions held in preds against the training targets.
rmse(preds,targets)

In [None]:
import matplotlib.pyplot as plt
from xgboost import plot_tree
from matplotlib.pylab import rcParams
%matplotlib inline
# Large default figure size so the tree plot below is legible.
rcParams['figure.figsize'] = 30,38

In [None]:
# Draw a tree of the fitted model, laid out left-to-right.
plot_tree(model, rankdir='LR')

In [None]:
# Text dump of the booster's trees, one entry per tree.
trees = model.get_booster().get_dump()

In [None]:
len(trees)

In [None]:
print(trees[0])

In [None]:
# Pair each input column with the model's importance score, highest first.
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance',ascending=False)

In [None]:
importance_df.head(10)

In [None]:
import seaborn as sns
# Bar chart of the ten most important features (explicit figsize overrides the rcParams default).
plt.figure(figsize=(10,6))
plt.title('Feature Importance')
sns.barplot(data=importance_df.head(10),x='importance',y='feature')
plt.show()

In [None]:
from sklearn.model_selection import KFold

In [None]:
def train_and_evaluate(X_train,train_targets,X_val,val_targets,**params):
    """Fit an ``XGBRegressor`` and score it on the training and validation sets.

    Args:
        X_train, train_targets: inputs and targets the model is fitted on.
        X_val, val_targets: inputs and targets used only for scoring.
        **params: extra keyword arguments forwarded to ``XGBRegressor``
            (``random_state=42`` and ``n_jobs=-1`` are always set).

    Returns:
        tuple: ``(model, train_rmse, val_rmse)``.
    """
    regressor = XGBRegressor(random_state=42, n_jobs=-1, **params)
    regressor.fit(X_train, train_targets)

    def _score(features, actual):
        # RMSE of the fitted model's predictions on one data split.
        return rmse(regressor.predict(features), actual)

    fit_error = _score(X_train, train_targets)
    holdout_error = _score(X_val, val_targets)
    return regressor, fit_error, holdout_error

In [None]:
# 5-fold splitter with default settings.
kfold = KFold(n_splits=5)

In [None]:
# Train one model per fold and keep them all for ensembling.
# Note: this loop runs at notebook top level, so model, X_train, train_targets,
# X_val and val_targets remain bound to the LAST fold's values after it ends.
models=[]
for train_idxs,val_idxs in kfold.split(X):
    # kfold.split yields positional indices, hence .iloc.
    X_train,train_targets=X.iloc[train_idxs],targets.iloc[train_idxs]
    X_val,val_targets = X.iloc[val_idxs],targets.iloc[val_idxs]
    model, train_rmse , val_rmse = train_and_evaluate(X_train,
                                                     train_targets,
                                                     X_val,
                                                     val_targets,
                                                     max_depth=4,
                                                     n_estimators=20)
    models.append(model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse,val_rmse))

In [None]:
import numpy as np
def predict_avg(models,inputs):
    """Average the predictions of several fitted models.

    Args:
        models: iterable of objects exposing ``predict(inputs)``.
        inputs: feature matrix passed unchanged to every model.

    Returns:
        The element-wise mean (over models) of the individual predictions.
    """
    per_model_preds = []
    for fitted in models:
        per_model_preds.append(fitted.predict(inputs))
    return np.mean(per_model_preds, axis=0)

In [None]:
# Average the per-fold models' predictions on the full training matrix.
preds = predict_avg(models,X)

In [None]:
preds

In [None]:
def test_params_kfold(n_splits,**params):
    """Cross-validate one XGBoost configuration and return the per-fold models.

    Splits the notebook-level ``X``/``targets`` into ``n_splits`` folds, trains
    one model per fold with ``train_and_evaluate`` and prints the RMSE averaged
    over all folds.

    Args:
        n_splits: number of folds passed to ``KFold``.
        **params: keyword arguments forwarded to ``train_and_evaluate`` (and
            from there to ``XGBRegressor``).

    Returns:
        list: the fitted models, one per fold, in fold order.
    """
    train_rmses,val_rmses,models = [],[],[]
    kfold = KFold(n_splits)
    for train_idxs, val_idxs in kfold.split(X):
        # kfold.split yields positional indices, hence .iloc.
        X_train,train_targets = X.iloc[train_idxs],targets.iloc[train_idxs]
        X_val,val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]
        model, train_rmse, val_rmse = train_and_evaluate(X_train,train_targets,X_val,val_targets,**params)
        models.append(model)
        train_rmses.append(train_rmse)
        val_rmses.append(val_rmse)
    # Report and return only once every fold has been trained; doing this inside
    # the loop would stop after the first fold.
    print('Train RMSE: {}, Validation RMSE: {}'.format(np.mean(train_rmses),np.mean(val_rmses)))
    return models

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def test_params(**params):
    """Fit one XGBoost configuration and print its training and validation RMSE.

    Delegates to ``train_and_evaluate`` using the notebook-level ``X_train``,
    ``train_targets``, ``X_val`` and ``val_targets``.

    NOTE(review): in the cells visible here those four names are only assigned
    inside the K-fold loop, so they hold the last fold's split; the
    ``train_test_split`` import just above this function is never called.
    Confirm that this is the intended hold-out set.

    Args:
        **params: keyword arguments forwarded to ``XGBRegressor``.
    """
    _, train_rmse, val_rmse = train_and_evaluate(
        X_train, train_targets, X_val, val_targets, **params
    )
    print('Train RMSE: {}, Validation RMSE {}'.format(train_rmse,val_rmse))

In [None]:
# Vary the number of trees (all other settings left at their defaults).
test_params(n_estimators=10)

In [None]:
test_params(n_estimators=30)

In [None]:
test_params(n_estimators=100)

In [None]:
test_params(n_estimators=240)

In [None]:
# Vary the maximum tree depth.
test_params(max_depth=2)

In [None]:
test_params(max_depth=5)

In [None]:
test_params(max_depth=10)

In [None]:
# Vary the learning rate at a fixed 50 trees.
test_params(n_estimators=50, learning_rate=0.01)

In [None]:
test_params(n_estimators=50, learning_rate=0.1)

In [None]:
test_params(n_estimators=50, learning_rate=0.3)

In [None]:
test_params(n_estimators=50, learning_rate=0.9)

In [None]:
test_params(n_estimators=50, learning_rate=0.99)

In [None]:
# Try the linear booster instead of trees.
test_params(booster='gblinear')

In [None]:
# Final configuration: 1000 trees of depth 10, learning rate 0.2, with row
# subsampling (0.9) and per-tree column subsampling (0.7).
model = XGBRegressor(n_jobs=-1, random_state=42, n_estimators=1000, 
                     learning_rate=0.2, max_depth=10, subsample=0.9, 
                     colsample_bytree=0.7)

In [None]:
%%time
# Train on the full training matrix; no rows are held out here.
model.fit(X,targets)

In [None]:
# Predict sales for the prepared test matrix.
test_preds = model.predict(X_test)

In [None]:
# Start from the competition's submission template, which has one row per test
# Id and therefore the same length as test_preds. store.csv (one row per store)
# cannot hold the per-row predictions assigned in the next cell.
submission_df = pd.read_csv('./rossmann-store-sales/sample_submission.csv')

In [None]:
# Store the model's predictions in the Sales column.
submission_df['Sales'] = test_preds

In [None]:
# How many test rows have no Open flag.
test_df.Open.isna().sum()

In [None]:
# Multiply by Open so rows with Open == 0 get a prediction of 0; rows with a
# missing Open flag are treated as open (filled with 1).
submission_df['Sales'] = submission_df['Sales'] * test_df.Open.fillna(1.)

In [None]:
submission_df

In [None]:
# Write the submission file without the DataFrame index.
submission_df.to_csv('submission.csv', index=None)

In [None]:
from IPython.display import FileLink

In [None]:
# Render a link to the written file.
FileLink('submission.csv')