# Imports


In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_squared_log_error,
)


# The Functions


Here are the functions we created in the previous notebook. We will be creating some more functions to make our life easier.


In [80]:
def merge_dataframes(file, log_sale=False):
    """
    Takes the file name of the dataframe merges the train and test dataframe into one dataframe
    also does some transformations to the dataframe

    Parameters
    ----------
    file : str
        The file name of the dataframe
    log_sale : bool
        Whether to log the sale price or not

    Returns
    -------
    (dataframe, dataframe, dataframe), dataframe
        ((train_data, test_data, y_train), test_indices)
    """
    train_data = pd.read_csv(os.path.join("data", "train_large", file)).set_index("id")
    test_data = pd.read_csv(os.path.join("data", "test_large", file)).set_index("id")

    train_indices = train_data.index
    test_indices = test_data.index
    y_train = train_data.sales.values

    merged_df = pd.concat([train_data, test_data], axis=0)
    merged_df = merged_df.drop(["date", "sales", "store_nbr"], axis=1)
    merged_indices = merged_df.index

    one_hot_cols = merged_df.columns[
        (merged_df.dtypes == "object") | (merged_df.dtypes == "bool")
    ]
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="error", drop="first")
    column_transformer = ColumnTransformer(
        [
            ("categorical_cols", one_hot_encoder, one_hot_cols),
        ],
        remainder="passthrough",
    )
    if log_sale:
        y_train = np.log(y_train + 1)

    merged_df = column_transformer.fit_transform(merged_df)
    merged_df = pd.DataFrame(merged_df, index=merged_indices)
    train_data = merged_df.loc[train_indices]
    test_data = merged_df.loc[test_indices]
    return (train_data, test_data, y_train), test_indices


In [6]:
def create_train_test(X, y, ratio=0.8):
    """
    Creates the train test split

    Parameters
    ----------
    X : dataframe
        The dataframe to split
    y : dataframe
        The target dataframe
    ratio : float
        The ratio of the train data to the test data

    Returns
    -------
    (dataframe, dataframe, dataframe, dataframe)
        (X_train, y_train, X_test, y_test)
    """
    X = X.values
    X_train = X[:int(len(X) * ratio)]
    y_train = y[:int(len(y) * ratio)]
    X_test = X[int(len(X) * ratio):]
    y_test = y[int(len(y) * ratio):]
    return X_train, y_train, X_test, y_test
    

In [7]:
def evaluate(
    model, data=None, on="test", log_sale=False, return_preds=False, no_print=False
):
    """
    Evaluates the model on the test data or the train data

    Parameters
    ----------
    model : sklearn model
        The model to evaluate
    on : str
        The data to evaluate on
    log_sale : bool
        Whether to log the sale price or not
    return_preds : bool
        Whether to return the predictions or not

    Returns
    -------
    (dict, dataframe)|(dataframe)
        (metrics, y_pred) or y_pred
    """
    if data is None:
        if on == "test":
            X = X_test
            y = y_test
        elif on == "train":
            X = X_train
            y = y_train
    else:
        X = data[0]
        y = data[1]
    # print("Predicting")
    preds = model.predict(X)
    if log_sale:
        # print("Exponentiating")
        preds = np.exp(preds) - 1
        y = np.exp(y) - 1
    # print("Calculating Metrics")
    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)
    try:
        msle = mean_squared_log_error(y, preds)
    except:
        preds = np.abs(preds)
        msle = mean_squared_log_error(y, preds)
    rmse = np.sqrt(msle)
    r2_value = r2_score(y, preds)
    if not no_print:
        print("MSE:", mse)
        print("MAE:", mae)
        print("MSLE:", msle)
        print("RMSE:", rmse)
        print("R2:", r2_value)

    metrics = {"mse": mse, "mae": mae, "msle": msle, "rmse": rmse, "r2": r2_value}
    if return_preds:
        return metrics, preds
    else:
        return metrics


In [8]:
def create_pred_dataframe(X, model, id=None, log_sale=False):
    """
    Creates a dataframe of the predictions

    Parameters
    ----------
    X : dataframe
        The dataframe to predict on
    model : sklearn model
        The model to predict with
    id : str
        The id of the dataframe
    log_sale : bool
        Whether to log the sale price or not

    Returns
    -------
    dataframe
        The dataframe of the predictions
    """
    preds = model.predict(X)
    if log_sale:
        preds = np.exp(preds) - 1
    if id is None:
        id = np.arange(len(preds))
    df = pd.DataFrame(preds, columns=["sales"])
    df = pd.DataFrame(id, columns=["id"])
    df["sales"] = preds
    df.reset_index(drop=True, inplace=True)
    return df


In [115]:
performace_on_train = {}
performace_on_test = {}


In [87]:
def train_and_evaluate(
    model, file, log_sale=False, return_preds=True, on="test", no_print=False
):
    (train_data, test_data, y), test_indices = merge_dataframes(file)
    X_train, y_train, X_test, y_test = create_train_test(train_data, y)
    model.fit(X_train, y_train)
    if on == "test":
        data = (X_test, y_test)
    elif on == "train":
        data = (X_train, y_train)
    if return_preds:
        metrics, preds = evaluate(
            model,
            data=data,
            on=on,
            log_sale=log_sale,
            return_preds=return_preds,
            no_print=no_print,
        )
    else:
        metrics = evaluate(
            model,
            data=data,
            on=on,
            log_sale=log_sale,
            return_preds=return_preds,
            no_print=no_print,
        )
        preds = None
    return metrics, preds


def train_evaluate_and_save(model_name, files, on="test", **kwargs):
    i = 0
    for file in tqdm.tqdm(files, desc="Training and Evaluating"):
        name = model_name + "_" + str(i)
        # print("Performance for:", file)
        metrics, _ = train_and_evaluate(file=file, **kwargs)
        # print("\n")
        if on == "test":
            performace_on_test[name] = metrics
        else:
            performace_on_train[name] = metrics
        i += 1


In [29]:
def train_and_predict(model, file):
    (train_data, test_data, y), test_indices = merge_dataframes(file)
    X_train, y_train, X_test, y_test = create_train_test(train_data, y)
    model.fit(X_train, y_train)
    df = create_pred_dataframe(test_data, model, id=test_indices, log_sale=False)
    return df


# Models


## How To Proceed


We have 88 dataframes. This means that searching for the best model for each model might not possible. That's why we'll be using 3 different datasets to train and evaluate models. The model performing the best on these three datasets will be used as a final model. Let's get these files.


In [83]:
random.seed(42)
TRAIN_DIR = "data/train_large"
TEST_DIR = "data/test_large"

all_files = os.listdir(TRAIN_DIR)
sample_files = random.sample(all_files, 15)
sample_files


['D_HOME AND KITCHEN I.csv',
 'A_PERSONAL CARE.csv',
 'I_HOME APPLIANCES.csv',
 'H_LADIESWEAR.csv',
 'G_PRODUCE.csv',
 'E_EGGS.csv',
 'D_BREAD_BAKERY.csv',
 'Q_PRODUCE.csv',
 'C_MAGAZINES.csv',
 'N_BEVERAGES.csv',
 'A_SEAFOOD.csv',
 'A_PRODUCE.csv',
 'C_PREPARED FOODS.csv',
 'G_PERSONAL CARE.csv',
 'H_CLEANING.csv']

## Base Model


The linear model with default parameters will be our base model.


In [61]:
lr = LinearRegression()


In [25]:
train_evaluate_and_save("lr", all_files, on="test", model=lr, no_print=True)

In [26]:
pd.DataFrame(performace_on_test).T.describe()

Unnamed: 0,mse,mae,msle,rmse,r2
count,561.0,561.0,561.0,561.0,561.0
mean,327484.9,170.977357,1.019108,0.804658,-0.119732
std,1459290.0,418.052615,2.133972,0.610162,1.082579
min,0.0,0.0,0.0,0.0,-22.008952
25%,22.40923,3.565149,0.258556,0.508484,-0.150382
50%,1009.774,21.355437,0.387161,0.622223,0.011335
75%,27866.49,114.520242,0.712604,0.844159,0.133981
max,16310040.0,3504.420265,16.108087,4.013488,1.0


In [88]:
rf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=42)

In [95]:
train_evaluate_and_save("lr", sample_files, on="test", model=lr, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [00:00<00:00, 17.59it/s]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,1152706.0,448.869352,1.903328,0.995147,-0.086479
std,2857261.0,808.2832,4.272048,0.989053,0.446348
min,2.185623,1.147306,0.176412,0.420014,-1.142586
25%,236.0164,12.64748,0.328302,0.572971,-0.005841
50%,2982.713,42.155458,0.393245,0.627093,0.044532
75%,194493.2,304.288173,0.627764,0.792159,0.122706
max,10565740.0,2665.786013,16.108087,4.013488,0.478779


In [90]:
train_evaluate_and_save("rf", sample_files, on="test", model=rf, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [02:24<00:00,  9.65s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,535626.8,282.952732,1.403092,0.888038,-0.155364
std,1463939.0,509.322561,3.220538,0.811401,1.490958
min,2.230267,1.223556,0.181099,0.425557,-5.457825
25%,488.1806,13.491372,0.245891,0.495838,0.055708
50%,2811.786,40.098166,0.376816,0.613853,0.142371
75%,167077.4,262.483179,0.574146,0.756887,0.327312
max,5661678.0,1872.816936,12.749317,3.570619,0.684669


In [104]:
df = train_and_predict(rf, all_files[0])
for file in tqdm.tqdm(all_files[1:], desc="Predicting..."):
    df_temp = train_and_predict(rf, file)
    df = pd.concat([df, df_temp])

Predicting...: 100%|██████████| 560/560 [1:22:25<00:00,  8.83s/it]


In [105]:
df


Unnamed: 0,id,sales
0,3001119,3.990006
1,3001647,3.990006
2,3001713,3.990006
3,3001746,3.990006
4,3001812,3.990006
...,...,...
11,3022040,74.796591
12,3023822,61.108806
13,3025604,61.092706
14,3027386,62.044200


In [108]:
df_ordered = df.sort_values(by="id")
df_ordered


Unnamed: 0,id,sales
0,3000888,6.061122
0,3000889,0.088900
0,3000890,5.591964
0,3000891,2525.394851
0,3000892,0.000000
...,...,...
95,3029395,260.485592
95,3029396,61.308257
95,3029397,962.511374
95,3029398,28.548445


In [97]:
df_ordered["sales"] = df_ordered["sales"].apply(lambda x: np.abs(x))

In [109]:
df_ordered.describe()


Unnamed: 0,id,sales
count,28512.0,28512.0
mean,3015144.0,444.41142
std,8230.85,1161.72534
min,3000888.0,0.0
25%,3008016.0,4.007548
50%,3015144.0,26.670612
75%,3022271.0,273.915931
max,3029399.0,13029.126197


In [54]:
sample_file = pd.read_csv(r"data\sample_submission.csv")
sample_file


Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0
...,...,...
28507,3029395,0.0
28508,3029396,0.0
28509,3029397,0.0
28510,3029398,0.0


In [110]:
df_ordered.to_csv("data/submissions/submission_4.csv", index=False)

In [111]:
!kaggle competitions submit -c store-sales-time-series-forecasting -f data/submissions/submission_4.csv -m "Submission 4"

Successfully submitted to Store Sales - Time Series Forecasting


  0%|          | 0.00/747k [00:00<?, ?B/s]
  1%|          | 8.00k/747k [00:00<00:17, 43.4kB/s]
 88%|████████▊ | 656k/747k [00:00<00:01, 61.8kB/s] 
100%|██████████| 747k/747k [00:06<00:00, 126kB/s] 





## SVM


In [113]:
svr = SVR()


In [114]:
train_evaluate_and_save("svr", sample_files, on="test", model=svr, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [00:27<00:00,  1.81s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,3737842.0,714.770034,3.558172,1.311001,-0.814647
std,11706710.0,1513.37978,8.601777,1.403865,0.925058
min,2.266035,1.123908,0.193715,0.440131,-2.497527
25%,376.9374,13.928379,0.398227,0.630786,-1.574476
50%,3347.95,43.726959,0.582753,0.763383,-0.312428
75%,402318.9,460.761219,2.124505,1.395359,-0.089637
max,45608630.0,5711.292509,33.813064,5.8149,0.000406


In [116]:
xgbr = XGBRegressor(n_estimators=1000, max_depth=15, random_state=42)

In [117]:
train_evaluate_and_save("xgbr", sample_files, on="test", model=xgbr, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [01:08<00:00,  4.55s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,607925.0,304.543247,1.414998,0.939534,-1.068478
std,1571874.0,524.371921,2.983166,0.755178,4.27131
min,2.552302,1.286108,0.240963,0.490879,-16.455198
25%,566.241,15.045171,0.357844,0.595466,-0.213539
50%,4606.172,47.918513,0.466449,0.682971,0.022421
75%,205621.6,296.464917,0.693483,0.832272,0.139652
max,6023477.0,1873.34991,11.97016,3.459792,0.58818


## Random Forest


In [141]:
rf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=42)


In [142]:
train_evaluate_and_save("rf", sample_files, on="test", model=rf)


Performance for: Z_ELECTRONICS.csv
MSE: 98.1055837355651
MAE: 6.3263198374720115
MSLE: 1.5462805766316052
RMSE: 1.2434953062362581
R2: 0.1527268110599328


Performance for: B_MISC.csv
MSE: 99898.88582289574
MAE: 134.62437606350883
MSLE: 5.475320630420998
RMSE: 2.3399403048840792
R2: 0.6876344557356231


Performance for: A_GROCERY.csv
MSE: 1734977.9627152565
MAE: 751.8587089195961
MSLE: 4.47068627915171
RMSE: 2.1143997444077858
R2: 0.5082597263533852




In [143]:
train_evaluate_and_save("rf", sample_files, on="train", model=rf)


Performance for: Z_ELECTRONICS.csv
MSE: 98.1055837355651
MAE: 6.3263198374720115
MSLE: 1.5462805766316052
RMSE: 1.2434953062362581
R2: 0.1527268110599328


Performance for: B_MISC.csv
MSE: 99898.88582289574
MAE: 134.62437606350883
MSLE: 5.475320630420998
RMSE: 2.3399403048840792
R2: 0.6876344557356231


Performance for: A_GROCERY.csv
MSE: 1734977.9627152565
MAE: 751.8587089195961
MSLE: 4.47068627915171
RMSE: 2.1143997444077858
R2: 0.5082597263533852


