# Imports


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import random
import tqdm

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_squared_log_error,
)


# Downloading Files

## The *small* datasets

In [2]:
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/store_sales_small.zip

--2022-06-26 10:16:55--  https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/store_sales_small.zip
Resolving h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)... 52.219.160.50
Connecting to h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)|52.219.160.50|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 197658254 (189M) [application/zip]
Saving to: ‘store_sales_small.zip’


2022-06-26 10:17:13 (11.1 MB/s) - ‘store_sales_small.zip’ saved [197658254/197658254]



In [3]:
!unzip store_sales_small.zip

Archive:  store_sales_small.zip
 extracting: data/small/test/A_BEVERAGE.csv  
 extracting: data/small/test/A_ELECTRONICS.csv  
 extracting: data/small/test/A_FOOD.csv  
 extracting: data/small/test/A_GROCERY.csv  
 extracting: data/small/test/A_HOUSEHOLD.csv  
 extracting: data/small/test/A_LADIES.csv  
 extracting: data/small/test/A_MISC.csv  
 extracting: data/small/test/A_STATIONARY.csv  
 extracting: data/small/test/B_BEVERAGE.csv  
 extracting: data/small/test/B_ELECTRONICS.csv  
 extracting: data/small/test/B_FOOD.csv  
 extracting: data/small/test/B_GROCERY.csv  
 extracting: data/small/test/B_HOUSEHOLD.csv  
 extracting: data/small/test/B_LADIES.csv  
 extracting: data/small/test/B_MISC.csv  
 extracting: data/small/test/B_STATIONARY.csv  
 extracting: data/small/test/C_BEVERAGE.csv  
 extracting: data/small/test/C_ELECTRONICS.csv  
 extracting: data/small/test/C_FOOD.csv  
 extracting: data/small/test/C_GROCERY.csv  
 extracting: data/small/test/C_HOUSEHOLD.csv  
 extracting: 

## The *large* Datasets

In [7]:
!wget https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/store_sales_large.zip

--2022-06-26 10:18:09--  https://h31416-ml-datasets.s3.ap-south-1.amazonaws.com/store_sales/store_sales_large.zip
Resolving h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)... 52.219.156.158
Connecting to h31416-ml-datasets.s3.ap-south-1.amazonaws.com (h31416-ml-datasets.s3.ap-south-1.amazonaws.com)|52.219.156.158|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 188496690 (180M) [application/zip]
Saving to: ‘store_sales_large.zip’


2022-06-26 10:18:26 (11.1 MB/s) - ‘store_sales_large.zip’ saved [188496690/188496690]



In [8]:
!unzip store_sales_large.zip

Archive:  store_sales_large.zip
 extracting: data/large/test_large/A_AUTOMOTIVE.csv  
 extracting: data/large/test_large/A_BABY CARE.csv  
 extracting: data/large/test_large/A_BEAUTY.csv  
 extracting: data/large/test_large/A_BEVERAGES.csv  
 extracting: data/large/test_large/A_BOOKS.csv  
 extracting: data/large/test_large/A_BREAD_BAKERY.csv  
 extracting: data/large/test_large/A_CELEBRATION.csv  
 extracting: data/large/test_large/A_CLEANING.csv  
 extracting: data/large/test_large/A_DAIRY.csv  
 extracting: data/large/test_large/A_DELI.csv  
 extracting: data/large/test_large/A_EGGS.csv  
 extracting: data/large/test_large/A_FROZEN FOODS.csv  
 extracting: data/large/test_large/A_GROCERY I.csv  
 extracting: data/large/test_large/A_GROCERY II.csv  
 extracting: data/large/test_large/A_HARDWARE.csv  
 extracting: data/large/test_large/A_HOME AND KITCHEN I.csv  
 extracting: data/large/test_large/A_HOME AND KITCHEN II.csv  
 extracting: data/large/test_large/A_HOME APPLIANCES.csv  
 e

In [9]:
!ls data

large  small


# The Functions


Here are the functions we created in the previous notebook. We will be creating some more functions to make our life easier.


In [49]:
def merge_dataframes(file, log_sale=False, large=True):
    """
    Takes the file name of the dataframe merges the train and test dataframe into one dataframe
    also does some transformations to the dataframe

    Parameters
    ----------
    file : str
        The file name of the dataframe
    log_sale : bool
        Whether to log the sale price or not

    Returns
    -------
    (dataframe, dataframe, dataframe), dataframe
        ((train_data, test_data, y_train), test_indices)
    """
    if large:
        train_data_dir = os.path.join("data", "large", "train_large")
        test_data_dir = os.path.join("data", "large", "test_large")
    else:
        train_data_dir = os.path.join("data", "small", "train")
        test_data_dir = os.path.join("data", "small", "test")
    train_data = pd.read_csv(os.path.join(train_data_dir, file)).set_index("id")
    test_data = pd.read_csv(os.path.join(test_data_dir, file)).set_index("id")

    train_indices = train_data.index
    test_indices = test_data.index
    y_train = train_data.sales.values

    merged_df = pd.concat([train_data, test_data], axis=0)
    if large:
        cols_to_drop = ["date", "sales", "store_nbr",]
    else:
        cols_to_drop = ["date", "sales"]
    merged_df = merged_df.drop(cols_to_drop, axis=1)
    merged_indices = merged_df.index

    one_hot_cols = merged_df.columns[
        (merged_df.dtypes == "object") | (merged_df.dtypes == "bool")
    ]
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="error", drop="first")
    column_transformer = ColumnTransformer(
        [
            ("categorical_cols", one_hot_encoder, one_hot_cols),
        ],
        remainder="passthrough",
    )
    if log_sale:
        y_train = np.log(y_train + 1)

    merged_df = column_transformer.fit_transform(merged_df)
    merged_df = pd.DataFrame(merged_df, index=merged_indices)
    train_data = merged_df.loc[train_indices]
    test_data = merged_df.loc[test_indices]
    return (train_data, test_data, y_train), test_indices


In [13]:
def create_train_test(X, y, ratio=0.8):
    """
    Creates the train test split

    Parameters
    ----------
    X : dataframe
        The dataframe to split
    y : dataframe
        The target dataframe
    ratio : float
        The ratio of the train data to the test data

    Returns
    -------
    (dataframe, dataframe, dataframe, dataframe)
        (X_train, y_train, X_test, y_test)
    """
    X = X.values
    X_train = X[:int(len(X) * ratio)]
    y_train = y[:int(len(y) * ratio)]
    X_test = X[int(len(X) * ratio):]
    y_test = y[int(len(y) * ratio):]
    return X_train, y_train, X_test, y_test
    

In [14]:
def evaluate(
    model, data=None, on="test", log_sale=False, return_preds=False, no_print=False
):
    """
    Evaluates the model on the test data or the train data

    Parameters
    ----------
    model : sklearn model
        The model to evaluate
    on : str
        The data to evaluate on
    log_sale : bool
        Whether to log the sale price or not
    return_preds : bool
        Whether to return the predictions or not

    Returns
    -------
    (dict, dataframe)|(dataframe)
        (metrics, y_pred) or y_pred
    """
    if data is None:
        if on == "test":
            X = X_test
            y = y_test
        elif on == "train":
            X = X_train
            y = y_train
    else:
        X = data[0]
        y = data[1]
    # print("Predicting")
    preds = model.predict(X)
    if log_sale:
        # print("Exponentiating")
        preds = np.exp(preds) - 1
        y = np.exp(y) - 1
    # print("Calculating Metrics")
    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)
    try:
        msle = mean_squared_log_error(y, preds)
    except:
        preds = np.abs(preds)
        msle = mean_squared_log_error(y, preds)
    rmse = np.sqrt(msle)
    r2_value = r2_score(y, preds)
    if not no_print:
        print("MSE:", mse)
        print("MAE:", mae)
        print("MSLE:", msle)
        print("RMSE:", rmse)
        print("R2:", r2_value)

    metrics = {"mse": mse, "mae": mae, "msle": msle, "rmse": rmse, "r2": r2_value}
    if return_preds:
        return metrics, preds
    else:
        return metrics


In [15]:
def create_pred_dataframe(X, model, id=None, log_sale=False):
    """
    Creates a dataframe of the predictions

    Parameters
    ----------
    X : dataframe
        The dataframe to predict on
    model : sklearn model
        The model to predict with
    id : str
        The id of the dataframe
    log_sale : bool
        Whether to log the sale price or not

    Returns
    -------
    dataframe
        The dataframe of the predictions
    """
    preds = model.predict(X)
    if log_sale:
        preds = np.exp(preds) - 1
    if id is None:
        id = np.arange(len(preds))
    df = pd.DataFrame(preds, columns=["sales"])
    df = pd.DataFrame(id, columns=["id"])
    df["sales"] = preds
    df.reset_index(drop=True, inplace=True)
    return df


In [62]:
def train_and_evaluate(
    model, file, log_sale=False, return_preds=True, on="test", no_print=False, large=True
):
    (train_data, test_data, y), test_indices = merge_dataframes(file, large=large)
    X_train, y_train, X_test, y_test = create_train_test(train_data, y)
    model.fit(X_train, y_train)
    if on == "test":
        data = (X_test, y_test)
    elif on == "train":
        data = (X_train, y_train)
    if return_preds:
        metrics, preds = evaluate(
            model,
            data=data,
            on=on,
            log_sale=log_sale,
            return_preds=return_preds,
            no_print=no_print,
        )
    else:
        metrics = evaluate(
            model,
            data=data,
            on=on,
            log_sale=log_sale,
            return_preds=return_preds,
            no_print=no_print,
        )
        preds = None
    return metrics, preds


def train_evaluate_and_save(model_name, files, data="test", **kwargs):
    i = 0
    for file in tqdm.tqdm(files, desc="Training and Evaluating"):
        name = model_name + "_" + str(i)
        # print("Performance for:", file)
        metrics, _ = train_and_evaluate(file=file, **kwargs)
        # print("\n")
        if data == "test":
            performace_on_test[name] = metrics
        else:
            performace_on_train[name] = metrics
        i += 1


In [18]:
def train_and_predict(model, file):
    (train_data, test_data, y), test_indices = merge_dataframes(file)
    X_train, y_train, X_test, y_test = create_train_test(train_data, y)
    model.fit(X_train, y_train)
    df = create_pred_dataframe(test_data, model, id=test_indices, log_sale=False)
    return df


# Models


## How To Proceed


We have 88 dataframes for the *small* dataset and 561 for the *large*. This means that searching for the best model for each model might not possible. That's why we'll be using 15 different datasets to train and evaluate models. The model performing the best on these three datasets will be used as a final model. First we'll do this for the *small* datasets.


## Small Dataset

In [26]:
random.seed(42)
TRAIN_DIR = "data/small/train"
TEST_DIR = "data/small/test"

all_files = os.listdir(TRAIN_DIR)
sample_files = random.sample(all_files, 15)
sample_files

['G_LADIES.csv',
 'C_HOUSEHOLD.csv',
 'I_HOUSEHOLD.csv',
 'B_GROCERY.csv',
 'I_MISC.csv',
 'Z_GROCERY.csv',
 'G_HOUSEHOLD.csv',
 'I_FOOD.csv',
 'G_MISC.csv',
 'J_HOUSEHOLD.csv',
 'J_STATIONARY.csv',
 'H_BEVERAGE.csv',
 'H_LADIES.csv',
 'F_GROCERY.csv',
 'J_ELECTRONICS.csv']

In [27]:
len(all_files)

88

### The Base Model

In [30]:
!ls data/large

test_large  train_large


The linear model with default parameters will be our base model.


In [63]:
lr = LinearRegression()

In [64]:
performace_on_test = {}
train_evaluate_and_save("lr1", sample_files, data="test", on="test", model=lr, no_print=True, large=False)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [00:01<00:00,  7.71it/s]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,2659221.0,804.080717,6.576132,2.28165,-0.070693
std,3566972.0,760.203159,7.688913,1.211642,0.787876
min,34.90122,3.760181,1.090414,1.044229,-2.027237
25%,88313.88,182.707047,2.464969,1.568044,-0.000756
50%,1066441.0,674.208648,3.816975,1.953708,0.146233
75%,4865453.0,1431.583123,6.416486,2.532814,0.358035
max,10949650.0,2199.494702,26.247583,5.12324,0.657434


In [65]:
performace_on_train = {}
train_evaluate_and_save("lr1", sample_files, data="train", on="train", model=lr, no_print=True, large=False)
pd.DataFrame(performace_on_train).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [00:01<00:00,  7.95it/s]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,1997686.0,676.238625,10.760396,3.008552,0.200098
std,3555184.0,861.147665,7.576041,1.353176,0.069098
min,10.00836,1.618784,0.611311,0.781864,0.06539
25%,67913.16,162.079612,4.561788,2.124231,0.165562
50%,311048.5,390.411753,10.404358,3.225579,0.221262
75%,2429045.0,727.681336,15.601187,3.946725,0.239282
max,12877200.0,3034.101198,23.010674,4.796944,0.303953


### Random Forest

In [66]:
rf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=42)

In [68]:
performace_on_test = {}
train_evaluate_and_save("rf", sample_files, data="test", on="test", model=rf, no_print=True, large=False)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [05:40<00:00, 22.70s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,2311479.0,569.501702,4.484829,1.887922,0.193206
std,5089915.0,547.711794,5.189652,0.993144,0.473411
min,37.13763,3.53758,0.713562,0.844726,-0.852574
25%,93615.09,182.859087,1.735264,1.317028,-0.014333
50%,540377.5,526.276826,2.678668,1.636664,0.11064
75%,2285987.0,791.333685,4.120348,2.028622,0.581855
max,20080990.0,2063.816129,16.895045,4.110358,0.852973


In [69]:
performace_on_train = {}
train_evaluate_and_save("rf", sample_files, data="train", on="train", model=rf, no_print=True, large=False)
pd.DataFrame(performace_on_train).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [05:49<00:00, 23.28s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,1061514.0,434.532305,7.485994,2.530623,0.454605
std,1697516.0,486.283586,4.815403,1.076673,0.1535
min,9.090898,1.397402,0.511645,0.715294,0.183116
25%,48593.24,125.153569,3.514938,1.851915,0.389246
50%,192760.3,272.945944,8.709275,2.951148,0.445163
75%,1778446.0,537.354415,11.459412,3.384227,0.575474
max,5972714.0,1696.774924,13.357831,3.654837,0.673452


### XGBoost

In [71]:
xgbr = XGBRegressor(n_estimators=1000, max_depth=10)

In [73]:
performace_on_test = {}
train_evaluate_and_save("xgbr", sample_files, data="test", on="test", model=xgbr, no_print=True, large=False)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [03:21<00:00, 13.45s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,6067002.0,700.551601,4.472057,1.930714,-0.230158
std,18450030.0,843.923299,4.605227,0.89307,1.563951
min,40.7313,3.596807,1.158664,1.076413,-5.68621
25%,105021.6,191.948457,1.684837,1.297502,-0.195258
50%,581058.8,530.238249,3.154223,1.776013,0.084282
75%,2970035.0,871.913015,4.183369,2.044772,0.507328
max,72477300.0,3369.247564,15.708357,3.963377,0.831105


In [74]:
performace_on_train = {}
train_evaluate_and_save("xgbr", sample_files, data="train", on="train", model=xgbr, no_print=True, large=False)
pd.DataFrame(performace_on_train).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [03:27<00:00, 13.84s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,931945.0,364.197096,6.70198,2.400056,0.500509
std,1509731.0,393.910759,4.233168,1.004479,0.168428
min,9.053522,1.393042,0.510545,0.714525,0.186475
25%,43948.32,111.176642,3.23191,1.764101,0.424801
50%,138853.0,228.351451,8.321409,2.884685,0.483685
75%,1709061.0,503.121995,9.732247,3.118878,0.64107
max,5425655.0,1410.23322,12.693816,3.562838,0.75585


It seems the models are not performin good enough on the *small* dataset. Next, we'll see if using the *large* dataset changes this.

## The *Large* Dataset

In [75]:
random.seed(42)
TRAIN_DIR = "data/large/train_large"
TEST_DIR = "data/large/test_large"

all_files = os.listdir(TRAIN_DIR)
sample_files = random.sample(all_files, 15)
sample_files

['A_BOOKS.csv',
 'B_LADIESWEAR.csv',
 'H_EGGS.csv',
 'L_POULTRY.csv',
 'G_HOME CARE.csv',
 'F_LADIESWEAR.csv',
 'A_HOME AND KITCHEN II.csv',
 'E_BEAUTY.csv',
 'O_LIQUOR,WINE,BEER.csv',
 'Q_HOME APPLIANCES.csv',
 'P_CLEANING.csv',
 'M_PET SUPPLIES.csv',
 'G_BABY CARE.csv',
 'A_PLAYERS AND ELECTRONICS.csv',
 'I_HOME AND KITCHEN I.csv']

### Linear Model

In [76]:
lr = LinearRegression()
performace_on_test = {}
train_evaluate_and_save("lr", sample_files, data="test", on="test", model=lr, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [00:00<00:00, 16.16it/s]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,31156.016606,71.365476,1.596567,0.918198,-0.161984
std,77980.417811,132.554349,3.056029,0.898498,0.964735
min,0.0,0.0,0.0,0.0,-2.830644
25%,18.382486,2.959447,0.254179,0.502207,-0.106281
50%,111.614703,7.763896,0.528386,0.726902,0.016556
75%,3524.990518,47.127334,0.866028,0.929508,0.127165
max,288172.938601,465.330789,10.117823,3.180853,1.0


In [77]:
performace_on_train = {}
train_evaluate_and_save("lr", sample_files, data="train", on="train", model=lr, no_print=True)
pd.DataFrame(performace_on_train).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [00:01<00:00, 14.91it/s]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,11561.938703,38.924385,2.37657,1.202824,0.354442
std,23276.221883,62.417911,3.691862,0.998097,0.286141
min,0.0,0.0,0.0,0.0,0.00484
25%,6.696696,1.870894,0.430865,0.654525,0.208603
50%,68.910501,5.062088,0.998394,0.999197,0.280002
75%,2269.308054,35.257498,2.48522,1.573385,0.370634
max,66418.809861,173.180729,14.050695,3.748426,1.0


Okay, we are getting better result. Let's try some other models.

### The Random Forest

In [86]:
rf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=42)
performace_on_test = {}
train_evaluate_and_save("rf", sample_files, data="test", on="test", model=rf, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [01:15<00:00,  5.01s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,9968.89673,43.523844,1.111628,0.841592,-0.01696
std,20108.455295,65.830668,1.729493,0.65739,0.766834
min,0.0,0.0,0.0,0.0,-2.046899
25%,20.595052,3.012009,0.332618,0.576454,-0.119413
50%,115.195531,7.536792,0.573305,0.757169,-0.034446
75%,3411.787172,44.662401,0.818019,0.904442,0.472006
max,64716.101791,186.827537,5.720232,2.391701,1.0


In [87]:
rf = RandomForestRegressor(n_estimators=1000, max_depth=10, random_state=42)
performace_on_train = {}
train_evaluate_and_save("rf", sample_files, data="train", on="train", model=rf, no_print=True)
pd.DataFrame(performace_on_train).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [01:17<00:00,  5.15s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,4568.740911,24.709947,1.165115,0.789872,0.639216
std,9323.191939,39.109337,1.912266,0.761496,0.201735
min,0.0,0.0,0.0,0.0,0.393078
25%,4.862756,1.474496,0.122756,0.34956,0.472793
50%,52.403624,3.870721,0.368265,0.606849,0.611049
75%,1572.876187,29.079232,0.465209,0.681927,0.765063
max,26840.686955,125.325199,5.431083,2.330468,1.0


### The XGBoost

In [106]:
xgbr = XGBRegressor(n_estimators=1000, max_depth=5)

In [107]:
performace_on_test = {}
train_evaluate_and_save("xgbr", sample_files, data="test", on="test", model=xgbr, no_print=True)
pd.DataFrame(performace_on_test).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [01:15<00:00,  5.04s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,13054.41,45.34583,1.019564,0.8349887,-0.352443
std,26773.26,66.35634,1.374298,0.5876934,1.090654
min,1.963637e-90,1.401298e-45,1.963637e-90,1.401298e-45,-4.080264
25%,20.31117,3.061301,0.3428402,0.585502,-0.297274
50%,138.508,8.151645,0.6475291,0.8046919,-0.125066
75%,4905.504,52.42832,0.8457056,0.9195562,0.003602
max,90320.08,179.9161,4.412108,2.100502,0.575567


In [108]:
performace_on_train = {}
train_evaluate_and_save("xgbr", sample_files, data="train", on="train", model=xgbr, no_print=True)
pd.DataFrame(performace_on_train).T.describe()

Training and Evaluating: 100%|██████████| 15/15 [01:15<00:00,  5.00s/it]


Unnamed: 0,mse,mae,msle,rmse,r2
count,15.0,15.0,15.0,15.0,15.0
mean,1748.848,10.16915,0.7366152,0.5777844,0.601976
std,5897.34,20.126,1.462354,0.6569249,0.316004
min,1.963637e-90,1.401298e-45,1.963637e-90,1.401298e-45,0.0
25%,2.732988,0.9550996,0.01551947,0.1195218,0.467409
50%,9.015534,1.76947,0.2523036,0.5022983,0.608614
75%,206.0753,7.7789,0.3990347,0.6316047,0.826786
max,22953.8,74.75575,4.704983,2.169097,0.999977


Okay, let's make predictions on the test dataset.

In [109]:
df = train_and_predict(xgbr, all_files[0])
for file in tqdm.tqdm(all_files[1:], desc="Predicting..."):
    df_temp = train_and_predict(rf, file)
    df = pd.concat([df, df_temp])

Predicting...: 100%|██████████| 560/560 [55:24<00:00,  5.94s/it]  


In [110]:
df


Unnamed: 0,id,sales
0,3001150,82.642387
1,3001678,82.642387
2,3001744,82.642387
3,3001777,82.642387
4,3001843,82.642387
...,...,...
43,3027537,331.066005
44,3027570,331.077897
45,3028362,393.053604
46,3029319,316.698405


In [112]:
df_ordered = df.sort_values(by="id")
df_ordered

Unnamed: 0,id,sales
0,3000888,6.061122
0,3000889,0.088900
0,3000890,5.591964
0,3000891,2525.394851
0,3000892,0.000000
...,...,...
95,3029395,260.838431
95,3029396,61.289081
95,3029397,963.587138
95,3029398,28.548445


In [113]:
df_ordered["sales"] = df_ordered["sales"].apply(lambda x: np.abs(x))

In [114]:
df_ordered.describe()


Unnamed: 0,id,sales
count,28512.0,28512.0
mean,3015144.0,444.403082
std,8230.85,1161.606446
min,3000888.0,0.0
25%,3008016.0,4.007548
50%,3015144.0,26.670612
75%,3022271.0,273.915931
max,3029399.0,13034.466247


In [118]:
df_ordered.to_csv("submission.csv", index=False)

In [None]:
!kaggle competitions submit -c store-sales-time-series-forecasting -f data/submissions/submission_4.csv -m "Submission 4"

Great!