In [1]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Imports Needed to run Notebook

In [2]:
#Imports for data wrangling 
import cudf
import cupy as cp
import pandas
import numpy
# import warnings
# warnings.filterwarnings('ignore')

In [3]:
#Importing given data
train = cudf.read_csv("./input/train.csv", parse_dates=['date'])

test = cudf.read_csv("./input/test.csv", parse_dates=['date'])

oil = cudf.read_csv("./input/oil.csv", parse_dates=['date'])

holiday = cudf.read_csv("./input/holidays_events.csv")

store = cudf.read_csv("./input/stores.csv")

## Feature Engineering


### Capturing Seasonal Holiday Effect

In [4]:
# Converting dates to datetime
holiday["date"] = cudf.to_datetime(holiday["date"], format='%Y-%m-%d')
holiday = holiday.set_index("date")

# Keeping only celbrated holidays
holiday = holiday.loc[(holiday["transferred"]!=True)].drop("transferred", axis=1)
holiday.loc[holiday["type"]=="Transfer", "type"] = "Holiday"

# Bridged days are day where there is no work
bridge = holiday.loc[holiday["type"]=="Bridge"]
bridge["bridge"] = True
bridge = bridge[["bridge"]]

# Special events
event = holiday.loc[holiday["type"]=="Event"][["description"]]

# Keeping only holidays
holiday = holiday.loc[holiday["type"]=="Holiday"]

# Holidays celerbated localy 
loc_hol = holiday.loc[holiday["locale"]=="Local"][["locale_name", "description"]]

# Holidays celerbrated regionally
reg_hol = holiday.loc[holiday["locale"]=="Regional"][["locale_name", "description"]]

#Holidays celberbrated nationally
nat_hol = holiday.loc[holiday["locale"]=="National"][["description"]]

# Recording days Earthquake
quake = event.loc[event["description"].str.find("Terremoto Manabi")!=-1]
quake["time_since_quake"] = cp.arange(1,len(quake.index)+1)
quake.drop("description", axis=1, inplace=True)

# Removing Earthquake and adding Sporting Events
event = event.loc[event["description"].str.find("Terremoto Manabi")==-1]
event.loc[event["description"].str.find("futbol")!=-1, "description"]= "Sports"

### Location Specific Demand

In [5]:
# Ensure proper format
train["store_nbr"] = train["store_nbr"].astype(int)

# Merging
X = train.merge(store, on="store_nbr", how="left")
X.drop("cluster", axis=1, inplace=True)

# Converting dates to datetime
X["date"] = cudf.to_datetime(X["date"], format='%Y-%m-%d')

# Creating feature measuring the total in store promotions.
total_other_promo_store = X[["date", "store_nbr", "onpromotion"]].groupby(['date', 'store_nbr']).sum()["onpromotion"].reset_index()
total_other_promo_store = total_other_promo_store.rename(columns={'onpromotion': 'total_other_promo_store',})

# Creating feature measuring the total promotions in each town for similar products.
total_other_city_promo = X[["date", "onpromotion", "family", "city"]].groupby(['date', 'city', 'family']).sum()["onpromotion"].reset_index()
total_other_city_promo = total_other_city_promo.rename(columns={'onpromotion': 'total_other_city_promo',})

# Adding new features
X = X.merge(total_other_promo_store, on=['date', 'store_nbr'], how="left")
X = X.merge(total_other_city_promo, on=['date', 'city', 'family'], how="left")

In [6]:
# Ensure proper format
store["store_nbr"] = store["store_nbr"].astype(int)
test["store_nbr"] = test["store_nbr"].astype(int)

# Merging
X_test = test.merge(store, on="store_nbr", how="left")
X_test.drop("cluster", axis=1, inplace=True)

# Converting dates to datetime
X_test["date"] = cudf.to_datetime(X_test["date"], format='%Y-%m-%d')

# Creating feature measuring the total in store promotions.
total_other_promo_store = X_test[["date", "store_nbr", "onpromotion"]].groupby(['date', 'store_nbr']).sum()["onpromotion"].reset_index()
total_other_promo_store = total_other_promo_store.rename(columns={'onpromotion': 'total_other_promo_store',})

# Creating feature measuring the total promotions in each town for similar products.
total_other_city_promo = X_test[["date", "onpromotion", "family", "city"]].groupby(['date', 'city', 'family']).sum()["onpromotion"].reset_index()
total_other_city_promo = total_other_city_promo.rename(columns={'onpromotion': 'total_other_city_promo',})

# Adding new features
X_test = X_test.merge(total_other_promo_store, on=['date', 'store_nbr'], how="left")
X_test = X_test.merge(total_other_city_promo, on=['date', 'city', 'family'], how="left")

In [7]:
X = X.set_index("date")
X_test = X_test.set_index("date")

### Merging with Holidays

In [8]:
# Adding national holidays
X = X.merge(nat_hol, on="date", how="left")

# Bridge days
X = X.merge(bridge, on="date", how="left")

# Adding local holdays
X = X.merge(loc_hol, left_on=["date", "city"],
            right_on=["date", "locale_name"],
            suffixes=(None, '_l'), how="left"
           )
X.drop("locale_name", axis=1, inplace=True)

# Adding regional holidays
X = X.merge(reg_hol, left_on=["date", "state"],
            right_on=["date", "locale_name"], 
            suffixes=(None, '_r'),how="left"
           )
X.drop("locale_name", axis=1, inplace=True)

# True if holiday that Day
X["holiday"] = (((X["descriptionNone"].isnull()==False) | (X["description_l"].isnull()==False)) | (X["description"].isnull()==False))

# Combine Holiday descriptions
X.drop("descriptionNone", axis=1, inplace=True)
X.drop("description_l", axis=1, inplace=True)
X.drop("description", axis=1, inplace=True)

#Events
X = X.merge(event, on="date", how="left")
X = X.rename(columns={'description': 'event',})
X["event"] = X["event"].fillna("none")

# Adding Quake data
X = X.merge(quake, on="date", how="left")
X["time_since_quake"] = X["time_since_quake"].fillna(0)

#To model a diminishing marginal effect on the economy by the earthquake
X["time_since_quake_sq"] = X["time_since_quake"]**2

In [9]:
# Adding national holidays
X_test = X_test.merge(nat_hol, on="date", how="left")
del nat_hol

# Bridge days
X_test = X_test.merge(bridge, on="date", how="left")
del bridge

# Adding local holdays
X_test = X_test.merge(loc_hol, left_on=["date", "city"],
            right_on=["date", "locale_name"],
            suffixes=(None, '_l'), how="left"
           )
X_test.drop("locale_name", axis=1, inplace=True)
del loc_hol

# Adding regional holidays
X_test = X_test.merge(reg_hol, left_on=["date", "state"],
            right_on=["date", "locale_name"], 
            suffixes=(None, '_r'),how="left"
           )
X_test.drop("locale_name", axis=1, inplace=True)
del reg_hol

# True if holiday that Day
X_test["holiday"] = (((X_test["descriptionNone"].isnull()==False) | (X_test["description_l"].isnull()==False)) | (X_test["description"].isnull()==False))

# Combine Holiday descriptions
X_test.drop("descriptionNone", axis=1, inplace=True)
X_test.drop("description_l", axis=1, inplace=True)
X_test.drop("description", axis=1, inplace=True)

#Events
X_test = X_test.merge(event, on="date", how="left")
X_test = X_test.rename(columns={'description': 'event',})
X_test["event"] = X_test["event"].fillna("none")
del event

# Adding Quake data
X_test = X_test.merge(quake, on="date", how="left")
X_test["time_since_quake"] = X_test["time_since_quake"].fillna(0)
del quake

#To model a diminishing marginal effect on the economy by the earthquake
X_test["time_since_quake_sq"] = X_test["time_since_quake"]**2

### Merging with Oil Prices

In [10]:
oil["date"] = cudf.to_datetime(oil["date"], format='%Y-%m-%d')
oil = oil.set_index("date")
X = X.merge(oil, on="date", how="left")
X_test = X_test.merge(oil, on="date", how="left")

del oil

# There is no price of oil on days that the market is closed.
# To fill the price, we first fill with the last value.
X["dcoilwtico"]= X["dcoilwtico"].ffill()
X_test["dcoilwtico"]= X_test["dcoilwtico"].ffill()

# We back fill just for first couple values that are empty.
X["dcoilwtico"]= X["dcoilwtico"].bfill()
X_test["dcoilwtico"]=X_test["dcoilwtico"].bfill()

# I just to do a rolling average to smooth out any problems with the empty values,
# and to capture any effect of changes. 
X["dcoilwtico"] = X["dcoilwtico"].rolling(
    window=30,       
    min_periods=1,  
).mean()

X_test["dcoilwtico"] = X_test["dcoilwtico"].rolling(
    window=30,       
    min_periods=1,  
).mean()

### Time Based Varriables

In [11]:
# Time variables
X["day"] = X.index.dayofweek
X["dayofyear"] = X.index.dayofyear
X["month"] = X.index.month
X["year"] = X.index.year

# This varible says whether it is a workday.
X["workday"] = (((X.bridge.isnull()) & (X.holiday==False)) & ((X["day"]!=5) & (X["day"]!=6)))
X.drop("bridge", axis=1, inplace=True)

# In Ecudor, people get paid on the 15 and the last day of the month
X["payday"] = ((X.index.day==15) | (X.index.day==X.index.to_series().dt.days_in_month)) 

In [12]:
# Time variables
X_test["day"] = X_test.index.dayofweek
X_test["dayofyear"] =X_test.index.dayofyear
X_test["month"] = X_test.index.month
X_test["year"] = X_test.index.year

# This varible says whether it is a workday.
X_test["workday"] = (((X_test.bridge.isnull()) & (X_test.holiday==False)) & ((X_test["day"]!=5) & (X_test["day"]!=6)))
X_test.drop("bridge", axis=1, inplace=True)

# In Ecudor, people get paid on the 15 and the last day of the month
X_test["payday"] = ((X_test.index.day==15) | (X_test.index.day==X_test.index.to_series().dt.days_in_month)) 

### Final Dataframe

In [13]:
# Fixing data type
X_test = X_test.reset_index()
X_test = X_test.sort_values(["store_nbr", "family", "date"])
X_test = X_test.set_index("date")

X_test["family"] = X_test["family"].astype('category').cat.codes
X_test["store_nbr"] = X_test["store_nbr"].astype('int')
X_test["holiday"] = X_test["holiday"].astype('category').cat.codes
X_test["event"] = X_test["event"].astype('category').cat.codes
X_test["city"] = X_test["city"].astype('category').cat.codes
X_test["state"] = X_test["state"].astype('category').cat.codes
X_test["type"] = X_test["type"].astype('category').cat.codes

In [14]:
X = X.reset_index()
X = X.sort_values(["store_nbr", "family", "date"])
X = X.set_index("date")

X["family"] = X["family"].astype('category').cat.codes
X["store_nbr"] = X["store_nbr"].astype('int')
X["holiday"] = X["holiday"].astype('category').cat.codes
X["event"] = X["event"].astype('category').cat.codes
X["city"] = X["city"].astype('category').cat.codes
X["state"] = X["state"].astype('category').cat.codes
X["type"] = X["type"].astype('category').cat.codes

y = X[["store_nbr", "family", "sales"]]
X.drop("sales", axis=1, inplace=True)

X.head()

Unnamed: 0_level_0,id,store_nbr,family,onpromotion,city,state,type,total_other_promo_store,total_other_city_promo,holiday,event,time_since_quake,time_since_quake_sq,dcoilwtico,day,dayofyear,month,year,workday,payday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2013-01-01,0,1,0,0,18,12,3,0,0,1,4,0,0,93.14,1,1,1,2013,False,False
2013-01-02,1782,1,0,0,18,12,3,0,0,0,4,0,0,93.14,2,2,1,2013,True,False
2013-01-03,3564,1,0,0,18,12,3,0,0,0,4,0,0,93.046,3,3,1,2013,True,False
2013-01-04,5346,1,0,0,18,12,3,0,0,0,4,0,0,93.334,4,4,1,2013,True,False
2013-01-05,7128,1,0,0,18,12,3,0,0,0,4,0,0,93.456,5,5,1,2013,False,False


In [15]:
# Removing early time with NaNs
X = X.loc[X.index >= "2015-07-01"]
y = y.loc[y.index >= "2015-07-01"]

## Trainning Model
###  Imports

In [16]:
#Custom Time series functions
#from jons_time_series_functions import Prepare_data, Hybrid_Time_Series_ML, Hybrid_Pipeline

# Data Preprocessing 
# from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Cross-Validation
from sklearn.model_selection import TimeSeriesSplit

# Models
# from sklearn.dummy import DummyRegressor
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, mean_squared_log_error
# from xgboost import XGBRegressor

In [17]:
# Data Preprocessing 
from cuml.dask.preprocessing import OneHotEncoder, LabelEncoder
from cuml.preprocessing import MinMaxScaler, StandardScaler, SimpleImputer, LabelEncoder, OneHotEncoder
from cuml.compose import make_column_transformer
from statsmodels.tsa.deterministic import CalendarFourier

# Cross-Validation
from sklearn.model_selection import TimeSeriesSplit

# Models
from sklearn.dummy import DummyRegressor
from cuml.linear_model import LinearRegression
from cuml.metrics import mean_squared_error, mean_squared_log_error
from bayes_opt import BayesianOptimization
from xgboost import XGBRegressor

# Torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from jons_time_series_functions import DeterministicProcess_gpu

In [18]:
#This line is what will tell us if we did our instalation correct
print(f'Cuda {"is" if torch.cuda.is_available() else "is not"} available on this device.')
print()

# If available, this code will proceed to give the deivice specifications. 
if torch.cuda.is_available():
    
    print(f'PyTorch on this device is running version {torch.version.cuda} Cuda')
    print(f'The number of GPUs on this device is {torch.cuda.device_count()}')
    num = torch.cuda.device_count()
    
    for i in range(0, num):
        print()
        print(f'GPU #{i+1}: In our device, we have a {torch.cuda.get_device_name(f"cuda:{i}")}. It')
        print(f'has a total memory of {"{:,.0f}".format(torch.cuda.get_device_properties(f"cuda:{i}").total_memory)} and {"{:,.0f}".format(torch.cuda.get_device_properties(f"cuda:{i}").multi_processor_count)} multi-processors.')
        
#Defining which device will be used
if torch.cuda.is_available(): 
    dev = "cuda" 
else: 
    dev = "cpu" 
    
#Set device
device = torch.device(dev) 

Cuda is available on this device.

PyTorch on this device is running version 12.0 Cuda
The number of GPUs on this device is 1

GPU #1: In our device, we have a NVIDIA GeForce RTX 4070. It
has a total memory of 12,878,086,144 and 46 multi-processors.


### Trainning

In [19]:
# Define the preprocessing steps
numeric_transformer = [['dcoilwtico', 'onpromotion', 'total_other_promo_store', 'total_other_city_promo'], StandardScaler()]
categorical_transformer = [['event'], OneHotEncoder(sparse=False, handle_unknown='error', drop="first")]

column_list = ["time_since_quake", "time_since_quake_sq"]

In [20]:
import numpy as np

In [21]:
class ColTransformer_gpu:
    def __init__(self, transformers):
        self.transformers = transformers
    
    def fit_transform(self, X):
        # Initialize an empty list to store transformed features
        X = X.copy()
        transformed_features = []
        
        num = 0
        
        # Iterate over each transformer and apply fit_transform
        for name, transformer in self.transformers:
            X_transformed = cudf.DataFrame(transformer.fit_transform(X[name]))

            prev_num = num
            num += len(X_transformed.columns)
            
            # Rename columns to ensure uniqueness
            X_transformed.columns = [f"{i}" for i in range(prev_num, num)]
            
            transformed_features.append(X_transformed)  # Append transformed feature DataFrame

            X  = X.drop(name, axis=1)

        transformed_features.append(X)
        
        # Concatenate transformed features horizontally
        X_transformed_concat = cudf.concat(transformed_features, axis=1)
        
        
        return X_transformed_concat
    
    def transform(self, X):
        X = X.copy()
        # Initialize an empty list to store transformed features
        transformed_features = []
        
        num = 0
        
        # Iterate over each transformer and apply transform
        for name, transformer in self.transformers:
            X_transformed = cudf.DataFrame(transformer.transform(X[name]))

            prev_num = num
            num += len(X_transformed.columns)
            
            # Rename columns to ensure uniqueness
            X_transformed.columns = [f"{i}" for i in range(prev_num, num)]
            
            transformed_features.append(X_transformed)  # Append transformed feature DataFrame
            
            X  = X.drop(name, axis=1)

        transformed_features.append(X)
        
        # Concatenate transformed features horizontally
        X_transformed_concat = cudf.concat(transformed_features, axis=1)
        
        return X_transformed_concat

# Define transformers for numerical and categorical features
numeric_transformer = ['dcoilwtico', 'onpromotion', 'total_other_promo_store', 'total_other_city_promo'], StandardScaler()
categorical_transformer = ['event'], OneHotEncoder(sparse=False, handle_unknown='error', drop="first")

# Initialize and apply the ColTransformer_gpu
ct = ColTransformer_gpu([numeric_transformer, categorical_transformer])

In [22]:
# # Define the preprocessing steps
# numeric_transformer = StandardScaler()
# categorical_transformer = OneHotEncoder(sparse=False, handle_unknown='error', drop="first")

In [23]:
train_size = int(len(X.index.drop_duplicates()) * 2 / 3)
test_size = len(X.index.drop_duplicates()) - train_size

date_train, date_test = X.index.drop_duplicates()[:train_size], X.index.drop_duplicates()[train_size:]
X_train, X_test = X.loc[date_train], X.loc[date_test]

In [24]:
# dpg = DeterministicProcess_gpu(1, fourier = ["W", 2])
# X_1_train = dpg.in_sample(X_train[column_list][~X_train.index.duplicated(keep='last')])
# X_1_test = dpg.out_of_sample(X_test[column_list][~X_test.index.duplicated(keep='last')])

In [25]:
# dpg = DeterministicProcess_gpu(1, fourier = ["W", 2])
# X_1_train = dpg.in_sample(X_train[column_list][~X_train.index.duplicated(keep='last')])
# X_1_test = dpg.out_of_sample(X_test[column_list][~X_test.index.duplicated(keep='last')])

# a=X_train.copy()
# X_2_train = a.drop(column_list+["id"], axis=1)
# X_2_train = X_2_train.reset_index().drop(["date", "store_nbr", "family", "state", "city", "type", "dayofyear", "year"], axis=1)

# X_2_train = ct.fit_transform(X_2_train)

# a=X_test.copy()
# X_2_test = a.drop(column_list+["id"], axis=1)
# X_2_test = X_2_test.reset_index().drop(["date", "store_nbr", "family", "state", "city", "type", "dayofyear", "year"], axis=1)

# X_2_test = ct.transform(X_2_test)

In [26]:
class Prepare_data:
    '''
    A Class for preparing data for Hybrid Models. 
    
    Parameters:
        X_1_column: Columns to be used in the 1st Machine Learning model,
        unwanted_columns: Columns to not be used for the 2nd Machine Learning Model.
                 
    Attributes:
        X_1(X): Creates data for the first model,
        X_2(X): Creates data for the second model,
        preprocessor_2: Preprossesing for data of type oj=bject, catergory, and ,
        transform(X): Transform data performing X_1(X) and X_2(X)  
    '''
    
    def __init__(self, X_1_column, transformer_list, unwanted_columns=[], to_tensor=False):
        '''
        Initializes the Prepare_data class.
        
        Parameters:
            X_1_column: Columns to be used in the 1st Machine Learning model,
            unwanted_columns: Columns to not be used for the 2nd Machine Learning Model.
        '''
        
        #Defining instance variables
        self.column_list = X_1_column
        self.unwanted_columns = unwanted_columns
        self.to_tensor = to_tensor
        self.ct = ColTransformer_gpu(transformer_list)
        self.dpg = DeterministicProcess_gpu(1, fourier = ["W", 2])
        
    def X_1_fit_transform(self, X): 
        '''
        Transform data into X_1. Expects linear 1st model so uses Deterministic Process. 
        
        Parameters:
            X: Data for the model. 
        '''
        
        # return deterministic process
        return dpg.in_sample(X[self.column_list][~X.index.duplicated(keep='last')])

    def X_1_transform(self, X): 
        '''
        Transform data into X_1. Expects linear 1st model so uses Deterministic Process. 
        
        Parameters:
            X: Data for the model. 
        '''
        
        return dpg.out_of_sample(X[self.column_list][~X.index.duplicated(keep='last')])

    def X_2_fit_transform(self, X):
        '''
        Transform data into X_2.
        
        Parameters:
            X: Data for the model. 
        '''
        X_2 = X.reset_index()
        X_2 = X_2.drop(self.column_list + self.unwanted_columns + ["date"], axis=1)
        
        X_2 = self.ct.fit_transform(X_2)
        return X_2
        
    def X_2_transform(self, X):
        '''
        Transform data into X_2.
        
        Parameters:
            X: Data for the model. 
        '''
        X_2 = X.reset_index()
        X_2 = X_2.drop(self.column_list + self.unwanted_columns + ["date"], axis=1)
        X_2 = self.ct.transform(X_2)
        
        return X_2
    
    def fit_transform(self, X):
        '''
        Transform data into X_1 and X_2.
        
        Parameters:
            X: Data for the model. 
        '''
        X_1, X_2 = self.X_1_fit_transform(X), self.X_2_fit_transform(X)
        if self.to_tensor:
            
            X_1 = torch.tensor(X_1.values).to(device)
            X_2 = torch.from_numpy(X_2.astype("float")).to(device)
        
        return X_1, X_2
        
    def transform(self, X, to_tensor=False):
        '''
        Transform data into X_1 and X_2.
        
        Parameters:
            X: Data for the model.
            to_tensor: If true changes data to tensors on the GPU.
        '''
        X_1, X_2 = self.X_1_transform(X), self.X_2_transform(X)
        
        if self.to_tensor:
            
            X_1 = torch.tensor(X_1.values).to(device)
            X_2 = torch.from_numpy(X_2.astype("float")).to(device)
            
        return X_1, X_2


In [27]:
pd = Prepare_data(column_list, [numeric_transformer, categorical_transformer])

In [28]:
a = X_train.copy().drop(["id", "store_nbr", "family", "state", "city", "type", "dayofyear", "year"], axis=1).copy()

X_1_train, X_2_train = pd.fit_transform(a)

a2 = X_2_test.copy().drop(["id", "store_nbr", "family", "state", "city", "type", "dayofyear", "year"], axis=1)

X_1_train, X_2_test = pd.transform(a2)

NameError: name 'dpg' is not defined

In [None]:
X_train.reset_index()

In [None]:
 # Model: Use the average y value for each catergory from the training data as the predicted y value for the test data.
hts = Hybrid_Time_Series_ML(DummyRegressor(strategy="mean"), DummyRegressor(strategy="constant", constant=0))
data_prep = Prepare_data(column_list, preprocessor)
hp = Hybrid_Pipeline(data_prep, hts)

# Use time series split for cross validation. 
cv_split = TimeSeriesSplit(n_splits = 4)

# Create lists to append MSE scores. 
train_msle = []
valid_msle = []

# Dates to index through. 
dates = X.index.drop_duplicates()
a = 0
# Perform Cross-Validation to determine how model will do on unseen data.
for train_index, valid_index in cv_split.split(dates):
    a = a+1
    print(f"Fold {a}:") 
    model = Hybrid_Pipeline(Prepare_data(column_list, preprocessor), hts)
    
    # Index dates.
    date_train, date_valid = dates[train_index], dates[valid_index]

    # Selecting data for y_train and y_valid.
    y_train = y.loc[date_train]
    y_valid = y.loc[date_valid]

    # Selecting data for X_train and X_valid.
    X_train = X.loc[date_train]
    X_valid = X.loc[date_valid]


    # Fitting model.
    model.fit(X_train, y_train)

    # Create predictions for Trainning and Validation.
    fit = model.predict(X_train)
    pred = model.predict(X_valid)
    
    # MSE for trainning and validation. 
    train_msle.append(mean_squared_log_error(y_train["sales"], fit))
    valid_msle.append(mean_squared_log_error(y_valid["sales"], pred))
    
    print(f"Training RMSLE: {cp.sqrt(mean_squared_log_error(y_train.sales, fit)):.3f}, Validation RMSLE: {cp.sqrt(mean_squared_log_error(y_valid.sales, pred)):.3f}")

# Returns the square root of the average of the MSE.
print("Average Across Folds")
print(f"Training RMSLE:{cp.sqrt(cp.mean(train_msle)):.3f}, Validation RMSLE: {cp.sqrt(cp.mean(valid_msle)):.3f}")

In [None]:
# Fit Model
hp.fit(X, y)

## Final Predictions and Submission 

In [None]:
# Generate Predictions
test_id = X_test[["family", "store_nbr", "id"]]
pred = hp.predict(X_test)
pred = pred.reset_index()
pred = test_id.merge(pred, on=["date", "store_nbr", "family"])
pred = pred[["id", "sales"]]
pred = pred.set_index("id")
pred.head()

In [None]:
pred.to_csv('submission.csv', index=True)

In [None]:
# api.competition_submit('submission.csv','1st API Submission','store-sales-time-series-forecasting')