In [1]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

## Imports Needed to run Notebook

In [2]:
#Imports for data wrangling 
import cudf
import cupy as cp
import pandas as pd
import numpy as np
# import warnings
# warnings.filterwarnings('ignore')

In [3]:
#Importing given data
train = cudf.read_csv("./input/train.csv", parse_dates=['date'])

test = cudf.read_csv("./input/test.csv", parse_dates=['date'])

oil = cudf.read_csv("./input/oil.csv", parse_dates=['date'])

holiday = cudf.read_csv("./input/holidays_events.csv")

store = cudf.read_csv("./input/stores.csv")

## Feature Engineering


### Capturing Seasonal Holiday Effect

In [4]:
# Converting dates to datetime
holiday["date"] = cudf.to_datetime(holiday["date"], format='%Y-%m-%d')
holiday = holiday.set_index("date")

# Keeping only celbrated holidays
holiday = holiday.loc[(holiday["transferred"]!=True)].drop("transferred", axis=1)
holiday.loc[holiday["type"]=="Transfer", "type"] = "Holiday"

# Bridged days are day where there is no work
bridge = holiday.loc[holiday["type"]=="Bridge"]
bridge["bridge"] = True
bridge = bridge[["bridge"]]

# Special events
event = holiday.loc[holiday["type"]=="Event"][["description"]]

# Keeping only holidays
holiday = holiday.loc[holiday["type"]=="Holiday"]

# Holidays celerbated localy 
loc_hol = holiday.loc[holiday["locale"]=="Local"][["locale_name", "description"]]

# Holidays celerbrated regionally
reg_hol = holiday.loc[holiday["locale"]=="Regional"][["locale_name", "description"]]

#Holidays celberbrated nationally
nat_hol = holiday.loc[holiday["locale"]=="National"][["description"]]

# Recording days Earthquake
quake = event.loc[event["description"].str.find("Terremoto Manabi")!=-1]
quake["time_since_quake"] = cp.arange(1,len(quake.index)+1)
quake.drop("description", axis=1, inplace=True)

# Removing Earthquake and adding Sporting Events
event = event.loc[event["description"].str.find("Terremoto Manabi")==-1]
event.loc[event["description"].str.find("futbol")!=-1, "description"]= "Sports"

In [5]:
bridge

Unnamed: 0_level_0,bridge
date,Unnamed: 1_level_1
2012-12-24,True
2012-12-31,True
2014-12-26,True
2015-01-02,True
2016-11-04,True


### Location Specific Demand

In [6]:
# Ensure proper format
train["store_nbr"] = train["store_nbr"].astype(int)

# Merging
X = train.merge(store, on="store_nbr", how="left")
X.drop("cluster", axis=1, inplace=True)

# Converting dates to datetime
X["date"] = cudf.to_datetime(X["date"], format='%Y-%m-%d')

# Creating feature measuring the total in store promotions.
total_other_promo_store = X[["date", "store_nbr", "onpromotion"]].groupby(['date', 'store_nbr']).sum()["onpromotion"].reset_index()
total_other_promo_store = total_other_promo_store.rename(columns={'onpromotion': 'total_other_promo_store',})

# Creating feature measuring the total promotions in each town for similar products.
total_other_city_promo = X[["date", "onpromotion", "family", "city"]].groupby(['date', 'city', 'family']).sum()["onpromotion"].reset_index()
total_other_city_promo = total_other_city_promo.rename(columns={'onpromotion': 'total_other_city_promo',})

# Adding new features
X = X.merge(total_other_promo_store, on=['date', 'store_nbr'], how="left")
X = X.merge(total_other_city_promo, on=['date', 'city', 'family'], how="left")

In [7]:
# Ensure proper format
store["store_nbr"] = store["store_nbr"].astype(int)
test["store_nbr"] = test["store_nbr"].astype(int)

# Merging
X_test = test.merge(store, on="store_nbr", how="left")
X_test.drop("cluster", axis=1, inplace=True)

# Converting dates to datetime
X_test["date"] = cudf.to_datetime(X_test["date"], format='%Y-%m-%d')

# Creating feature measuring the total in store promotions.
total_other_promo_store = X_test[["date", "store_nbr", "onpromotion"]].groupby(['date', 'store_nbr']).sum()["onpromotion"].reset_index()
total_other_promo_store = total_other_promo_store.rename(columns={'onpromotion': 'total_other_promo_store',})

# Creating feature measuring the total promotions in each town for similar products.
total_other_city_promo = X_test[["date", "onpromotion", "family", "city"]].groupby(['date', 'city', 'family']).sum()["onpromotion"].reset_index()
total_other_city_promo = total_other_city_promo.rename(columns={'onpromotion': 'total_other_city_promo',})

# Adding new features
X_test = X_test.merge(total_other_promo_store, on=['date', 'store_nbr'], how="left")
X_test = X_test.merge(total_other_city_promo, on=['date', 'city', 'family'], how="left")

In [8]:
X = X.set_index("date")
X_test = X_test.set_index("date")

### Merging with Holidays

In [9]:
# Adding national holidays
X = X.merge(nat_hol, on="date", how="left")

# Bridge days
X = X.merge(bridge, on="date", how="left")

# Adding local holdays
X = X.merge(loc_hol, left_on=["date", "city"],
            right_on=["date", "locale_name"],
            suffixes=(None, '_l'), how="left"
           )
X.drop("locale_name", axis=1, inplace=True)

# Adding regional holidays
X = X.merge(reg_hol, left_on=["date", "state"],
            right_on=["date", "locale_name"], 
            suffixes=(None, '_r'),how="left"
           )
X.drop("locale_name", axis=1, inplace=True)

# True if holiday that Day
X["holiday"] = (((X["descriptionNone"].isnull()==False) | (X["description_l"].isnull()==False)) | (X["description"].isnull()==False))

X["holiday_description"] = X['descriptionNone'].fillna('') + X['description_l'].fillna('') + X['description'].fillna('')

# Combine Holiday descriptions
X.drop("descriptionNone", axis=1, inplace=True)
X.drop("description_l", axis=1, inplace=True)
X.drop("description", axis=1, inplace=True)

#Events
X = X.merge(event, on="date", how="left")
X = X.rename(columns={'description': 'event',})
X["event"] = X["event"].fillna("none")

# Adding Quake data
X = X.merge(quake, on="date", how="left")
X["time_since_quake"] = X["time_since_quake"].fillna(0)

#To model a diminishing marginal effect on the economy by the earthquake
X["time_since_quake_sq"] = X["time_since_quake"]**2

In [10]:
# Adding national holidays
X_test = X_test.merge(nat_hol, on="date", how="left")
del nat_hol

# Bridge days
X_test = X_test.merge(bridge, on="date", how="left")
del bridge

# Adding local holdays
X_test = X_test.merge(loc_hol, left_on=["date", "city"],
            right_on=["date", "locale_name"],
            suffixes=(None, '_l'), how="left"
           )
X_test.drop("locale_name", axis=1, inplace=True)
del loc_hol

# Adding regional holidays
X_test = X_test.merge(reg_hol, left_on=["date", "state"],
            right_on=["date", "locale_name"], 
            suffixes=(None, '_r'),how="left"
           )
X_test.drop("locale_name", axis=1, inplace=True)
del reg_hol

# True if holiday that Day
X_test["holiday"] = (((X_test["descriptionNone"].isnull()==False) | (X_test["description_l"].isnull()==False)) | (X_test["description"].isnull()==False))

X_test["holiday_description"] = X_test['descriptionNone'].fillna('') + X_test['description_l'].fillna('') + X_test['description'].fillna('')

# Combine Holiday descriptions
X_test.drop("descriptionNone", axis=1, inplace=True)
X_test.drop("description_l", axis=1, inplace=True)
X_test.drop("description", axis=1, inplace=True)

#Events
X_test = X_test.merge(event, on="date", how="left")
X_test = X_test.rename(columns={'description': 'event',})
X_test["event"] = X_test["event"].fillna("none")
del event

# Adding Quake data
X_test = X_test.merge(quake, on="date", how="left")
X_test["time_since_quake"] = X_test["time_since_quake"].fillna(0)
del quake

#To model a diminishing marginal effect on the economy by the earthquake
X_test["time_since_quake_sq"] = X_test["time_since_quake"]**2

### Merging with Oil Prices

In [11]:
oil["date"] = cudf.to_datetime(oil["date"], format='%Y-%m-%d')
oil = oil.set_index("date")
X = X.merge(oil, on="date", how="left")
X_test = X_test.merge(oil, on="date", how="left")

del oil

# There is no price of oil on days that the market is closed so we interpolate to get next value.
X["dcoilwtico"]= X["dcoilwtico"].interpolate(method="linear", limit_direction="both")
X_test["dcoilwtico"]= X_test["dcoilwtico"].interpolate(method="linear", limit_direction="both")

# I just to do a rolling average to smooth out any problems with the empty values,
# and to capture any effect of changes. 
X["dcoilwtico"] = X["dcoilwtico"].rolling(
    window=30,       
    min_periods=1,  
).mean()

X_test["dcoilwtico"] = X_test["dcoilwtico"].rolling(
    window=30,       
    min_periods=1,  
).mean()

### Time Based Varriables

In [12]:
# Time variables
X["day"] = X.index.dayofweek
X["dayofyear"] = X.index.dayofyear
X["month"] = X.index.month
X["year"] = X.index.year

# This varible says whether it is a workday.
X["workday"] = (((X.bridge.isnull()) & (X.holiday==False)) & ((X["day"]!=5) & (X["day"]!=6)))
X.drop("bridge", axis=1, inplace=True)

# In Ecudor, people get paid on the 15 and the last day of the month
X["payday"] = ((X.index.day==15) | (X.index.day==X.index.to_series().dt.days_in_month)) 

In [13]:
# Time variables
X_test["day"] = X_test.index.dayofweek
X_test["dayofyear"] =X_test.index.dayofyear
X_test["month"] = X_test.index.month
X_test["year"] = X_test.index.year

# This varible says whether it is a workday.
X_test["workday"] = (((X_test.bridge.isnull()) & (X_test.holiday==False)) & ((X_test["day"]!=5) & (X_test["day"]!=6)))
X_test.drop("bridge", axis=1, inplace=True)

# In Ecudor, people get paid on the 15 and the last day of the month
X_test["payday"] = ((X_test.index.day==15) | (X_test.index.day==X_test.index.to_series().dt.days_in_month)) 

### Data Type

In [14]:
# Fixing data type
X_test = X_test.reset_index()
X_test = X_test.set_index("date")

X_test["onpromotion"] = X_test["onpromotion"].astype('float')
X_test["total_other_promo_store"] = X_test["total_other_promo_store"].astype('float')
X_test["total_other_city_promo"] = X_test["total_other_city_promo"].astype('float')
X_test["holiday"] = X_test["holiday"].astype('float')

X_test["family"] = X_test["family"].astype('category')
X_test["store_nbr"] = X_test["store_nbr"].astype('category')
X_test["holiday"] = X_test["holiday"].astype('category')
X_test["event"] = X_test["event"].astype('category')
X_test["city"] = X_test["city"].astype('category')
X_test["state"] = X_test["state"].astype('category')
X_test["type"] = X_test["type"].astype('category')
X_test["workday"] = X_test["workday"].astype('category')
X_test["payday"] = X_test["payday"].astype('category')
X_test["holiday_description"] = X_test["holiday_description"].astype('category')

In [15]:
X = X.reset_index()
X = X.set_index("date")

X["onpromotion"] = X["onpromotion"].astype('float')
X["total_other_promo_store"] = X["total_other_promo_store"].astype('float')
X["total_other_city_promo"] = X["total_other_city_promo"].astype('float')
X["holiday"] = X["holiday"].astype('float')

X["family"] = X["family"].astype('category')
X["store_nbr"] = X["store_nbr"].astype('category')
X["holiday"] = X["holiday"].astype('category')
X["event"] = X["event"].astype('category')
X["city"] = X["city"].astype('category')
X["state"] = X["state"].astype('category')
X["type"] = X["type"].astype('category')
X["workday"] = X["workday"].astype('category')
X["payday"] = X["payday"].astype('category')
X["holiday_description"] = X["holiday_description"].astype('category')

### Lagged Variables

In [16]:
def make_lags(data, column, lags):
    '''Takes Data and creates lagged features for every catergory'''
    for k in range(1, lags+1):
        data[f"{column}_lag_{k}"] = data.groupby(["store_nbr", "family"])[column].shift(k)

def make_one_year_lag(data, column):
    '''Takes Data and retrieves the values from the previous year'''
    data[f"{column}_one_year_lag"] = data.groupby(["store_nbr", "family", "dayofyear"])[column].shift(1)
    
    # Any after a year is just the result of the store being closed
    data[f"{column}_one_year_lag"] = data[f"{column}_one_year_lag"].fillna(0)

In [17]:
X_lag = cudf.concat([X[["store_nbr", "family", "dayofyear", "onpromotion", "dcoilwtico", "sales"]], X_test[["store_nbr", "family", "onpromotion", "dcoilwtico"]]], axis=0)
X_lag = X_lag.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])

X_lag["dayofyear"] = X_lag.index.dayofyear
        
make_lags(X_lag, "onpromotion", 7)
make_lags(X_lag, "dcoilwtico", 7)

make_one_year_lag(X_lag, "sales")

X_lag = X_lag.drop(["dayofyear", "onpromotion", "dcoilwtico", "sales"], axis=1)

X = X.merge(X_lag, on=["date", "store_nbr", "family"], how="left")
X_test = X_test.merge(X_lag, on=["date", "store_nbr", "family"], how="left")

del X_lag

X["Change_in_oil_prices"] = X["dcoilwtico"]-X["dcoilwtico_lag_1"]
X_test["Change_in_oil_prices"] = X_test["dcoilwtico"]-X_test["dcoilwtico_lag_1"]
X["Change_in_oil_prices"] = X["Change_in_oil_prices"].astype('float')
X_test["Change_in_oil_prices"] = X_test["Change_in_oil_prices"].astype('float')

X["promo_last_7_days"] = X[X.columns[X.columns.str.find("onpromotion_lag")==0]].sum(axis=1)
X_test["promo_last_7_days"] = X_test[X_test.columns[X_test.columns.str.find("onpromotion_lag")==0]].sum(axis=1)
X["promo_last_7_days"] = X["promo_last_7_days"].astype('float')
X_test["promo_last_7_days"] = X_test["promo_last_7_days"].astype('float')

### Final Dataframe

In [18]:
y = X[["store_nbr", "family", "sales"]]
X.drop("sales", axis=1, inplace=True)

X.head()

Unnamed: 0_level_0,id,store_nbr,family,onpromotion,city,state,type,total_other_promo_store,total_other_city_promo,holiday,...,dcoilwtico_lag_1,dcoilwtico_lag_2,dcoilwtico_lag_3,dcoilwtico_lag_4,dcoilwtico_lag_5,dcoilwtico_lag_6,dcoilwtico_lag_7,sales_one_year_lag,Change_in_oil_prices,promo_last_7_days
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-10,17490,5,AUTOMOTIVE,0.0,Santo Domingo,Santo Domingo de los Tsachilas,D,0.0,0.0,0.0,...,93.146207,93.190383,93.084,93.184325,93.363801,93.439028,93.162527,0.0,0.364793,0.0
2013-01-10,17491,5,BABY CARE,0.0,Santo Domingo,Santo Domingo de los Tsachilas,D,0.0,0.0,0.0,...,93.144,93.192838,93.088,93.184348,93.393222,93.417803,93.154527,0.0,0.39,0.0
2013-01-10,17492,5,BEAUTY,0.0,Santo Domingo,Santo Domingo de los Tsachilas,D,0.0,0.0,0.0,...,93.14,93.195292,93.092,93.18437,93.393232,93.496667,93.146527,0.0,0.417,0.0
2013-01-10,17493,5,BEVERAGES,0.0,Santo Domingo,Santo Domingo de los Tsachilas,D,0.0,0.0,0.0,...,93.136,93.21,93.096,93.184393,93.393242,93.496,93.138527,0.0,0.444,0.0
2013-01-10,17494,5,BOOKS,0.0,Santo Domingo,Santo Domingo de los Tsachilas,D,0.0,0.0,0.0,...,93.191429,93.197745,93.1,93.184415,93.393253,93.495333,93.130527,0.0,0.411571,0.0


In [19]:
# # Removing early time with NaNs
X = X.loc[X.index >= "2015-07-01"]
y = y.loc[y.index >= "2015-07-01"]

# X = X.loc[X.index >= "2016-01-01"]
# y = y.loc[y.index >= "2016-01-01"]

## Trainning Model
###  Imports

In [20]:
# Data Preprocessing 
from cuml.dask.preprocessing import OneHotEncoder, LabelEncoder
from cuml.preprocessing import MinMaxScaler, StandardScaler, SimpleImputer, LabelEncoder, OneHotEncoder
from cuml.compose import make_column_transformer
from statsmodels.tsa.deterministic import CalendarFourier
from sklearn.pipeline import Pipeline

# Cross-Validation
from sklearn.model_selection import TimeSeriesSplit

# Models
from sklearn.dummy import DummyRegressor
from cuml.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from cuml.neighbors import KNeighborsRegressor
from cuml.ensemble import RandomForestRegressor
from cuml.metrics import mean_squared_error, mean_squared_log_error
from bayes_opt import BayesianOptimization
from xgboost import XGBRegressor

# Torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from data_wrangling import Prepare_data
from hybrid_timeseries import Hybrid_Pipeline

import warnings

from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events

# Ignore WarningS
warnings.filterwarnings('ignore')

### Pytorch

In [21]:
# Function to perform a basic operation on the GPU and return the result
def basic_gpu_operation(device):
    # Create a random tensor of size (1000, 1000) on the specified device
    x = torch.rand((1000, 1000), device=device)
    # Perform a basic arithmetic operation (e.g., matrix multiplication with its transpose)
    result = torch.matmul(x, x.t())
    # Return the sum of the result to ensure a scalar value is returned
    return result.sum()

# Check if CUDA is available
if torch.cuda.is_available():
    # Print the total number of GPUs detected
    gpu_count = torch.cuda.device_count()
    print(f'Total GPUs detected: {gpu_count}\n')
    # Initialize a list to hold the results from each GPU
    results = []
    # Loop through all available GPUs, print their properties, perform computations, and gather results
    for i in range(gpu_count):
        device = torch.device(f'cuda:{i}')
        gpu_properties = torch.cuda.get_device_properties(i)
        print(f"Device {i}: {gpu_properties.name}")
        print(f"  Total memory: {gpu_properties.total_memory / 1e9} GB")
        print(f"  CUDA Capability: {gpu_properties.major}.{gpu_properties.minor}")
        print(f"  MultiProcessor Count: {gpu_properties.multi_processor_count}")
        print(f'  Performing computation on Device {i}...\n')
        # Perform the basic operation on the GPU and append the result to the results list
        result = basic_gpu_operation(device)
        results.append(result.item())  # Convert to Python number and append
    # Summarize and print the results from each GPU
    print('Summary of Results:')
    for i, result in enumerate(results):
        print(f'Result from GPU {i}: {result:.2f}')
    # Perform some aggregation on the CPU (e.g., compute the average of all results)
    results = cp.array(results)
    average_result = cp.mean(results)
    print(f'\nAverage result from all GPUs: {average_result:.2f}')
    # Optionally, provide a summary of overall GPU utilization or performance here
    # This could involve more detailed metrics based on your specific use case or application
else:
    print("CUDA is not available. Please check your installation.")

Total GPUs detected: 1

Device 0: NVIDIA GeForce RTX 4070
  Total memory: 12.878086144 GB
  CUDA Capability: 8.9
  MultiProcessor Count: 46
  Performing computation on Device 0...

Summary of Results:
Result from GPU 0: 250596928.00

Average result from all GPUs: 250596928.00


In [22]:
class MSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse=nn.MSELoss().to(device)
        
    def forward(self, pred, actual):
        return self.mse(torch.log(pred+1), torch.log(actual + 1))
    
    
class LSTMModel(nn.Module):
    def __init__(self, input_layer, n_hidden_1, n_hidden_2, drop):
        super(LSTMModel, self).__init__()
        
        self.input_layer = input_layer
        self.n_hidden_1 = n_hidden_1
        self.n_hidden_2 = n_hidden_2
        
        # Layers: Linear, LSTM, Linear
        self.linear1 = nn.Linear(input_layer, n_hidden_1)
        self.dropout = nn.Dropout(drop)
        self.lstm = nn.LSTM(n_hidden_1, n_hidden_2, batch_first=True)
        self.linear2 = nn.Linear(n_hidden_2, 1)
        self.ReLU = nn.ReLU()
        
    def forward(self, x):
        
        x = self.linear1(x)
        x = self.dropout(x)
        
        output, (h_t, c_t) = self.lstm(x)
        output = self.dropout(output)
        output = self.linear2(output)
        output = self.ReLU(output)
        return output


class LSTMRegressor():
    def __init__(self, n_hidden=50, n_hidden_2=20, drop=0.2, epochs=100, early_stop=5, lr=0.01, Boosted=False, verbose=False):
        
        self.n_hidden = n_hidden
        self.n_hidden_2 = n_hidden_2
        self.drop = drop
        if Boosted:
            self.criterion = nn.MSELoss().to(device)
        else: 
            self.criterion = MSLELoss()
            
        self.early_stop = early_stop 
        self.epochs = epochs 
        self.lr = lr
        self.min_val_loss = float('inf')
        self.min_val_loss_2 = float('inf')
        self.verbose = verbose
    def train(self, train_loader):
        
        self.model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch, y_batch

            self.optimizer.zero_grad()
            outputs = self.model(x_batch)
            loss = self.criterion(outputs, y_batch)
            loss.backward()
            self.optimizer.step()


    def pred(self, test_loader, valid=False, epoch=0):
        
        self.model.eval()
        if valid:
             
            val_losses = 0
            num = 0
            
            with torch.no_grad():
                for x_batch, y_batch in test_loader:
                    x_batch = x_batch
                    outputs = self.model(x_batch)

                    loss = self.criterion(outputs, y_batch)
                    val_losses=+loss.item()

                    num=+1

            val_loss = val_losses/num

            if val_loss<self.min_val_loss:
            
                self.min_val_loss = val_loss
                self.early_stop_count = 0
            else:
                self.early_stop_count+=1
            
            if self.verbose:
                print(f"Epoch {epoch+1}/{self.epochs}, Validation score of {np.sqrt(val_loss):.4f}")
            if self.early_stop_count>=self.early_stop:
                if self.verbose:
                    print(f"early stopping at Validation Score of {np.sqrt(self.min_val_loss):.4f}")
                    print()
                self.stop = True
            
        else:
            
            with torch.no_grad():
                predictions = []
                for x_batch in test_loader:
                    x_batch = x_batch.to(device)
                    outputs = self.model(x_batch)
                    predictions.append(outputs.cpu().numpy())

                return np.concatenate(predictions)
                
    def fit(self, X, y):
        
        
        if isinstance(X, list):
            X_train, y_train = X[0], y[0]
            self.model = LSTMModel(X_train.shape[1], n_hidden_1=self.n_hidden, n_hidden_2= self.n_hidden_2, drop=self.drop).to(device)
            
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
            train_loader = DataLoader(TensorDataset(X_train.to(device), y_train.to(device)), batch_size=31, shuffle=False)
            
            X_valid, y_valid = X[1], y[1]
            test_loader = DataLoader(TensorDataset(X_valid.to(device), y_valid.to(device)), batch_size=31, shuffle=False)
            
            self.stop=False
            self.early_stop_count =0 
            
            for epoch in range(self.epochs):
                self.train(train_loader)
                
                self.pred(test_loader, valid=True, epoch=epoch)
                if self.stop:
                    break

        else:
                X_train, y_train = X, y
                self.model = LSTMModel(X_train.shape[1], n_hidden_1=self.n_hidden, n_hidden_2= self.n_hidden_2, drop=self.drop).to(device)
                
                self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
                train_loader = DataLoader(TensorDataset(X_train.to(device), y_train.to(device)), batch_size=31, shuffle=False)
                
                for epoch in range(self.epochs):
                    self.train(train_loader)
    
    def predict(self, X):
        
        test_loader = DataLoader(X.to(device), batch_size=31, shuffle=False)
        
        outputs = self.pred(test_loader)
        
        return outputs

### Trainning

In [23]:
# Define the preprocessing steps
numeric_transformer = ["float", StandardScaler()]
categorical_transformer = ["category", OneHotEncoder(sparse=False, handle_unknown='ignore')]

column_list = ["time_since_quake", "time_since_quake_sq"]

#data_preprocessor = Prepare_data(column_list, [numeric_transformer, categorical_transformer])
data_preprocessor = Prepare_data(column_list, [numeric_transformer])

## Linear Regression, XGBoost, Boosted

In [24]:
X_C = X.copy()

X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
# X_C = X_C.set_index(["date"])
# y = y.set_index(["date"])

X_test_C = X_test.copy()
X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
#.drop(["state", "city", "type", "dayofyear", "year"], axis=1)
X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
X_test_C = X_test_C.set_index(["date"])

In [25]:
# def feature_importance(X, y, model):
    
#     dates = X.index.drop_duplicates()
    
#     X_train = X.loc[dates[int(len(dates)*2/3):]]
#     X_valid = X.loc[dates[:int(len(dates)*2/3)]]
    
#     y_train = y.loc[dates[int(len(dates)*2/3):]]
#     y_valid = y.loc[dates[:int(len(dates)*2/3)]]
    
#     model.fit(X_train, y_train)
    
#     del X_train
#     del y_train
    
#     pred = model.predict(X_valid)
#     baseline = float(np.sqrt(mean_squared_log_error(y_valid.sales, pred.sales)))
#     importance_dict = {}
    
#     for i in range(1, len(X_valid.columns)):
        
#         name = X_valid.columns[i]
#         X_shuffle = X_valid.copy()
#         X_shuffle = X_shuffle.to_pandas()
#         X_shuffle[name] = X_shuffle[name].values[np.random.permutation(len(X_shuffle))]
        
#         X_shuffle = cudf.from_pandas(X_shuffle)
#         pred = model.predict(X_shuffle)
        
#         del X_shuffle
        
#         importance_dict[name] = float(np.sqrt(mean_squared_log_error(y_valid.sales, pred.sales))) - baseline
        
        
#     return importance_dict
        
    
# xgb_params = {
#     'tree_method': 'gpu_hist',  # Specify GPU usage
#     'predictor': 'gpu_predictor',
#     'enable_categorical': True,
# }

# xgb = XGBRegressor(**xgb_params)


# lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

# model = Hybrid_Pipeline(data_preprocessor, lr, xgb, Boosted=True, to_tensor=False)

# feature_df = pd.DataFrame.from_dict(feature_importance(X_C, y, model), orient='index', columns=['Change in MSE']).reset_index().sort_values("Change in MSE")
# feature_df = feature_df.rename({"index": "Columns"}, axis=1)
# feature_df.plot.barh(x='Columns', y='Change in MSE', title='Feature Importance', color='blue')

#X_C = X_C[feature_df.loc[feature_df["Change in MSE"]>=0]["Columns"].append(pd.Series(["store_nbr", "family", "id"]), ignore_index=True)]

In [26]:
def Time_Series_CV(model, X_C, y, splits=4, verbose=False):
    # Use time series split for cross validation. 
    cv_split = TimeSeriesSplit(n_splits = splits)
    
    # Create lists to append MSLE scores.
    valid_msle = []
    train_msle = []
    
    # Dates to index through. 
    dates = X_C.index.drop_duplicates()
    a = 0
    
    # Perform Cross-Validation to determine how model will do on unseen data.
    for train_index, valid_index in cv_split.split(dates):

        # Index dates.
        date_train, date_valid = dates[train_index], dates[valid_index]

        # Selecting data for y_train and y_valid.
        y_train = y.loc[date_train]
        y_valid = y.loc[date_valid]

        # Selecting data for X_train and X_valid.
        X_train = X_C.loc[date_train]
        X_valid = X_C.loc[date_valid]

        X_train = X_train.reset_index().sort_values(["store_nbr", "family", "date"])
        X_valid = X_valid.reset_index().sort_values(["store_nbr", "family", "date"])
        X_train = X_train.set_index(["date"])
        X_valid = X_valid.set_index(["date"])

        y_train = y_train.reset_index().sort_values(["store_nbr", "family", "date"])
        y_valid = y_valid.reset_index().sort_values(["store_nbr", "family", "date"])
        y_train = y_train.set_index(["date"])
        y_valid = y_valid.set_index(["date"])


        # Fitting model.
        model.fit(X_train, y_train)

        # Create predictions for Trainning and Validation.
        pred = model.predict(X_valid)

        # MSE for trainning and validation. 
        valid_msle.append(float(mean_squared_log_error(y_valid["sales"], pred["sales"])))
        
        if verbose:
            # Create predictions for Trainning and Validation.
            fit = model.predict(X_train)
        
            # MSE for trainning and validation. 
            train_msle.append(float(mean_squared_log_error(y_train["sales"], fit["sales"])))
            
            a = a+1
            print(f"Fold {a}:") 
            print(f"Training RMSLE: {cp.sqrt(mean_squared_log_error(y_train.sales, fit.sales)):.3f}, Validation RMSLE: {cp.sqrt(mean_squared_log_error(y_valid.sales, pred.sales)):.3f}")
        
    if verbose:
        # Returns the square root of the average of the MSE.
        print("Average Across Folds")
        print(f"Training RMSLE:{np.sqrt(np.mean(train_msle)):.3f}, Validation RMSLE: {np.sqrt(np.mean(valid_msle)):.3f}")
        
    return float(np.sqrt(np.mean(valid_msle)))

In [28]:
def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
                       holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
                       dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
                       onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
                       onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
                       dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
                       dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
    # Convert non-integer arguments to integers
    variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
                        int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
                        int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
                        int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
                        int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
                        int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
                        int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
                        int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    
    X_C = X_C.copy()
    column_to_remove = []
    for i in range(3, X_C.shape[1]):
        if variable_list[i-3]==0:
            column_to_remove.append(X_C.columns[i])
    
    X_C.drop(column_to_remove, axis=1, inplace=True)
            
    xgb_params = {
        'tree_method': 'gpu_hist',  # Specify GPU usage
        'predictor': 'gpu_predictor',
        'enable_categorical': True,}

    model_2 = XGBRegressor(**xgb_params)

    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
        
    list1 = ["time_since_quake", "time_since_quake_sq"]
    list2 = X_C.columns

    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the values in set1 that are not in set2
    uncommon_values = set1 - set2

    # Remove the uncommon values from list1
    list1 = [value for value in list1 if value not in uncommon_values]
    
    numeric_transformer = ["float", StandardScaler()]
    data_preprocessor = Prepare_data(list1, [numeric_transformer])
    
    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=False)
    
    return -Time_Series_CV(model, X_C, y)

# Define the parameter space for Bayesian optimization (each feature is a parameter)
params = {X_C.columns[i]: (0, 1) for i in range(3, X_C.shape[1])}

# Initialize the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=params,
    random_state=1,  # For reproducibility
)

logger = JSONLogger(path="./Logs/logs.log1")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log1"])
except:
    pass

# Perform the optimization
optimizer.maximize(init_points=30, n_iter=70)

variables = list(optimizer.max["params"].values())
variables = [True, True, True] + [x>0.5 for x in variables]
X_C = X_C[X_C.columns[variables]]

list1 = ["time_since_quake", "time_since_quake_sq"]
list2 = X_C.columns

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the values in set1 that are not in set2
uncommon_values = set1 - set2

# Remove the uncommon values from list1
list1 = [value for value in list1 if value not in uncommon_values]

numeric_transformer = ["float", StandardScaler()]
data_preprocessor = Prepare_data(list1, [numeric_transformer])

In [29]:
def hyperparameter_optimization(n_estimators, gamma, subsample, max_depth, learning_rate):
    
    
    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    
    params = {
    'tree_method': 'gpu_hist',  # Specify GPU usage
    'predictor': 'gpu_predictor',
    'enable_categorical': True,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'gamma': gamma,
    'subsample': subsample}

    model_2 = XGBRegressor(**params)

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

parambounds = {
    'learning_rate': (0.00001, 1),
    'n_estimators': (0, 1000),
    'max_depth': (3,12),
    'subsample': (0, 1.0),  
    'gamma': (1, 10),
    
}

optimizer = BayesianOptimization(
    f=hyperparameter_optimization,
    pbounds=parambounds,
    random_state=1,
)

logger = JSONLogger(path="./Logs/logs.log2")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log2"])
except:
    pass

optimizer.maximize(init_points=20, n_iter=50)
print(optimizer.max)

{'target': -0.7200944000389895, 'params': {'gamma': 1.8341142880593395, 'learning_rate': 0.022189817637013276, 'max_depth': 11.75055013991161, 'n_estimators': 841.6284665485007, 'subsample': 0.6573582590545632}}


In [30]:
params = optimizer.max["params"]

xgb_params = {
    'tree_method': 'gpu_hist',  # Specify GPU usage
    'predictor': 'gpu_predictor',
    'enable_categorical': True,
    'max_depth': int(params["max_depth"]),
    'learning_rate': params["learning_rate"],
    'n_estimators': int(params["n_estimators"]),
    'gamma': params["gamma"],
    'subsample': params["subsample"]
}

xgb = XGBRegressor(**xgb_params)


lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

model = Hybrid_Pipeline(data_preprocessor, lr, xgb, Boosted=True, to_tensor=False)

e_1 = 1/Time_Series_CV(model, X_C, y, verbose=True)

Fold 1:
Training RMSLE: 0.871, Validation RMSLE: 0.754
Fold 2:
Training RMSLE: 0.806, Validation RMSLE: 0.750
Fold 3:
Training RMSLE: 0.842, Validation RMSLE: 0.746
Fold 4:
Training RMSLE: 0.761, Validation RMSLE: 0.622
Average Across Folds
Training RMSLE:0.821, Validation RMSLE: 0.720


In [31]:
# Fit Model
model = Hybrid_Pipeline(data_preprocessor, lr, xgb, Boosted=True, to_tensor=False)
model.fit(X_C, y)

X_test_C = X_test_C[X_test_C.columns[variables]]
pred_1 = model.predict(X_test_C)

## Linear Regression, XGBoost, Stacked

In [32]:
X_C = X.copy()

X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
# X_C = X_C.set_index(["date"])
# y = y.set_index(["date"])

X_test_C = X_test.copy()
X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
#.drop(["state", "city", "type", "dayofyear", "year"], axis=1)
X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
X_test_C = X_test_C.set_index(["date"])

In [33]:
def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
                       holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
                       dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
                       onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
                       onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
                       dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
                       dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
    # Convert non-integer arguments to integers
    # Convert non-integer arguments to integers
    variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
                        int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
                        int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
                        int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
                        int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
                        int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
                        int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
                        int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    X_C = X_C.copy()
    column_to_remove = []
    for i in range(3, X_C.shape[1]):
        if variable_list[i-3]==0:
            column_to_remove.append(X_C.columns[i])
    
    X_C.drop(column_to_remove, axis=1, inplace=True)
            
    xgb_params = {
        'tree_method': 'gpu_hist',  # Specify GPU usage
        'predictor': 'gpu_predictor',
        'enable_categorical': True,}

    model_2 = XGBRegressor(**xgb_params)

    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

    list1 = ["time_since_quake", "time_since_quake_sq"]
    list2 = X_C.columns

    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the values in set1 that are not in set2
    uncommon_values = set1 - set2

    # Remove the uncommon values from list1
    list1 = [value for value in list1 if value not in uncommon_values]
    
    numeric_transformer = ["float", StandardScaler()]
    data_preprocessor = Prepare_data(list1, [numeric_transformer])
    
    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=False, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

# Define the parameter space for Bayesian optimization (each feature is a parameter)
params = {X_C.columns[i]: (False, True) for i in range(3, X_C.shape[1])}

# Initialize the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=params,
    random_state=1,  # For reproducibility
)

logger = JSONLogger(path="./Logs/logs.log3")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log3"])
except:
    pass

# Perform the optimization
optimizer.maximize(init_points=30, n_iter=70)

variables = list(optimizer.max["params"].values())
variables = [True, True, True] + [x>0.5 for x in variables]
X_C = X_C[X_C.columns[variables]]

list1 = ["time_since_quake", "time_since_quake_sq"]
list2 = X_C.columns

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the values in set1 that are not in set2
uncommon_values = set1 - set2

# Remove the uncommon values from list1
list1 = [value for value in list1 if value not in uncommon_values]

numeric_transformer = ["float", StandardScaler()]
data_preprocessor = Prepare_data(list1, [numeric_transformer])

In [34]:
def hyperparameter_optimization(n_estimators, gamma, subsample, max_depth, learning_rate):
    
    
    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    
    params = {
    'tree_method': 'gpu_hist',  # Specify GPU usage
    'predictor': 'gpu_predictor',
    'enable_categorical': True,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'n_estimators': n_estimators,
    'gamma': gamma,
    'subsample': subsample}

    model_2 = XGBRegressor(**params)
  
    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=False, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

parambounds = {
    'learning_rate': (0.00001, 1),
    'n_estimators': (0, 500),
    'max_depth': (3,12),
    'subsample': (0.0001, 1.0),  
    'gamma': (3, 8),
    
}

optimizer = BayesianOptimization(
    f=hyperparameter_optimization,
    pbounds=parambounds,
    random_state=1,
)

logger = JSONLogger(path="./Logs/logs.log4")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log4"])
except:
    pass

optimizer.maximize(init_points=20, n_iter=50,)
print(optimizer.max)

{'target': -0.6686000891811915, 'params': {'gamma': 3.817714029576126, 'learning_rate': 0.3625283226197647, 'max_depth': 11.818925416760282, 'n_estimators': 7.132832399346034, 'subsample': 0.8966917484334722}}


In [35]:
params = optimizer.max["params"]

xgb_params = {
    'tree_method': 'gpu_hist',  # Specify GPU usage
    'predictor': 'gpu_predictor',
    'enable_categorical': True,
    'max_depth': int(params["max_depth"]),
    'learning_rate': params["learning_rate"],
    'n_estimators': int(params["n_estimators"]),
    'gamma': params["gamma"],
    'subsample': params["subsample"]
}

xgb = XGBRegressor(**xgb_params)


lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

model = Hybrid_Pipeline(data_preprocessor, lr, xgb, Boosted=False, to_tensor=False)

e_2 = 1/Time_Series_CV(model, X_C, y, verbose=True)

Fold 1:
Training RMSLE: 0.612, Validation RMSLE: 0.694
Fold 2:
Training RMSLE: 0.642, Validation RMSLE: 0.711
Fold 3:
Training RMSLE: 0.634, Validation RMSLE: 0.686
Fold 4:
Training RMSLE: 0.628, Validation RMSLE: 0.575
Average Across Folds
Training RMSLE:0.629, Validation RMSLE: 0.669


In [36]:
# Fit Model
model = Hybrid_Pipeline(data_preprocessor, lr, xgb, Boosted=False, to_tensor=False)
model.fit(X_C, y)

X_test_C = X_test_C[X_test_C.columns[variables]]
pred_2 = model.predict(X_test_C)

## Linear Regression, Random Forest Regressor, Boosted

In [37]:
X_C = X.copy()

X_C["family"] = X_C["family"].cat.codes
X_C["store_nbr"] = X_C["store_nbr"].cat.codes
X_C["holiday"] = X_C["holiday"].cat.codes
X_C["event"] = X["event"].cat.codes
X_C["city"] = X_C["city"].cat.codes
X_C["state"] = X_C["state"].cat.codes
X_C["type"] = X_C["type"].cat.codes
X_C["payday"] = X_C["payday"].cat.codes
X_C["workday"] = X_C["workday"].cat.codes
X_C["holiday_description"] = X_C["holiday_description"].cat.codes

X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])

X_test_C = X_test.copy()
X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
X_test_C = X_test_C.set_index(["date"])

In [38]:
def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
                       holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
                       dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
                       onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
                       onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
                       dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
                       dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
    # Convert non-integer arguments to integers
    variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
                        int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
                        int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
                        int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
                        int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
                        int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
                        int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
                        int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    
    X_C = X_C.copy()
    column_to_remove = []
    for i in range(3, X_C.shape[1]):
        if variable_list[i-3]==0:
            column_to_remove.append(X_C.columns[i])
    
    X_C.drop(column_to_remove, axis=1, inplace=True)

    model_2 = RandomForestRegressor()

    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    list1 = ["time_since_quake", "time_since_quake_sq"]
    list2 = X_C.columns

    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the values in set1 that are not in set2
    uncommon_values = set1 - set2

    # Remove the uncommon values from list1
    list1 = [value for value in list1 if value not in uncommon_values]
    
    numeric_transformer = ["float", StandardScaler()]
    categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
    data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

# Define the parameter space for Bayesian optimization (each feature is a parameter)
params = {X_C.columns[i]: (False, True) for i in range(3, X_C.shape[1])}

# Initialize the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=params,
    random_state=1,  # For reproducibility
)

logger = JSONLogger(path="./Logs/logs.log5")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log5"])
except:
    pass

# Perform the optimization
optimizer.maximize(init_points=30, n_iter=70)

variables = list(optimizer.max["params"].values())
variables = [True, True, True] + [x>0.5 for x in variables]
X_C = X_C[X_C.columns[variables]]

list1 = ["time_since_quake", "time_since_quake_sq"]
list2 = X_C.columns

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the values in set1 that are not in set2
uncommon_values = set1 - set2

# Remove the uncommon values from list1
list1 = [value for value in list1 if value not in uncommon_values]

numeric_transformer = ["float", StandardScaler()]
categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    


KeyboardInterrupt



In [None]:
def hyperparameter_optimization(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap, max_samples, criterion):
    
    
    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    
    if max_features>1:
        if max_features>2:
            max_features = 'sqrt'
        else:
            max_features = 'auto'
    else:
        max_features = 'log2'

    if criterion>1:
        criterion = 'mse'
    else:
        criterion = 'mae'
    
    params = {
    'n_estimators': int(round(n_estimators)),
    'max_depth': int(round(max_depth)),
    'min_samples_split': int(round(min_samples_split)),
    'min_samples_leaf': int(round(min_samples_leaf)),
    'max_features': int(round(max_features)),
    'bootstrap': int(round(bootstrap)),
    'max_samples': max_samples,
    'criterion': criterion}

    model_2 = RandomForestRegressor(**params)
    
    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=False)

    return -Time_Series_CV(model, X_C, y, verbose=True)

param_bounds = {
    'n_estimators': (100, 1000),  # Number of trees in the forest
    'max_depth': (3, 20),  # Maximum depth of the trees
    'min_samples_split': (2, 20),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': (1, 20),  # Minimum number of samples required to be at a leaf node
    'max_features': (0, 3),  # Number of features to consider when looking for the best split
    'bootstrap': (True, False),  # Whether bootstrap samples are used when building trees
    'max_samples': (0.1, 1.0),  # Number of samples to draw from X to train each base estimator
    'criterion': (0, 1),  # The function used to measure the quality of a split
}
    
optimizer = BayesianOptimization(
    f=hyperparameter_optimization,
    pbounds=parambounds,
    random_state=1,
)

logger = JSONLogger(path="./Logs/logs.log6")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log6"])
except:
    pass

optimizer.maximize(init_points=20, n_iter=50,)
print(optimizer.max)

In [None]:
params = optimizer.max["params"]

if params[4]>1:
    if params[4]>2:
        max_features = 'sqrt'
    else:
        max_features = 'auto'
else:
    max_features = 'log2'
    
if params[7]>1:
    criterion = 'mse'
else:
    criterion = 'mae'
    
rfr_params = {
    'n_estimators': int(round(params[0])),  # Number of trees in the forest
    'max_depth': int(round(params[1])),  # Maximum depth of the trees
    'min_samples_split': int(round(params[2])),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': int(round(params[3])),  # Minimum number of samples required to be at a leaf node
    'max_features': max_features,  # Number of features to consider when looking for the best split
    'bootstrap': int(round(params[5])),  # Whether bootstrap samples are used when building trees
    'max_samples': params[6],  # Number of samples to draw from X to train each base estimator
    'criterion': criterion,  # The function used to measure the quality of a split
}

rfr = RandomForestRegressor(**rfr_params)

lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

model = Hybrid_Pipeline(data_preprocessor, lr, rfr, Boosted=True, to_tensor=False)

e_3 = 1/Time_Series_CV(model, X_C, y, verbose=True)

In [None]:
# Fit Model
model = Hybrid_Pipeline(data_preprocessor, lr, rfr, Boosted=True, to_tensor=False)
model.fit(X_C, y)

X_test_C = X_test_C[X_test_C.columns[variables]]
pred_3 = model.predict(X_test_C)

## Linear Regression, Random Forest Regressor, Stacked

In [None]:
X_C = X.copy()

X_C["family"] = X_C["family"].cat.codes
X_C["store_nbr"] = X_C["store_nbr"].cat.codes
X_C["holiday"] = X_C["holiday"].cat.codes
X_C["event"] = X["event"].cat.codes
X_C["city"] = X_C["city"].cat.codes
X_C["state"] = X_C["state"].cat.codes
X_C["type"] = X_C["type"].cat.codes
X_C["payday"] = X_C["payday"].cat.codes
X_C["workday"] = X_C["workday"].cat.codes
X_C["holiday_description"] = X_C["holiday_description"].cat.codes

X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])

X_test_C = X_test.copy()
X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
X_test_C = X_test_C.set_index(["date"])

In [None]:
def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
                       holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
                       dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
                       onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
                       onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
                       dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
                       dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
    # Convert non-integer arguments to integers
    variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
                        int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
                        int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
                        int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
                        int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
                        int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
                        int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
                        int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    
    X_C = X_C.copy()
    column_to_remove = []
    for i in range(3, X_C.shape[1]):
        if variable_list[i-3]==0:
            column_to_remove.append(X_C.columns[i])
    
    X_C.drop(column_to_remove, axis=1, inplace=True)

    model_2 = RandomForestRegressor()

    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    list1 = ["time_since_quake", "time_since_quake_sq"]
    list2 = X_C.columns

    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the values in set1 that are not in set2
    uncommon_values = set1 - set2

    # Remove the uncommon values from list1
    list1 = [value for value in list1 if value not in uncommon_values]
    
    numeric_transformer = ["float", StandardScaler()]
    categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
    data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=False, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

# Define the parameter space for Bayesian optimization (each feature is a parameter)
params = {X_C.columns[i]: (0, 1) for i in range(3, X_C.shape[1])}

# Initialize the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=params,
    random_state=1,  # For reproducibility
)

logger = JSONLogger(path="./Logs/logs.log7")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log7"])
except:
    pass

# Perform the optimization
optimizer.maximize(init_points=30, n_iter=70)

variables = list(optimizer.max["params"].values())
variables = [True, True, True] + [x>0.5 for x in variables]
X_C = X_C[X_C.columns[variables]]

list1 = ["time_since_quake", "time_since_quake_sq"]
list2 = X_C.columns

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the values in set1 that are not in set2
uncommon_values = set1 - set2

# Remove the uncommon values from list1
list1 = [value for value in list1 if value not in uncommon_values]

numeric_transformer = ["float", StandardScaler()]
categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    

In [None]:
def hyperparameter_optimization(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features, bootstrap, max_samples, criterion):
    
    
    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    
    if max_features>1:
        if max_features>2:
            max_features = 'sqrt'
        else:
            max_features = 'auto'
    else:
        max_features = 'log2'

    if criterion>.5:
        criterion = 'mse'
    else:
        criterion = 'mae'
    
    params = {
    'n_estimators': int(round(n_estimators)),
    'max_depth': int(round(max_depth)),
    'min_samples_split': int(round(min_samples_split)),
    'min_samples_leaf': int(round(min_samples_leaf)),
    'max_features': max_features,
    'bootstrap': int(round(bootstrap)),
    'max_samples': max_samples,
    'criterion': criterion}

    model_2 = RandomForestRegressor(**params)

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=False, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

param_bounds = {
    'n_estimators': (100, 1000),  # Number of trees in the forest
    'max_depth': (3, 20),  # Maximum depth of the trees
    'min_samples_split': (2, 20),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': (1, 20),  # Minimum number of samples required to be at a leaf node
    'max_features': (0, 3),  # Number of features to consider when looking for the best split
    'bootstrap': (True, False),  # Whether bootstrap samples are used when building trees
    'max_samples': (0.1, 1.0),  # Number of samples to draw from X to train each base estimator
    'criterion': (0, 1),  # The function used to measure the quality of a split
}
    
optimizer = BayesianOptimization(
    f=hyperparameter_optimization,
    pbounds=parambounds,
    random_state=1,
)

logger = JSONLogger(path="./Logs/logs.log8")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log8"])
except:
    pass

optimizer.maximize(init_points=20, n_iter=50,)
print(optimizer.max)

In [None]:
params = optimizer.max["params"]

if params[4]>1:
    if params[4]>2:
        max_features = 'sqrt'
    else:
        max_features = 'auto'
else:
    max_features = 'log2'
    
if params[7]>.5:
    criterion = 'mse'
else:
    criterion = 'mae'
    
rfr_params = {
    'n_estimators': int(round(params[0])),  # Number of trees in the forest
    'max_depth': int(round(params[1])),  # Maximum depth of the trees
    'min_samples_split': int(round(params[2])),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': int(round(params[3])),  # Minimum number of samples required to be at a leaf node
    'max_features': max_features,  # Number of features to consider when looking for the best split
    'bootstrap': int(round(params[5])),  # Whether bootstrap samples are used when building trees
    'max_samples': params[6],  # Number of samples to draw from X to train each base estimator
    'criterion': criterion,  # The function used to measure the quality of a split
}

rfr = RandomForestRegressor(**rfr_params)

lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

model = Hybrid_Pipeline(data_preprocessor, lr, rfr, Boosted=False, to_tensor=False)

e_4 = 1/Time_Series_CV(model, X_C, y, verbose=True)

In [None]:
# Fit Model
model = Hybrid_Pipeline(data_preprocessor, lr, rfr, Boosted=False, to_tensor=False)
model.fit(X_C, y)

X_test_C = X_test_C[X_test_C.columns[variables]]
pred_4 = model.predict(X_test_C)

## Linear Regression, K-NN Regressor, Boosted

In [None]:
X_C = X.copy()

X_C["family"] = X_C["family"].cat.codes
X_C["store_nbr"] = X_C["store_nbr"].cat.codes
X_C["holiday"] = X_C["holiday"].cat.codes
X_C["event"] = X["event"].cat.codes
X_C["city"] = X_C["city"].cat.codes
X_C["state"] = X_C["state"].cat.codes
X_C["type"] = X_C["type"].cat.codes
X_C["payday"] = X_C["payday"].cat.codes
X_C["workday"] = X_C["workday"].cat.codes
X_C["holiday_description"] = X_C["holiday_description"].cat.codes

X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])

X_test_C = X_test.copy()
X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
X_test_C = X_test_C.set_index(["date"])

In [None]:
def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
                       holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
                       dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
                       onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
                       onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
                       dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
                       dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
    # Convert non-integer arguments to integers
    variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
                        int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
                        int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
                        int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
                        int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
                        int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
                        int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
                        int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    
    X_C = X_C.copy()
    column_to_remove = []
    for i in range(3, X_C.shape[1]):
        if variable_list[i-3]==0:
            column_to_remove.append(X_C.columns[i])
    
    X_C.drop(column_to_remove, axis=1, inplace=True)

    model_2 = KNeighborsRegressor()

    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    list1 = ["time_since_quake", "time_since_quake_sq"]
    list2 = X_C.columns

    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the values in set1 that are not in set2
    uncommon_values = set1 - set2

    # Remove the uncommon values from list1
    list1 = [value for value in list1 if value not in uncommon_values]
    
    numeric_transformer = ["float", StandardScaler()]
    categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
    data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    
    
    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

# Define the parameter space for Bayesian optimization (each feature is a parameter)
params = {X_C.columns[i]: (0, 1) for i in range(3, X_C.shape[1])}

# Initialize the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=params,
    random_state=1,  # For reproducibility
)

logger = JSONLogger(path="./Logs/logs.log9")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log9"])
except:
    pass

# Perform the optimization
optimizer.maximize(init_points=30, n_iter=70)

variables = list(optimizer.max["params"].values())
variables = [True, True, True] + [x>0.5 for x in variables]
X_C = X_C[X_C.columns[variables]]

list1 = ["time_since_quake", "time_since_quake_sq"]
list2 = X_C.columns

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the values in set1 that are not in set2
uncommon_values = set1 - set2

# Remove the uncommon values from list1
list1 = [value for value in list1 if value not in uncommon_values]

numeric_transformer = ["float", StandardScaler()]
categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    

In [None]:
def hyperparameter_optimization(n_neighbors, weights, algorithm, leaf_size, metric):
    
    
    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    if metric>1:
        if metric>2:
            metric = 'euclidean'
        else:
            metric = 'manhattan'
    else:
        metric = 'minkowski'


    if algorithm>1:
        if algorithm>2:
            algorithm = 'auto'
        else:
            if algorithm>3:
                algorithm = 'ball_tree'
            else:
                algorithm = 'kd_tree'
    else:
        algorithm = 'brute'


    if weights>.5:
        weights = 'uniform'
    else:
        weights = 'distance'
    
    param = {
    'n_neighbors': int(round(n_neighbors)),
    'weights': weights,
    'algorithm': algorithm,
    'leaf_size': int(round(leaf_size)),
    'metric': metric}

    model_2 = KNeighborsRegressor(**params)

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

param_bounds = {
    'n_neighbors': (1, 20),
    'weights': (0, 1),
    'algorithm': (0, 4),
    'leaf_size': (10, 50),
    'metric': (0, 3),
}
    
optimizer = BayesianOptimization(
    f=hyperparameter_optimization,
    pbounds=parambounds,
    random_state=1,
)

logger = JSONLogger(path="./Logs/logs.log10")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log10"])
except:
    pass

optimizer.maximize(init_points=20, n_iter=50,)
print(optimizer.max)

In [None]:
params = optimizer.max["params"]

if params[4]>1:
    if params[4]>2:
        metric = 'euclidean'
    else:
        metric = 'manhattan'
else:
    metric = 'minkowski'


if params[2]>1:
    if params[2]>2:
        algorithm = 'auto'
    else:
        if params[2]>3:
            algorithm = 'ball_tree'
        else:
            algorithm = 'kd_tree'
else:
    algorithm = 'brute'


if params[1]>.5:
    weights = 'uniform'
else:
    weights = 'distance'

param = {'n_neighbors': int(round(params[0])),
        'weights': weights,
        'algorithm': algorithm,
        'leaf_size': int(round(params[3])),
        'metric': metric}

knn = KNeighborsRegressor(**knn_params)

lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

model = Hybrid_Pipeline(data_preprocessor, lr, knn, Boosted=True, to_tensor=False)
 
e_5 = 1/Time_Series_CV(model, X_C, y, verbose=True)

In [None]:
# Fit Model
model = Hybrid_Pipeline(data_preprocessor, lr, knn, Boosted=True, to_tensor=False)
model.fit(X_C, y)

X_test_C = X_test_C[X_test_C.columns[variables]]
pred_5 = model.predict(X_test_C)

## Linear Regression, K-NN Regressor, Stacked

In [None]:
X_C = X.copy()

X_C["family"] = X_C["family"].cat.codes
X_C["store_nbr"] = X_C["store_nbr"].cat.codes
X_C["holiday"] = X_C["holiday"].cat.codes
X_C["event"] = X["event"].cat.codes
X_C["city"] = X_C["city"].cat.codes
X_C["state"] = X_C["state"].cat.codes
X_C["type"] = X_C["type"].cat.codes
X_C["payday"] = X_C["payday"].cat.codes
X_C["workday"] = X_C["workday"].cat.codes
X_C["holiday_description"] = X_C["holiday_description"].cat.codes

X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])

X_test_C = X_test.copy()
X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
X_test_C = X_test_C.set_index(["date"])

In [None]:
def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
                       holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
                       dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
                       onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
                       onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
                       dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
                       dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
    # Convert non-integer arguments to integers
    variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
                        int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
                        int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
                        int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
                        int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
                        int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
                        int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
                        int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    
    X_C = X_C.copy()
    column_to_remove = []
    for i in range(3, X_C.shape[1]):
        if variable_list[i-3]==0:
            column_to_remove.append(X_C.columns[i])
    
    X_C.drop(column_to_remove, axis=1, inplace=True)

    model_2 = KNeighborsRegressor()

    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    list1 = ["time_since_quake", "time_since_quake_sq"]
    list2 = X_C.columns

    # Convert lists to sets
    set1 = set(list1)
    set2 = set(list2)

    # Find the values in set1 that are not in set2
    uncommon_values = set1 - set2

    # Remove the uncommon values from list1
    list1 = [value for value in list1 if value not in uncommon_values]
    
    numeric_transformer = ["float", StandardScaler()]
    categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
    data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=False, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

# Define the parameter space for Bayesian optimization (each feature is a parameter)
params = {X_C.columns[i]: (0, 1) for i in range(3, X_C.shape[1])}

# Initialize the Bayesian optimizer
optimizer = BayesianOptimization(
    f=objective_function,
    pbounds=params,
    random_state=1,  # For reproducibility
)

logger = JSONLogger(path="./Logs/logs.log11")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log11"])
except:
    pass

# Perform the optimization
optimizer.maximize(init_points=30, n_iter=70)

variables = list(optimizer.max["params"].values())
variables = [True, True, True] + [x>0.5 for x in variables]
X_C = X_C[X_C.columns[variables]]

list1 = ["time_since_quake", "time_since_quake_sq"]
list2 = X_C.columns

# Convert lists to sets
set1 = set(list1)
set2 = set(list2)

# Find the values in set1 that are not in set2
uncommon_values = set1 - set2

# Remove the uncommon values from list1
list1 = [value for value in list1 if value not in uncommon_values]

numeric_transformer = ["float", StandardScaler()]
categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    

In [None]:
def hyperparameter_optimization(n_neighbors, weights, algorithm, leaf_size, metric):
    
    
    model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    if metric>1:
        if metric>2:
            metric = 'euclidean'
        else:
            metric = 'manhattan'
    else:
        metric = 'minkowski'


    if algorithm>1:
        if algorithm>2:
            algorithm = 'auto'
        else:
            if algorithm>3:
                algorithm = 'ball_tree'
            else:
                algorithm = 'kd_tree'
    else:
        algorithm = 'brute'


    if weights>.5:
        weights = 'uniform'
    else:
        weights = 'distance'
    
    param = {
    'n_neighbors': int(round(n_neighbors)),
    'weights': weights,
    'algorithm': algorithm,
    'leaf_size': int(round(leaf_size)),
    'metric': metric}

    model_2 = KNeighborsRegressor(**params)

    model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=False, to_tensor=False)

    return -Time_Series_CV(model, X_C, y)

param_bounds = {
    'n_neighbors': (1, 20),
    'weights': (0, 1),
    'algorithm': (0, 4),
    'leaf_size': (10, 50),
    'metric': (0, 3),
}
    
optimizer = BayesianOptimization(
    f=hyperparameter_optimization,
    pbounds=parambounds,
    random_state=1,
)

logger = JSONLogger(path="./Logs/logs.log12")

optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

try:
    load_logs(optimizer, logs["./Logs/logs.log12"])
except:
    pass

optimizer.maximize(init_points=20, n_iter=50,)
print(optimizer.max)

In [None]:
params = optimizer.max["params"]

if params[4]>1:
    if params[4]>2:
        metric = 'euclidean'
    else:
        metric = 'manhattan'
else:
    metric = 'minkowski'


if params[2]>1:
    if params[2]>2:
        algorithm = 'auto'
    else:
        if params[2]>3:
            algorithm = 'ball_tree'
        else:
            algorithm = 'kd_tree'
else:
    algorithm = 'brute'


if params[1]>.5:
    weights = 'uniform'
else:
    weights = 'distance'

param = {'n_neighbors': int(round(params[0])),
        'weights': weights,
        'algorithm': algorithm,
        'leaf_size': int(round(params[3])),
        'metric': metric}

knn = KNeighborsRegressor(**knn_params)


lr = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

model = Hybrid_Pipeline(data_preprocessor, lr, knn, Boosted=False, to_tensor=False)

e_6 = 1/Time_Series_CV(model, X_C, y, verbose=True)

In [None]:
# Fit Model
model = Hybrid_Pipeline(data_preprocessor, lr, knn, Boosted=True, to_tensor=False)
model.fit(X_C, y)

X_test_C = X_test_C[X_test_C.columns[variables]]
pred_6 = model.predict(X_test_C)

## Linear Regression, LSTM Regressor, Boosted

In [None]:
# X_C = X.copy()

# X_C["family"] = X_C["family"].cat.codes
# X_C["store_nbr"] = X_C["store_nbr"].cat.codes
# X_C["holiday"] = X_C["holiday"].cat.codes
# X_C["event"] = X["event"].cat.codes
# X_C["city"] = X_C["city"].cat.codes
# X_C["state"] = X_C["state"].cat.codes
# X_C["type"] = X_C["type"].cat.codes
# X_C["payday"] = X_C["payday"].cat.codes
# X_C["workday"] = X_C["workday"].cat.codes
# X_C["holiday_description"] = X_C["holiday_description"].cat.codes

# X_C = X_C[["id", "store_nbr", "family"] + sorted(set(X_C.columns)-set(["id", "store_nbr", "family"]))]

# X_C = X_C.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])
# y = y.reset_index().sort_values(["store_nbr", "family", "date"]).set_index(["date"])

# X_test_C = X_test.copy()
# X_test_C = X_test_C[["id", "store_nbr", "family"] + sorted(set(X_test_C.columns)-set(["id", "store_nbr", "family"]))]
# X_test_C = X_test_C.reset_index().sort_values(["store_nbr", "family", "date"])
# X_test_C = X_test_C.set_index(["date"])

In [None]:
# def objective_function(onpromotion, total_other_promo_store, total_other_city_promo, holiday,
#                        holiday_description, event, time_since_quake, time_since_quake_sq, state, city, 
#                        dcoilwtico, day, month, workday, payday, onpromotion_lag_1, type, dayofyear, year,
#                        onpromotion_lag_2, onpromotion_lag_3, onpromotion_lag_4, onpromotion_lag_5,
#                        onpromotion_lag_6, onpromotion_lag_7, dcoilwtico_lag_1, dcoilwtico_lag_2, 
#                        dcoilwtico_lag_3, dcoilwtico_lag_4, dcoilwtico_lag_5, dcoilwtico_lag_6, 
#                        dcoilwtico_lag_7, sales_one_year_lag, Change_in_oil_prices, promo_last_7_days, X_C=X_C, y=y):
    
#     # Convert non-integer arguments to integers
#     variable_list = [int(round(Change_in_oil_prices)), int(round(city)),int(round(day)), int(round(dayofyear)), int(round(dcoilwtico)),
#                         int(round(dcoilwtico_lag_1)), int(round(dcoilwtico_lag_2)), int(round(dcoilwtico_lag_3)), int(round(dcoilwtico_lag_4)),
#                         int(round(dcoilwtico_lag_5)), int(round(dcoilwtico_lag_6)), int(round(dcoilwtico_lag_7)), int(round(event)),
#                         int(round(holiday)), int(round(holiday_description)), int(round(month)), int(round(onpromotion)),int(round(onpromotion_lag_1)),
#                         int(round(onpromotion_lag_2)), int(round(onpromotion_lag_3)), int(round(onpromotion_lag_4)), int(round(onpromotion_lag_5)),
#                         int(round(onpromotion_lag_6)), int(round(onpromotion_lag_7)), int(round(payday)), int(round(promo_last_7_days)),
#                         int(round(sales_one_year_lag)), int(round(state)), int(round(time_since_quake)), int(round(time_since_quake_sq)),
#                         int(round(total_other_city_promo)), int(round(total_other_promo_store)), int(round(type)), int(round(workday)), int(round(year))]
    
#     X_C = X_C.copy()
#     column_to_remove = []
#     for i in range(3, X_C.shape[1]):
#         if variable_list[i-3]==0:
#             column_to_remove.append(X_C.columns[i])
    
#     X_C.drop(column_to_remove, axis=1, inplace=True)

#     model_2 = LSTMRegressor(Boosted=True)

#     model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
#     # Use time series split for cross validation. 
#     cv_split = TimeSeriesSplit(n_splits = 4)
    
#     # Create lists to append MSLE scores.
#     valid_msle = []
    
#     # Dates to index through. 
#     dates = X_C.index.drop_duplicates()
    
    
#     list1 = ["time_since_quake", "time_since_quake_sq"]
#     list2 = X_C.columns

#     # Convert lists to sets
#     set1 = set(list1)
#     set2 = set(list2)

#     # Find the values in set1 that are not in set2
#     uncommon_values = set1 - set2

#     # Remove the uncommon values from list1
#     list1 = [value for value in list1 if value not in uncommon_values]
    
#     numeric_transformer = ["float", StandardScaler()]
#     categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
#     data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    
    
#     # Perform Cross-Validation to determine how model will do on unseen data.
#     for train_index, valid_index in cv_split.split(dates):

#         model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=True)

#         # Index dates.
#         date_train, date_valid = dates[train_index], dates[valid_index]

#         # Selecting data for y_train and y_valid.
#         y_train = y.loc[date_train]
#         y_valid = y.loc[date_valid]

#         # Selecting data for X_train and X_valid.
#         X_train = X_C.loc[date_train]
#         X_valid = X_C.loc[date_valid]

#         X_train = X_train.reset_index().sort_values(["store_nbr", "family", "date"])
#         X_valid = X_valid.reset_index().sort_values(["store_nbr", "family", "date"])
#         X_train = X_train.set_index(["date"])
#         X_valid = X_valid.set_index(["date"])

#         y_train = y_train.reset_index().sort_values(["store_nbr", "family", "date"])
#         y_valid = y_valid.reset_index().sort_values(["store_nbr", "family", "date"])
#         y_train = y_train.set_index(["date"])
#         y_valid = y_valid.set_index(["date"])


#         # Fitting model.
#         model.fit(X_train, y_train)

#         # Create predictions for Trainning and Validation.
#         pred = model.predict(X_valid)

#         # MSE for trainning and validation. 
#         valid_msle.append(float(mean_squared_log_error(y_valid["sales"], pred["sales"])))


#     return -float(np.sqrt(np.mean(valid_msle)))

# # Define the parameter space for Bayesian optimization (each feature is a parameter)
# params = {X_C.columns[i]: (0, 1) for i in range(3, X_C.shape[1])}

# # Initialize the Bayesian optimizer
# optimizer = BayesianOptimization(
#     f=objective_function,
#     pbounds=params,
#     random_state=1,  # For reproducibility
# )

# # Perform the optimization
# optimizer.maximize(init_points=30, n_iter=100)

# variables = list(optimizer.max["params"].values())
# variables = [True, True, True] + [x>0.5 for x in variables]
# X_C = X_C[X_C.columns[variables]]

# list1 = ["time_since_quake", "time_since_quake_sq"]
# list2 = X_C.columns

# # Convert lists to sets
# set1 = set(list1)
# set2 = set(list2)

# # Find the values in set1 that are not in set2
# uncommon_values = set1 - set2

# # Remove the uncommon values from list1
# list1 = [value for value in list1 if value not in uncommon_values]

# numeric_transformer = ["float", StandardScaler()]
# categorical_transformer = ["uint8", OneHotEncoder(sparse=False, handle_unknown='ignore')]
# data_preprocessor = Prepare_data(list1, [numeric_transformer, categorical_transformer])    

In [None]:
# def hyperparameter_optimization(n_hidden, n_hidden_2, drop, epochs, lr):
    
    
#     model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)
    
    
#     model_2 = LSTMRegressor(n_hidden=n_hidden, n_hidden_2=n_hidden_2, drop=drop, epochs=epochs, lr=lr, Boosted=True)
#     # Use time series split for cross validation. 
#     cv_split = TimeSeriesSplit(n_splits = 4)
    
#     # Create lists to append MSLE scores.
#     valid_msle = []

#     # Dates to index through. 
#     dates = X_C.index.drop_duplicates()

    
#     # Perform Cross-Validation to determine how model will do on unseen data.
#     for train_index, valid_index in cv_split.split(dates):

#         model = Hybrid_Pipeline(data_preprocessor, model_1, model_2, Boosted=True, to_tensor=True)

#         # Index dates.
#         date_train, date_valid = dates[train_index], dates[valid_index]

#         # Selecting data for y_train and y_valid.
#         y_train = y.loc[date_train]
#         y_valid = y.loc[date_valid]

#         # Selecting data for X_train and X_valid.
#         X_train = X_C.loc[date_train]
#         X_valid = X_C.loc[date_valid]

#         X_train = X_train.reset_index().sort_values(["store_nbr", "family", "date"])
#         X_valid = X_valid.reset_index().sort_values(["store_nbr", "family", "date"])
#         X_train = X_train.set_index(["date"])
#         X_valid = X_valid.set_index(["date"])

#         y_train = y_train.reset_index().sort_values(["store_nbr", "family", "date"])
#         y_valid = y_valid.reset_index().sort_values(["store_nbr", "family", "date"])
#         y_train = y_train.set_index(["date"])
#         y_valid = y_valid.set_index(["date"])


#         # Fitting model.
#         model.fit(X_train, y_train)

#         # Create predictions for Trainning and Validation.
#         pred = model.predict(X_valid)

#         # MSE for trainning and validation. 
#         valid_msle.append(float(mean_squared_log_error(y_valid["sales"], pred["sales"])))


#     return -float(np.sqrt(np.mean(valid_msle)))

# param_bounds = {
#     'n_hidden': (1, 20),
#     'n_hidden_2': (0, 1),
#     'drop': (0, 1),
#     'epochs': (10, 500),
#     'lr': (0, 1),
# }
    
# optimizer = BayesianOptimization(
#     f=hyperparameter_optimization,
#     pbounds=parambounds,
#     random_state=1,
# )

# optimizer.maximize(init_points=30, n_iter=100,)
# print(optimizer.max)

In [None]:
# params = optimizer.max["params"]

# if params[4]>1:
#     if params[4]>2:
#         metric = 'euclidean'
#     else:
#         metric = 'manhattan'
# else:
#     metric = 'minkowski'


# if params[2]>1:
#     if params[2]>2:
#         algorithm = 'auto'
#     else:
#         if params[2]>3:
#             algorithm = 'ball_tree'
#         else:
#             algorithm = 'kd_tree'
# else:
#     algorithm = 'brute'


# if params[1]>.5:
#     weights = 'uniform'
# else:
#     weights = 'distance'

# param = {'n_neighbors': int(round(params[0])),
#         'weights': weights,
#         'algorithm': algorithm,
#         'leaf_size': int(round(params[3])),
#         'metric': metric}

# knn = KNeighborsRegressor(**knn_params)


# model_1 = LinearRegression(fit_intercept=False, algorithm="svd", copy_X=True)

# # Use time series split for cross validation. 
# cv_split = TimeSeriesSplit(n_splits = 4)

# # Create lists to append MSE scores. 
# train_msle = []
# valid_msle = []

# # Dates to index through. 
# dates = X_C.index.drop_duplicates()
# a = 0
# # Perform Cross-Validation to determine how model will do on unseen data.
# for train_index, valid_index in cv_split.split(dates):
#     a = a+1
#     print(f"Fold {a}:") 
#     model = Hybrid_Pipeline(data_preprocessor, lr, knn, Boosted=True, to_tensor=True)
    
#     # Index dates.
#     date_train, date_valid = dates[train_index], dates[valid_index]

#     # Selecting data for y_train and y_valid.
#     y_train = y.loc[date_train]
#     y_valid = y.loc[date_valid]
    
#     # Selecting data for X_train and X_valid.
#     X_train = X_C.loc[date_train]
#     X_valid = X_C.loc[date_valid]
    
#     X_train = X_train.reset_index().sort_values(["store_nbr", "family", "date"])
#     X_valid = X_valid.reset_index().sort_values(["store_nbr", "family", "date"])
#     X_train = X_train.set_index(["date"])
#     X_valid = X_valid.set_index(["date"])

#     y_train = y_train.reset_index().sort_values(["store_nbr", "family", "date"])
#     y_valid = y_valid.reset_index().sort_values(["store_nbr", "family", "date"])
#     y_train = y_train.set_index(["date"])
#     y_valid = y_valid.set_index(["date"])


#     # Fitting model.
#     model.fit(X_train, y_train)

#     # Create predictions for Trainning and Validation.
#     fit = model.predict(X_train)
#     pred = model.predict(X_valid)
    
#     # MSE for trainning and validation. 
#     train_msle.append(float(mean_squared_log_error(y_train["sales"], fit["sales"])))
#     valid_msle.append(float(mean_squared_log_error(y_valid["sales"], pred["sales"])))
    
#     print(f"Training RMSLE: {cp.sqrt(mean_squared_log_error(y_train.sales, fit.sales)):.3f}, Validation RMSLE: {cp.sqrt(mean_squared_log_error(y_valid.sales, pred.sales)):.3f}")

# # Returns the square root of the average of the MSE.
# print("Average Across Folds")
# print(f"Training RMSLE:{np.sqrt(np.mean(train_msle)):.3f}, Validation RMSLE: {np.sqrt(np.mean(valid_msle)):.3f}")

# e_6 = 1/np.sqrt(np.mean(valid_msle))

In [None]:
# # Fit Model
# model = Hybrid_Pipeline(data_preprocessor, lr, xfvsgb, Boosted=True, to_tensor=False)
# model.fit(X_C, y)

# X_test_C = X_test_C[X_test_C.columns[variables]]
# pred_7 = model.predict(X_test_C)

## Linear Regression, LSTM Regressor, Boosted

In [None]:
# # Fit Model
# model = Hybrid_Pipeline(data_preprocessor, lr, xgb, Boosted=True, to_tensor=False)
# model.fit(X_C, y)

# X_test_C = X_test_C[X_test_C.columns[variables]]
# pred_8 = model.predict(X_test_C)

## Final Predictions and Submission 

In [None]:
# Generate Predictions
e_sum = e_1+e_2+e_3+e_4+e_5+e_6
#+e_7+e_8

ensembled_pred = pred_1*e_1/e_sum + pred_2*e_2/e_sum + pred_3*e_3/e_sum + pred_4*e_4/e_sum + pred_5*e_5/e_sum + pred_6*e_6/e_sum
#+ pred_7*e_7/e_sum + pred_8*e_8/e_sum
pred.head()

In [None]:
ensembled_pred.to_csv('submission.csv', index=True)

In [None]:
api.competition_submit('submission.csv','1st API Submission','store-sales-time-series-forecasting')