# DSAA5020 Group Project:
## Corporación Favorita Grocery Sales Forecasting - Task 1: Model Training

## Part1: Loading Data

### Import

In [None]:
# installing scikit-optimize library for using gp minimize
!pip3 install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.9.7-py3-none-any.whl (23 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.9.7 scikit-optimize-0.9.0


In [None]:
# Standard library imports
import os
import warnings
from datetime import date, timedelta
import gc  # Garbage Collector interface

# Data processing libraries
import pandas as pd
import numpy as np

# Machine learning libraries
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb

# Optimization library
from skopt import gp_minimize
from skopt.space import Real, Integer
from functools import partial

# Visualization libraries
from matplotlib import pyplot as plt
import seaborn as sns

# Utility library for progress bars
from tqdm.notebook import tqdm

# Configurations
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output


In [None]:
# Reference: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place (columns are re-assigned) and also returned.

    Per column:
      * datetime columns are left untouched;
      * ``object`` / ``category`` columns are stored as ``category``;
      * signed / unsigned integer columns are cast to the smallest integer
        type of the same signedness whose range contains the column's min
        and max (bounds inclusive);
      * float columns are cast to ``float16`` or ``float32`` when the column's
        min and max fit in that type's finite range;
      * everything else (bool, timedelta, pandas extension dtypes, ...) is
        left unchanged.

    Memory usage before and after is printed in MB.

    NOTE: the float downcast only checks the value *range*; float16/float32
    carry fewer significant digits than float64, so values may be rounded.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of Dataframe is {:.3f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    small_floats = (np.float16, np.float32)

    for col in tqdm(df.columns):
        col_type = df[col].dtype
        type_name = col_type.name

        if 'datetime' in type_name:
            continue

        if type_name in ('object', 'category'):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / float columns are downcast. In particular
        # bool columns are already one byte wide and must not become floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        if col_type.kind == 'f':
            candidates, limits = small_floats, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        else:
            candidates, limits = unsigned_ints, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()

        # Candidates are ordered from narrowest to widest; take the first that
        # fits. If min/max are NaN (empty or all-NaN column) or infinite, no
        # candidate matches and the column keeps its dtype.
        for candidate in candidates:
            bounds = limits(candidate)
            if c_min >= bounds.min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.3f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

### Creating Features

In [None]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# project folder, so the relative CSV / pickle / model paths used throughout
# the notebook resolve against it.
from google.colab import drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab_Notebooks/5020/Corporaci-n-Favorita-Grocery-Sales-Forecasting-master')

Mounted at /content/drive


In [None]:
# Test rows: `date` parsed as datetime, `onpromotion` read as an integer flag.
df_test = pd.read_csv("test.csv", parse_dates=["date"], dtype={'onpromotion': int})

# Item metadata, keyed by item_nbr; `family` is replaced by integer codes.
encoder = LabelEncoder()
items = pd.read_csv("items.csv").set_index("item_nbr")
items['family'] = encoder.fit_transform(items['family'].values)

# Store metadata, keyed by store_nbr; the text columns are replaced by integer
# codes. The same encoder object is re-fitted for each column in turn.
encoder = LabelEncoder()
stores = pd.read_csv("stores.csv").set_index("store_nbr")
for column in ('state', 'city', 'type'):
    stores[column] = encoder.fit_transform(stores[column].values)


In [None]:
# Load the 2017 training rows; the `id` column is not used afterwards.
df_train = pd.read_csv('train_2017.csv', dtype={'onpromotion': int}, parse_dates=["date"])
df_train = df_train.drop('id', axis=1)

# Work in log space: unit_sales -> log1p(unit_sales) for positive sales and 0
# for everything else (zero, negative and missing values).
# Vectorised equivalent of looping over every row in Python; the clip only
# keeps log1p away from negative inputs whose result np.where discards anyway.
unit_sales = df_train['unit_sales'].astype(float)
df_train['unit_sales'] = np.where(unit_sales > 0, np.log1p(unit_sales.clip(lower=0)), 0.0)
del unit_sales

  0%|          | 0/23808261 [00:00<?, ?it/s]

In [None]:
# Pivot the training promotions into a wide table: one row per
# (store_nbr, item_nbr) pair, one column per date, missing days filled with 0.
promo_2017_train = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(0)
)

# Drop the outer "onpromotion" column level so the columns are plain dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

In [None]:
# Same pivot for the test period: one row per (store_nbr, item_nbr) pair, one
# column per test date, missing days filled with 0.
promo_2017_test = (
    df_test
    .set_index(['store_nbr', 'item_nbr', 'date'])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(0)
)

# Drop the outer "onpromotion" column level so the columns are plain dates.
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

In [None]:
# Align the test promotions to the (store_nbr, item_nbr) rows of the training
# table; pairs that do not occur in the test data become NaN and are set to 0.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(0)

# Place train dates and test dates side by side in a single promotion table.
promo_2017 = pd.concat((promo_2017_train, promo_2017_test), axis=1)

# The two halves are no longer needed.
del promo_2017_test
del promo_2017_train

In [None]:
# Wide sales table: one row per (store_nbr, item_nbr) pair, one column per
# date, days without a record filled with 0.
sales_2017 = (
    df_train
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the outer "unit_sales" column level so the columns are plain dates.
sales_2017.columns = sales_2017.columns.get_level_values(1)

# Per-item totals across all stores, for every date column.
promo_item_2017 = promo_2017.groupby(level='item_nbr')[promo_2017.columns].sum()
sales_item_2017 = sales_2017.groupby(level='item_nbr')[sales_2017.columns].sum()

# Expand the item / store metadata to one row per row of sales_2017, in the
# same order, so they can later be concatenated column-wise with the features.
# NOTE: this rebinds `items` and `stores`; re-running the cell without first
# re-running the loading cell operates on the already expanded frames.
items = items.reindex(sales_2017.index.get_level_values("item_nbr"))
stores = stores.reindex(sales_2017.index.get_level_values("store_nbr"))

In [None]:
# Promotions summed per (item class, store): flatten the index, attach each
# row's item class (`items` is already in the row order of the wide tables),
# then sum every date column within each (class, store_nbr) group.
store_class_promo_2017 = (
    promo_2017
    .reset_index()
    .assign(**{'class': items['class'].values})
    .groupby(['class', 'store_nbr'])[promo_2017.columns]
    .sum()
)

In [None]:
# Sales with a flat index plus each row's item class (`items` is already in
# the row order of sales_2017).
sales_with_class = sales_2017.reset_index().assign(**{'class': items['class'].values})

# (class, store_nbr) of every (store, item) row; used later to broadcast the
# store-class level features back onto the store-item rows.
store_class_index = sales_with_class[['class', 'store_nbr']]

# Sales summed per (item class, store) for every date column.
store_class_sales_2017 = sales_with_class.groupby(['class', 'store_nbr'])[sales_2017.columns].sum()

del sales_with_class

In [None]:
def get_timespan(df, Date, minus, periods, freq='D'):
    """Select date columns of ``df``.

    The selected columns start ``minus`` days before ``Date`` and consist of
    ``periods`` dates spaced ``freq`` apart. ``minus=0`` starts at ``Date``
    itself.
    """
    first_day = Date - timedelta(days=minus)
    wanted_columns = pd.date_range(first_day, periods=periods, freq=freq)
    return df[wanted_columns]

In [None]:
def promo_features(promotions,Date,X):
    """Add promotion features for reference day ``Date`` to the dict ``X``.

    ``promotions`` is a wide table with one column per date. Added entries
    (one array value per row of ``promotions``):
      * promotion sums over the 14 / 60 / 140 days before ``Date``;
      * promotion sums over the 3 / 7 / 14 days starting the day after ``Date``;
      * the promotion value of each single day from 16 days before ``Date``
        to 15 days after it, cast to uint8.

    ``X`` is updated in place and returned.
    """
    # Trailing windows ending the day before Date.
    for window in (14, 60, 140):
        past_promo = get_timespan(promotions, Date, window, window)
        X['%sdays_promo_sum (past)'%window] = past_promo.sum(axis=1).values

    # Leading windows starting the day after Date.
    first_future_day = Date + timedelta(days=1)
    for window in (3, 7, 14):
        upcoming_promo = get_timespan(promotions, first_future_day, 0, window)
        X['%sdays_promo_sum (future)'%window] = upcoming_promo.sum(axis=1).values

    # One column per individual day around Date.
    for offset in range(-16, 16):
        if offset == 0:
            flag = 'present'
        else:
            flag = 'past' if offset < 0 else 'future'

        day = pd.to_datetime(Date + timedelta(days=offset))
        X["promo_day{} ({})".format(abs(offset),flag)] = promotions[day].values.astype(np.uint8)

    return X

In [None]:
def sales_features(sales,Date,X,past_week=False):
    """Add sales features for reference day ``Date`` to the dict ``X``.

    ``sales`` is a wide table with one column per date. For each window of
    3 / 7 / 14 / 30 / 60 / 140 days ending the day before the reference day,
    the mean, exponentially weighted sum (decay 0.9, most recent day weighted
    1), mean day-to-day difference, median, min, max and std are added.

    With ``past_week=False`` the function additionally adds
      * ``sales(past_day_k)`` for k = 1..15: the sales k days before ``Date``;
      * ``dow{d}_mean_sales(4weeks)`` / ``(20weeks)``: means over 4 / 20
        dates spaced 7 days apart.

    With ``past_week=True`` the reference day is moved 7 days back and only
    the window statistics are added, under keys suffixed with " (past_week)".

    Bug fix: the per-day lag and day-of-week keys carry no " (past_week)"
    suffix, so computing them for the shifted reference day overwrote the
    values stored by the ``past_week=False`` call (e.g. ``sales(past_day_1)``
    ended up holding the sales from 8 days before ``Date``). They are now
    computed for the un-shifted reference day only. The set and order of keys
    in ``X`` is unchanged, but feature files generated with the previous
    behaviour contain different values for these columns and should be
    regenerated before being mixed with newly built features.

    ``X`` is updated in place and returned.
    """
    name=''
    if past_week:
        name=" (past_week)"
        Date = Date - timedelta(days=7) # reference day of the previous week

    for n_days in [3, 7, 14, 30, 60, 140]:

        # Sales of the n_days days ending the day before the reference day
        filtered_sales = get_timespan(sales, Date, n_days, n_days)

        # mean of sales over the window
        X['%sdays_sale_mean' % n_days + name] = filtered_sales.mean(axis=1).values

        # exponentially weighted sum of sales: weight 0.9**k for the day that
        # lies k days before the most recent day of the window
        exp_weights = np.power(0.9, np.arange(n_days)[::-1])
        X['%sdays_weighted_sale' % n_days + name] = (filtered_sales * exp_weights ).sum(axis=1).values

        # mean of the day-to-day differences over the window
        X['%sdays_sale_diff_mean' % n_days + name] = filtered_sales.diff(axis=1).mean(axis=1).values

        # median / min / max / std of sales over the window
        X['%sdays_sale_median' % n_days + name] = filtered_sales.median(axis=1).values
        X['%sdays_min_sale' % n_days + name] = filtered_sales.min(axis=1).values
        X['%sdays_max_sale' % n_days + name] = filtered_sales.max(axis=1).values
        X['%sdays_sale_std' % n_days + name] = filtered_sales.std(axis=1).values

    if past_week:
        # The keys below are not suffixed; writing them here would clobber the
        # current-week values (see docstring).
        return X

    # Sales on the n-th day before Date
    for n_day in range(1, 16):
        X['sales(past_day_%s)' % n_day] = get_timespan(sales, Date, n_day, 1).values.ravel()

    for n_day in range(7):
        # mean over 4 dates spaced 7 days apart, starting (28 - n_day) days before Date
        X['dow%s_mean_sales(4weeks)' % n_day ] = get_timespan(sales, Date, 28-n_day, 4, freq='7D').mean(axis=1).values
        # mean over 20 dates spaced 7 days apart, starting (140 - n_day) days before Date
        X['dow%s_mean_sales(20weeks)' % n_day] = get_timespan(sales, Date, 140-n_day, 20, freq='7D').mean(axis=1).values

    return X

In [None]:
def sales_promo_features(sales,promotions,Date,X):
    """Add sales statistics split by promotion status to the dict ``X``.

    For each window of 3 / 7 / 14 / 30 / 60 / 140 days ending the day before
    ``Date``, the sales are multiplied element-wise by the promotion values
    (respectively by ``1 - promotion``), with zeros in the multiplier turned
    into NaN so that those days are skipped by ``mean`` / ``sum``. The mean
    and the exponentially weighted sum (decay 0.9, most recent day weighted 1)
    of each product are stored.

    NOTE(review): the multiplier is the raw promotion value, so when
    ``promotions`` holds counts greater than 1 the sales are scaled by that
    count and ``1 - promotion`` becomes negative - confirm this is intended
    for aggregated promotion tables.

    ``X`` is updated in place and returned.
    """
    for window in (3, 7, 14, 30, 60, 140):
        window_sales = get_timespan(sales, Date, window, window)
        window_promo = get_timespan(promotions, Date, window, window)

        # Oldest day gets 0.9**(window-1), the most recent day gets 1.
        decay = np.power(0.9, np.arange(window)[::-1])

        # Days with promotion (multiplier 0 -> NaN -> ignored by mean / sum).
        promo_mask = window_promo.replace(0, np.nan)
        on_promo = window_sales * promo_mask
        X['%sdays_sale_mean(promo)' % window] = on_promo.mean(axis=1).values
        X['%sdays_weighted_sale(promo)' % window] = (on_promo * decay).sum(axis=1).values

        # Days without promotion (multiplier 0 -> NaN -> ignored by mean / sum).
        no_promo_mask = (1 - window_promo).replace(0, np.nan)
        off_promo = window_sales * no_promo_mask
        X['%sdays_sale_mean(no_promo)' % window] = off_promo.mean(axis=1).values
        X['%sdays_weighted_sale(no_promo)' % window] = (off_promo * decay).sum(axis=1).values

    return X


In [None]:
def count_features(data,Date,X,name):
    """Add activity-count features for ``data`` to the dict ``X``.

    ``data`` is a wide table with one column per date; a day counts as
    "active" when its value is > 0. For each window of 7 / 14 / 30 / 60 / 140
    days ending the day before ``Date`` three entries are added:
      * number of active days;
      * window length minus the largest position (0 = oldest day) of an
        active day - equals the window length when no day, or only the
        oldest day, is active;
      * largest value of (window length - position) over active days, i.e.
        how far back the earliest active day lies; 0 when none is active.

    When ``name == 'Promo'`` the same three quantities are added for the 15
    days following ``Date``.

    NOTE(review): in that last block the key labels and the "since last"
    offset use the loop variable left over from the window loop (140), not
    15. The behaviour is kept as is because changing it would rename columns
    that saved feature lists may refer to - confirm before fixing.

    ``X`` is updated in place and returned.
    """
    for window_len in (7, 14, 30, 60, 140):
        active = get_timespan(data, Date, window_len, window_len) > 0

        X['num_days_having%s(last_%sdays)' % (name,window_len)] = active.sum(axis=1).values
        X['num_days_since_Last%s(last_%sdays)'  % (name,window_len)] = (
            window_len - (active * np.arange(window_len)).max(axis=1).values
        )
        X['num_days_since_First%s(last_%sdays)' % (name,window_len)] = (
            (active * np.arange(window_len, 0, -1)).max(axis=1).values
        )

    if name=='Promo':
        # Window covering Date+1 .. Date+15; `window_len` is still 140 here.
        upcoming = get_timespan(data, Date + timedelta(days=16), 15, 15) > 0

        X['num_days_having%s(after_%sdays)' % (name,window_len)] = upcoming.sum(axis=1).values
        X['num_days_since_Last%s(after_%sdays)'  % (name,window_len)] = (
            window_len - (upcoming * np.arange(15)).max(axis=1).values
        )
        X['num_days_since_First%s(after_%sdays)' % (name,window_len)] = (
            (upcoming * np.arange(15, 0, -1)).max(axis=1).values
        )

    return X


In [None]:

def custom_features(sales, promotions, Date, name_prefix=None):
    """Build the full feature frame for reference day ``Date``.

    Runs, in this order, the promotion features, the promotion counts, the
    sales features, the sales counts, the sales features for the previous
    week and the combined sales/promotion features, all writing into one
    dict that is then turned into a DataFrame (one column per key).

    If ``name_prefix`` is given, every column name is prefixed with
    ``name_prefix + ' '``.
    """
    feats = {}

    # Promotion-only features
    feats = promo_features(promotions, Date, feats)
    feats = count_features(promotions, Date, feats, name='Promo')

    # Sales-only features (current reference day, then the previous week)
    feats = sales_features(sales, Date, feats)
    feats = count_features(sales, Date, feats, name='Sales')
    feats = sales_features(sales, Date, feats, past_week=True)

    # Features combining sales and promotions
    feats = sales_promo_features(sales, promotions, Date, feats)

    frame = pd.DataFrame(feats)
    if name_prefix is None:
        return frame

    frame.columns = [name_prefix + ' ' + column for column in frame.columns]
    return frame

In [None]:
def creating_dataset(data, data_item, data_store_class, items, stores, Date, n_weeks,return_labels=True):
    """Build the feature matrix (and optionally the labels) for ``n_weeks`` weekly reference days.

    Parameters
    ----------
    data : (sales, promo)
        Wide tables indexed by (store_nbr, item_nbr), one column per date.
    data_item : (sales_item, promo_item)
        The same quantities aggregated per item_nbr.
    data_store_class : (store_class_sales, store_class_promo, store_class_index)
        Aggregates per (class, store_nbr) plus a frame holding the
        (class, store_nbr) pair of every row of ``sales``.
    items, stores : DataFrame
        Metadata appended column-wise; expected to have one row per row of
        ``sales`` in the same order (they are concatenated positionally).
    Date : datetime.date
        First reference day; it advances by 7 days for each further week.
    n_weeks : int
        Number of weekly reference days.
    return_labels : bool
        If True also return the sales of the 16 days starting at each
        reference day.

    Returns
    -------
    X, or (X, Y) when ``return_labels`` is True. The blocks of all weeks are
    stacked row-wise; Y has 16 columns.

    Raises
    ------
    KeyError
        If ``return_labels`` is True and ``sales`` lacks one of the 16 label
        dates of some week. Silently skipping such a week would leave X and
        Y with a different number of rows.
    """
    last_date = Date + timedelta(days=7*n_weeks)
    print('Creating Features for data between Dates --> {} - {} (i.e. {} weeks) \n'.format(Date,last_date,n_weeks))

    sales, promo = data[0], data[1]
    sales_item, promo_item = data_item[0], data_item[1]
    store_class_sales = data_store_class[0]
    store_class_promo = data_store_class[1]
    store_class_index = data_store_class[2]

    # Loop-invariant lookups used to broadcast the aggregated features back
    # onto the (store_nbr, item_nbr) rows of `sales`.
    item_of_row = sales.index.get_level_values(1)
    store_class_of_row = pd.MultiIndex.from_frame(store_class_index)
    item_meta = items.reset_index()
    store_meta = stores.reset_index()

    X = []
    Y = []
    for _ in tqdm(range(n_weeks)):

        # Features of each (store_nbr, item_nbr) pair
        x1 = custom_features(sales, promo, Date)

        # Features of each item_nbr, repeated for every row with that item
        x2 = custom_features(sales_item, promo_item, Date, name_prefix='item')
        x2.index = sales_item.index
        x2 = x2.reindex(item_of_row).reset_index(drop=True)

        # Features of each (class, store_nbr) pair, repeated for every row with that pair
        x3 = custom_features(store_class_sales, store_class_promo, Date,  name_prefix='store_class')
        x3.index = store_class_sales.index
        x3 = x3.reindex(store_class_of_row).reset_index(drop=True)

        # One block of rows per week: all features plus item and store metadata
        X.append(pd.concat([x1, x2, x3, item_meta, store_meta], axis=1))
        del x1, x2, x3

        if return_labels:
            # True sales of the 16 days starting at the reference day
            try:
                Y.append(sales[pd.date_range(Date, periods=16)].values)
            except KeyError as err:
                raise KeyError(
                    'sales data does not cover the 16 label days starting at {}'.format(Date)
                ) from err

        # Shift to the next week
        Date = Date + timedelta(days=7)

    X = pd.concat(X, axis=0)

    if return_labels:
        return X, np.concatenate(Y, axis=0)
    return X


In [None]:
# Bundle the wide tables in the tuple layouts that creating_dataset() unpacks.

# (sales, promo) per (store_nbr, item_nbr) pair
data=(sales_2017,promo_2017)

# (sales, promo) aggregated per item_nbr
data_item=(sales_item_2017,promo_item_2017)

# (sales, promo) aggregated per (class, store_nbr), plus the (class, store_nbr)
# pair of every store-item row
data_store_class=(store_class_sales_2017,store_class_promo_2017,store_class_index)

# These names are now only reachable through the tuples above.
del sales_item_2017, promo_item_2017, store_class_sales_2017, store_class_promo_2017


In [None]:
# First reference day of the training set; one block of rows is built for this
# day and for each of the following (n_weeks - 1) weekly shifts of it.
Date = date(year=2017, month=6, day=28)
n_weeks = 4

# Features only: the matching labels are loaded from y_train.csv further below.
X_train = creating_dataset(
    data=data,
    data_item=data_item,
    data_store_class=data_store_class,
    items=items,
    stores=stores,
    Date=Date,
    n_weeks=n_weeks,
    return_labels=False,
)

print(X_train.shape)

2017-07-26
True
Creating Features for data between Dates --> 2017-06-28 - 2017-07-26 (i.e. 4 weeks) 

 method is ok 


  0%|          | 0/4 [00:00<?, ?it/s]

0 : is ok
1 : is ok
2 : is ok
3 : is ok
(670060, 633)


### Reading Data

In [None]:
# Training labels: read y_train.csv and convert the frame to a numpy array.
# The training code indexes it as y_train[:, i] for the i-th forecast day.
y_train = np.array(pd.read_csv( 'y_train.csv'))

In [None]:
# Validation features, downcast right after loading to save memory.
X_val = reduce_mem_usage(pd.read_csv("X_val.csv"))

# Validation labels as a numpy array (indexed as y_val[:, i] per forecast day).
y_val = np.array(pd.read_csv('y_val.csv'))

Memory usage of Dataframe is 808.998 MB


  0%|          | 0/633 [00:00<?, ?it/s]

Memory usage after optimization is: 175.571 MB
Decreased by 78.3%


In [None]:
# Test features, downcast right after loading to save memory.
X_test = reduce_mem_usage(pd.read_csv("X_test.csv"))


Memory usage of Dataframe is 808.998 MB


  0%|          | 0/633 [00:00<?, ?it/s]

Memory usage after optimization is: 176.210 MB
Decreased by 78.2%


In [None]:
# (store_nbr, item_nbr) pairs, one per row of the feature matrices.
stores_items = pd.read_csv('stores_items.csv', index_col=['store_nbr','item_nbr'])

# Item metadata expanded to one row per (store, item) pair, in the order of
# stores_items, then downcast. Its `perishable` column provides the sample
# weights used during training and scoring.
items = reduce_mem_usage(
    pd.read_csv('items.csv')
    .set_index("item_nbr")
    .reindex(stores_items.index.get_level_values(1))
)


Memory usage of Dataframe is 5.112 MB


  0%|          | 0/3 [00:00<?, ?it/s]

Memory usage after optimization is: 1.918 MB
Decreased by 62.5%


### Feature Selection

In [None]:
# Load the pre-selected feature names (per the original note, obtained by
# training a random forest). The training code below indexes the result as
# filtered_features[i] for the i-th forecast day.
# NOTE(review): the original comment said "Top 300" while the file is named
# 200_filtered_features.pkl - confirm how many features are kept.
# pickle.load executes arbitrary code from the file; only load trusted files.
import pickle
with open('200_filtered_features.pkl','rb') as file:
    filtered_features = pickle.load( file)

##  Part2: Training

### Defining XGBOOST

In [None]:
def train_xgb_model(X_train,y_train,X_val,y_val,params,num_boost_rounds,n_days,items,features,X_test=None):
    """Train 16 XGBoost models, one per forecast day, and predict with each.

    Parameters
    ----------
    X_train, X_val : DataFrame
        Feature matrices; model i uses the columns ``features[i]``.
    y_train, y_val : 2-D array
        Labels; column i is the target of model i.
    params : dict
        XGBoost parameters. The dict is copied, and ``objective``,
        ``eval_metric`` and ``tree_method`` are set on the copy, so the
        caller's dict is left untouched.
    num_boost_rounds : int
        Maximum number of boosting rounds; training stops early after 125
        rounds without improvement on the validation set.
    n_days : int
        How many times ``items["perishable"]`` is repeated to build the
        training weights; must make the weight vector as long as ``X_train``.
    items : DataFrame
        Must contain ``perishable``; weights are ``perishable * 0.25 + 1``.
    features : indexable
        ``features[i]`` is the column selection for forecast day i.
    X_test : DataFrame, optional
        If given, predictions on it are returned as well.

    Returns
    -------
    ``val_pred`` (list of 16 prediction arrays), or ``(val_pred, test_pred)``
    when ``X_test`` is given.
    """
    # Copy so that setting the fixed entries does not mutate the caller's dict.
    params = dict(params)
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params['tree_method'] ='gpu_hist'

    has_test = X_test is not None

    # The sample weights are the same for every forecast day.
    train_weight = pd.concat([items["perishable"]] * n_days) * 0.25 + 1
    val_weight = items["perishable"] * 0.25 + 1

    val_pred = []
    test_pred = []

    # One model per forecast day
    for i in range(16):
        print("=" * 50)
        print("Step %d" % (i+1))
        print("=" * 50)

        x_train = X_train[features[i]]
        x_val = X_val[features[i]]

        dtrain = xgb.DMatrix( x_train, label=y_train[:, i], weight=train_weight)
        dval = xgb.DMatrix( x_val, label=y_val[:, i], weight=val_weight)

        # Both sets are evaluated during training; early stopping is driven by
        # the last entry of the list (the validation set).
        watchlist = [ (dtrain,'train'), (dval,'val') ]

        model = xgb.train(params, dtrain, num_boost_rounds, watchlist, early_stopping_rounds=125, verbose_eval=50)

        val_pred.append(model.predict(xgb.DMatrix(x_val)))

        # A problem with the test features (e.g. a missing column) now raises
        # instead of silently yielding fewer test predictions.
        if has_test:
            test_pred.append(model.predict(xgb.DMatrix(X_test[features[i]])))

        # Release the per-day objects before the next iteration
        del model,dtrain,dval,x_train,x_val

    if has_test:
        return val_pred,test_pred
    return val_pred


### Performance Metric


**NWRMSLE** (Normalized Weighted Root Mean Squared Logarithmic Error)

In [None]:
def calculate_nwrmsle(true,pred,weight):
    '''
    Calculates the Normalized Weighted Root Mean Squared Logarithmic Error (nwrmsle).

    true   = true labels, shape (n_rows, n_horizons)
    pred   = predicted labels as a sequence of n_horizons arrays of length
             n_rows (one per forecast day); it is transposed internally
    weight = weight of each row, length n_rows

    The inputs are used as given (no logarithm is applied here), so they are
    expected to be in the log1p space already.

    The squared errors are summed per row, multiplied by the row weight,
    summed, and divided by (sum of weights * n_horizons) before taking the
    square root. n_horizons is taken from the shape of `true` (16 in this
    notebook). All inputs are converted to numpy arrays so rows are matched
    by position, never by pandas index labels.

    returns nwrmsle '''

    true = np.asarray(true)
    weight = np.asarray(weight)

    squared_errors = (true - np.asarray(pred).transpose())**2
    n_horizons = squared_errors.shape[1]

    weighted_row_errors = squared_errors.sum(axis=1) * weight
    nwrmsle = np.sqrt(weighted_row_errors.sum() / weight.sum() / n_horizons)
    return nwrmsle

##  Part3: HyperParameters

### HyperParameter Tuning (Using gp minimize)

In [None]:

# function to fit the model and return the performance of the model
def return_model_assessment(args, X_train, y_train, X_val, y_val, model, n_days ,items , features ,num_boost_rounds =None):
    """Objective for the hyper-parameter search: train once and return the validation NWRMSLE.

    ``args`` holds one value per entry of the global ``model_hyper_params``
    list, in the same order. ``model`` selects the trainer ('xgb', 'lgb' or
    'rf'). The global ``count`` is incremented on every evaluation and only
    used for the progress message.

    Raises ValueError for an unknown ``model`` or when ``args`` and
    ``model_hyper_params`` differ in length.
    """
    global model_hyper_params,count

    # Fail fast with a clear message instead of an unbound `val_pred` later on.
    if model not in ('xgb', 'lgb', 'rf'):
        raise ValueError("Unknown model {!r}; expected 'xgb', 'lgb' or 'rf'".format(model))
    if len(args) != len(model_hyper_params):
        raise ValueError("Expected {} hyper-parameter values, got {}".format(len(model_hyper_params), len(args)))

    count+=1
    print('='*50)
    print("\nTraining Model No. {} ...".format(count))
    params = dict(zip(model_hyper_params, args))
    print("Parameters --> ",params)

    # The trainers are looked up lazily inside the branches, so only the one
    # actually requested has to be defined.
    if model=='xgb':
        val_pred = train_xgb_model(X_train,y_train,X_val,y_val,params,num_boost_rounds,n_days,items,features)

    elif model=='lgb':
        val_pred = train_lgb_model(X_train,y_train,X_val,y_val,params,num_boost_rounds,n_days,items,features)

    else:  # model == 'rf'
        val_pred = train_rf_model(X_train,y_train,X_val,y_val,params,n_days,items,features)

    # Same weighting as used for the validation DMatrix
    weight = items["perishable"] * 0.25 + 1
    nwrmsle = calculate_nwrmsle(y_val, val_pred, weight)

    return nwrmsle      # the search minimises this value


In [None]:
%%time

# defining space for searching optimal parameters
space = [
    Real(0.4, 0.8, name="colsample_bytree"),
    Real(0.01, 0.5, name="gamma"),
    Real(0.001, 0.1,"log-uniform",  name="eta"),
    Integer(3, 10, name="max_depth"),
    Integer(1, 5, name="min_child_weight"),
    Real(0.4, 0.8, name="subsample"),
    ]
# Using high no. of trees (i.e. 2000) to get better model performance with early stopping.
boost_rounds = 2000

count=0
model_hyper_params = [ 'colsample_bytree', 'gamma', 'eta', 'max_depth', 'min_child_weight', 'subsample']

# Objective function which will return nwrmsle .
objective_function = partial(return_model_assessment,
                             X_train=X_train, y_train=y_train, X_val = X_val,y_val= y_val ,
                             model='xgb', n_days=4, items=items ,
                             features= filtered_features,num_boost_rounds= boost_rounds)

# Running the algorithm
n_calls = 10 # number of times to train model
results = gp_minimize(objective_function, space, base_estimator=None, n_calls=n_calls, n_random_starts=n_calls-1, random_state=42)


[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[1900]	train-rmse:0.56990	val-rmse:0.58856
[1950]	train-rmse:0.56916	val-rmse:0.58821
[1999]	train-rmse:0.56844	val-rmse:0.58786
Step 6
[0]	train-rmse:1.06951	val-rmse:1.08037
[50]	train-rmse:1.00371	val-rmse:1.01541
[100]	train-rmse:0.94574	val-rmse:0.95812
[150]	train-rmse:0.89491	val-rmse:0.90789
[200]	train-rmse:0.85061	val-rmse:0.86416
[250]	train-rmse:0.81186	val-rmse:0.82584
[300]	train-rmse:0.77838	val-rmse:0.79276
[350]	train-rmse:0.74935	val-rmse:0.76402
[400]	train-rmse:0.72447	val-rmse:0.73940
[450]	train-rmse:0.70304	val-rmse:0.71820
[500]	train-rmse:0.68473	val-rmse:0.70011
[550]	train-rmse:0.66891	val-rmse:0.68441
[600]	train-rmse:0.65553	val-rmse:0.67121
[650]	train-rmse:0.64410	val-rmse:0.65990
[700]	train-rmse:0.63436	val-rmse:0.65028
[750]	train-rmse:0.62607	val-rmse:0.64207
[800]	train-rmse:0.61899	val-rmse:0.63511
[850]	train-rmse:0.61295	val-rmse:0.62919
[900]	train-rmse:0.60778	val-rmse:0.62412
[950]	train-rmse:0.60338	val

In [None]:
# Report the best point found by the search; results.x is ordered like `space`.
best_param_report = (
    "Best parameters:\n"
    "- colsample_bytree=%.6f\n"
    "- gamma=%.6f\n"
    "- eta=%.6f\n"
    "- max_depth=%d\n"
    "- min_child_weight=%d\n"
    "- subsample=%.6f "
)
print(best_param_report % tuple(results.x[:6]))


Best parameters:
- colsample_bytree=0.436243
- gamma=0.313009
- eta=0.005820
- max_depth=10
- min_child_weight=3
- subsample=0.743976 


In [None]:
f"Best score = {results.fun:.6f}"

'Best score = 0.587939'

In [None]:
def save_best_xgb_model(X_train,y_train,params,num_boost_rounds,n_days,items,features):
    '''
    Trains 16 different xgb models (one per forecast day) on the training
    data and saves each one to disk.

    Model i uses the columns features[i] of X_train and column i of y_train;
    rows are weighted with items["perishable"] * 0.25 + 1, repeated n_days
    times to match the length of X_train.

    Each model is written to 'xgboost_model_model_day_<k>.model' (k = 1..16)
    as soon as it is trained, so finished models survive a later failure and
    only one booster is held in memory at a time.

    Returns --> None
    '''

    # The sample weights are the same for every forecast day.
    train_weight = pd.concat([items["perishable"]] * n_days) * 0.25 + 1

    #Training 16 different models for predicting next 16 days sales.
    for i in range(16):
        print("=" * 50)
        print("Step %d" % (i+1))
        print("=" * 50)

        # Filtering features
        x_train = X_train[features[i]]

        dtrain = xgb.DMatrix( x_train, label=y_train[:, i], weight=train_weight)

        # Only the training set is evaluated here.
        # NOTE(review): with no validation set in the watchlist, early
        # stopping monitors the training metric - confirm this is intended.
        watchlist = [ (dtrain,'train')]

        # Training Xgboost
        model = xgb.train(params, dtrain, num_boost_rounds, watchlist, early_stopping_rounds=125, verbose_eval=50)

        # Save the model right away under the name load_xgb_model() expects
        key = f"model_day_{i+1}"
        model.save_model(f'xgboost_model_{key}.model')

        print("model:",(i+1),"is ok")

        # Deleting unneccessary variables
        del model,dtrain,x_train

    print("all models down")


In [None]:
# Best hyper-parameters reported by the search above (copied from its output),
# followed by the fixed training settings.
best_params = {
    'colsample_bytree': 0.436243,
    'gamma': 0.313009,
    'eta': 0.005820,
    'max_depth': 10,
    'min_child_weight': 3,
    'subsample': 0.743976,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method': 'gpu_hist',
}

# Train one model per forecast day with these settings and write them to disk.
save_best_xgb_model(
    X_train=X_train,
    y_train=y_train,
    params=best_params,
    num_boost_rounds=2000,
    n_days=4,
    items=items,
    features=filtered_features,
)

Step 1
[0]	train-rmse:1.04928
[50]	train-rmse:0.86620
[100]	train-rmse:0.74201
[150]	train-rmse:0.66090
[200]	train-rmse:0.60830
[250]	train-rmse:0.57515
[300]	train-rmse:0.55409
[350]	train-rmse:0.54030
[400]	train-rmse:0.53122
[450]	train-rmse:0.52469
[500]	train-rmse:0.51993
[550]	train-rmse:0.51629
[600]	train-rmse:0.51345
[650]	train-rmse:0.51101
[700]	train-rmse:0.50886
[750]	train-rmse:0.50684
[800]	train-rmse:0.50500
[850]	train-rmse:0.50329
[900]	train-rmse:0.50164
[950]	train-rmse:0.50010
[1000]	train-rmse:0.49863
[1050]	train-rmse:0.49718
[1100]	train-rmse:0.49580
[1150]	train-rmse:0.49447
[1200]	train-rmse:0.49317
[1250]	train-rmse:0.49190
[1300]	train-rmse:0.49069
[1350]	train-rmse:0.48952
[1400]	train-rmse:0.48837
[1450]	train-rmse:0.48719
[1500]	train-rmse:0.48604
[1550]	train-rmse:0.48486
[1600]	train-rmse:0.48376
[1650]	train-rmse:0.48264
[1700]	train-rmse:0.48150
[1750]	train-rmse:0.48037
[1800]	train-rmse:0.47922
[1850]	train-rmse:0.47809
[1900]	train-rmse:0.47701
[1

### Observations :


* Model --> **Xgboost**
* Best Parameters :
    * *colsample_bytree* = **0.436243**
    * *gamma* = **0.313009**
    * *eta* = **0.005820**
    * *max_depth* = **10**
    * *min_child_weight* = **3**
    * *subsample* = **0.743976**
* Best Score (NWRMSLE) --> **0.5941**

In [None]:
def load_xgb_model():
    """Load the 16 saved boosters from the current working directory.

    Returns a dict mapping 'model_day_<k>' (k = 1..16) to the booster read
    from 'xgboost_model_model_day_<k>.model'.
    """
    boosters = {}

    for day in range(1, 17):
        key = f"model_day_{day}"

        booster = xgb.Booster()
        booster.load_model(f'xgboost_model_{key}.model')

        boosters[key] = booster

    return boosters


In [None]:
def xgb_predict_16(loaded_models, x_test, features):
    """Predict all 16 forecast days with the per-day models.

    ``loaded_models`` maps 'model_day_<k>' (k = 1..16) to a booster;
    ``features[i]`` selects the columns of ``x_test`` that the model for
    forecast day i (key 'model_day_<i+1>') is given.

    Returns a list of 16 prediction results, in forecast-day order.
    """
    return [
        loaded_models[f"model_day_{day + 1}"].predict(xgb.DMatrix(x_test[features[day]]))
        for day in range(16)
    ]


In [None]:
# Read the 16 per-day boosters back from the files written during training.
loaded_models = load_xgb_model()

In [None]:
import pickle

# Load the per-day feature selections from a pickle file.
# xgb_predict_16 indexes this object by day (features[i]).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open('200_filtered_features.pkl', 'rb') as file:
    filtered_features = pickle.load(file)

# Read the test dataset, forcing 'onpromotion' to int and parsing 'date' as datetime
df_test = pd.read_csv("test.csv", dtype={'onpromotion': int}, parse_dates=["date"])

# Read the preprocessed test features dataset
X_test = pd.read_csv("X_test.csv")

# Downcast column dtypes to shrink the memory footprint of X_test
X_test = reduce_mem_usage(X_test)


Memory usage of Dataframe is 808.998 MB


  0%|          | 0/633 [00:00<?, ?it/s]

Memory usage after optimization is: 176.210 MB
Decreased by 78.2%


In [None]:
# Sanity check: list the keys and Booster objects that were loaded.
print(loaded_models)

{'model_day_1': <xgboost.core.Booster object at 0x7ac27c0fbfd0>, 'model_day_2': <xgboost.core.Booster object at 0x7ac33c542650>, 'model_day_3': <xgboost.core.Booster object at 0x7ac340b32b30>, 'model_day_4': <xgboost.core.Booster object at 0x7ac32f7cfdf0>, 'model_day_5': <xgboost.core.Booster object at 0x7ac27c0fbe50>, 'model_day_6': <xgboost.core.Booster object at 0x7ac27c0fa920>, 'model_day_7': <xgboost.core.Booster object at 0x7ac27c0fb970>, 'model_day_8': <xgboost.core.Booster object at 0x7ac27c0fa2c0>, 'model_day_9': <xgboost.core.Booster object at 0x7ac27c0fbe20>, 'model_day_10': <xgboost.core.Booster object at 0x7ac27c0fa7a0>, 'model_day_11': <xgboost.core.Booster object at 0x7ac27c0fa800>, 'model_day_12': <xgboost.core.Booster object at 0x7ac27c0fad70>, 'model_day_13': <xgboost.core.Booster object at 0x7ac27c0f9300>, 'model_day_14': <xgboost.core.Booster object at 0x7ac27c0fbc40>, 'model_day_15': <xgboost.core.Booster object at 0x7ac27c0fb010>, 'model_day_16': <xgboost.core.Boo

In [None]:
# Predict each horizon day with its own booster, then stack the per-day
# results into a frame with one row per day.
test_pred = xgb_predict_16(
    loaded_models=loaded_models,
    x_test=X_test,
    features=filtered_features,
)
df_test_pred = pd.DataFrame(test_pred)
df_test_pred.shape

(16, 167515)

In [None]:
# Scratch check of np.append: appending to one row yields a 1-D result.
array1 = np.array([[1,2],
              [3,4]])
np.append(array1[0], 0)

array([1, 2, 0])

In [None]:
# Scratch check: display the first row of array1 after the np.append call.
array1[0]

array([1, 2])

In [None]:
import time
import random
import math
import numpy as np

# NOTE: `time` and `math` are not used by this cell any more; the imports are
# kept only in case other cells rely on them having been imported here.

# Row counts taken from this notebook's outputs: the models return 167515
# predictions per day (df_test_pred.shape == (16, 167515)), while df_test has
# 3370464 rows, i.e. 210654 per day over 16 days.
N_PREDICTED_ROWS = 167515
N_TEST_ROWS_PER_DAY = 210654

# Seed ONCE with a fixed value. The previous version re-seeded with
# int(time.time()) inside the inner loop, so every draw within the same second
# was identical and no two runs produced the same padding.
PAD_SEED = 42
pad_rng = random.Random(PAD_SEED)

# Build fresh arrays instead of aliasing test_pred, so test_pred is not
# modified and re-running this cell gives the same result.
test_sub = []

# NOTE(review): the padded values are synthetic (random multiples of a local
# mean), not model output, and are not matched to specific store/item rows.
# Confirm this is the intended treatment for test rows without a prediction.

# Loop through each of the 16 days
for i in range(16):
    # Preallocate the full-length array once; repeated np.append copied the
    # whole array on every one of the ~43k padded positions.
    day_values = np.empty(N_TEST_ROWS_PER_DAY, dtype=np.float64)
    day_values[:N_PREDICTED_ROWS] = test_pred[i][:N_PREDICTED_ROWS]

    for j in range(N_PREDICTED_ROWS, N_TEST_ROWS_PER_DAY):
        # Mean of the 9 values at positions j-10 .. j-2 (same window as before)
        average_value = np.mean(day_values[j-10:j-1])

        # Scale the local mean by a random factor in [0, 3]
        t = pad_rng.uniform(0, 3) * average_value

        # If the result is NaN, set it to 0
        if np.isnan(t):
            t = 0

        day_values[j] = t

    test_sub.append(day_values)

    # Print a confirmation message for each day's completion
    print("第", i, "天ok")


第 0 天ok
第 1 天ok
第 2 天ok
第 3 天ok
第 4 天ok
第 5 天ok
第 6 天ok
第 7 天ok
第 8 天ok
第 9 天ok
第 10 天ok
第 11 天ok
第 12 天ok
第 13 天ok
第 14 天ok
第 15 天ok


In [None]:
# Check the padded length of the first day's predictions.
len(test_sub[0])

210654

In [None]:
# Scratch check. NOTE(review): {test_sub[0][0]} is a one-element set literal,
# not a dict, so this frame is built from a single value (the first
# prediction of day 0).
df_test_submit_00 = pd.DataFrame({test_sub[0][0]})

df_test_submit_00.reset_index(drop=True, inplace=True)

df_test_submit_00.shape

(1, 1)

In [None]:
# Scratch check: put day 0's padded predictions into a one-column frame.
df_test_submit_0 = pd.DataFrame({'values':test_sub[0]})
# Not the last expression of the cell, so this type() result is not displayed.
type(df_test_submit_0['values'])
# NOTE(review): this resets the index of the Series returned by the column
# lookup and does not store the result; it presumably has no lasting effect
# on df_test_submit_0 — confirm whether it is needed.
df_test_submit_0['values'].reset_index(drop=True, inplace=True)

In [None]:
# Scratch check: confirm the object is a DataFrame.
type(df_test_submit_0)

pandas.core.frame.DataFrame

In [None]:
# Stack the padded per-day arrays into one frame: one row per day,
# one column per position within the day.
df_test_submit = pd.DataFrame(test_sub)
df_test_submit.shape

(16, 210654)

In [None]:
# Swap any +inf / -inf entries for the constant 2, rebinding the name rather
# than mutating in place.
# NOTE(review): the value 2 is a magic number and is also used for -inf —
# confirm the intended fill value.
df_test_submit = df_test_submit.replace([np.inf, -np.inf], 2)
df_test_submit

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210644,210645,210646,210647,210648,210649,210650,210651,210652,210653
0,0.149668,0.199722,0.755902,1.005108,1.987753,2.427232,0.50101,0.234759,0.838848,1.74796,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.159725,0.17974,0.758384,0.930875,1.811661,2.315009,0.491379,0.227299,0.699417,1.621909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.194604,0.228898,0.924472,1.160291,1.803776,2.336699,0.468273,0.218895,0.780097,1.689479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.217302,0.281122,0.846919,1.27167,1.549776,2.221439,0.499164,0.300352,0.706856,1.693601,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.100444,0.08381,0.221045,0.514068,0.922704,1.574677,0.285599,0.135911,0.318064,1.089305,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.163947,0.210696,0.738851,0.846258,1.714043,2.39529,0.555175,0.258598,0.718697,1.561954,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
6,0.146164,0.191417,0.784465,0.846126,1.713807,2.294028,0.532238,0.226602,0.685255,1.561016,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
7,0.147264,0.190193,0.776562,0.969317,1.826796,2.265707,1.126794,0.237995,0.811062,1.725266,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
8,0.150681,0.169732,0.754628,0.794912,1.555715,2.934158,1.220624,0.184264,0.633413,1.569799,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.197292,0.218565,0.864651,1.129562,1.786978,3.411639,1.432516,0.223158,0.819456,1.739925,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [None]:
# Keep only level 0 of the column labels.
# NOTE(review): the frame was built from a plain list of arrays, so its columns
# presumably have a single level already and this leaves the labels unchanged —
# confirm whether this cell is still needed.
df_test_submit.columns = df_test_submit.columns.get_level_values(0)
df_test_submit

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,210644,210645,210646,210647,210648,210649,210650,210651,210652,210653
0,0.149668,0.199722,0.755902,1.005108,1.987753,2.427232,0.50101,0.234759,0.838848,1.74796,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.159725,0.17974,0.758384,0.930875,1.811661,2.315009,0.491379,0.227299,0.699417,1.621909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.194604,0.228898,0.924472,1.160291,1.803776,2.336699,0.468273,0.218895,0.780097,1.689479,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.217302,0.281122,0.846919,1.27167,1.549776,2.221439,0.499164,0.300352,0.706856,1.693601,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.100444,0.08381,0.221045,0.514068,0.922704,1.574677,0.285599,0.135911,0.318064,1.089305,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.163947,0.210696,0.738851,0.846258,1.714043,2.39529,0.555175,0.258598,0.718697,1.561954,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
6,0.146164,0.191417,0.784465,0.846126,1.713807,2.294028,0.532238,0.226602,0.685255,1.561016,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
7,0.147264,0.190193,0.776562,0.969317,1.826796,2.265707,1.126794,0.237995,0.811062,1.725266,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
8,0.150681,0.169732,0.754628,0.794912,1.555715,2.934158,1.220624,0.184264,0.633413,1.569799,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.197292,0.218565,0.864651,1.129562,1.786978,3.411639,1.432516,0.223158,0.819456,1.739925,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [None]:
# Scratch check: take day 0's row of the padded predictions.
row_data = df_test_submit.iloc[0]

# Use melt to turn the single row into long (variable, value) form.
# NOTE(review): result_df is not used by any of the nearby cells — confirm
# whether it is needed.
result_df = pd.melt(pd.DataFrame(row_data).T)

row_data

0         0.149668
1         0.199722
2         0.755902
3         1.005108
4         1.987753
            ...   
210649    0.000000
210650    0.000000
210651    0.000000
210652    0.000000
210653    0.000000
Name: 0, Length: 210654, dtype: float64

In [None]:
# Number of per-day arrays held in test_sub.
len(test_sub)

In [None]:
# Preview the unpadded model predictions (rows = days, columns = positions).
df_test_pred.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,167505,167506,167507,167508,167509,167510,167511,167512,167513,167514
0,0.149668,0.199722,0.755902,1.005108,1.987753,2.427232,0.50101,0.234759,0.838848,1.74796,...,0.765999,0.10862,0.191321,1.208669,0.541938,0.755254,3.50196,0.244278,2.314238,0.62385
1,0.159725,0.17974,0.758384,0.930875,1.811661,2.315009,0.491379,0.227299,0.699417,1.621909,...,0.017219,0.097082,0.203198,0.285922,0.510172,0.778963,2.808006,0.25344,2.534779,0.714363
2,0.194604,0.228898,0.924472,1.160291,1.803776,2.336699,0.468273,0.218895,0.780097,1.689479,...,-0.032446,0.103159,0.247332,0.120318,0.543894,0.808435,3.362577,0.23013,2.443142,0.815528
3,0.217302,0.281122,0.846919,1.27167,1.549776,2.221439,0.499164,0.300352,0.706856,1.693601,...,-0.036247,0.167578,0.334511,1.257385,0.78316,0.969992,3.244751,0.303104,2.519528,0.886993
4,0.100444,0.08381,0.221045,0.514068,0.922704,1.574677,0.285599,0.135911,0.318064,1.089305,...,0.973818,0.200549,0.412741,1.198333,0.874359,0.962865,4.201478,0.351488,3.047319,0.975837


In [None]:
# Flatten the (16, 167515) frame into one long Series (16 * 167515 = 2680240 values).
# NOTE(review): this rebinds df_test_pred to the stacked result, so the cell
# is not safe to run twice without re-creating the frame first.
df_test_pred = df_test_pred.stack(level=-1)

In [None]:
# Length of the stacked predictions, for comparison with the test set size.
df_test_pred.shape

(2680240,)

In [None]:
# Size of the raw test set; the submission needs one prediction per row.
print(df_test.shape)

(3370464, 5)


In [None]:
# Slice the test rows for the first two forecast dates to inspect how many
# rows each day contains.
result1 = df_test.loc[df_test['date'] == '2017-8-16']
result2 = df_test.loc[df_test['date'] == '2017-8-17']
result1

Unnamed: 0,id,date,store_nbr,item_nbr,onpromotion
0,125497040,2017-08-16,1,96995,0
1,125497041,2017-08-16,1,99197,0
2,125497042,2017-08-16,1,103501,0
3,125497043,2017-08-16,1,103520,0
4,125497044,2017-08-16,1,103665,0
...,...,...,...,...,...
210649,125707689,2017-08-16,54,2132163,0
210650,125707690,2017-08-16,54,2132318,0
210651,125707691,2017-08-16,54,2132945,0
210652,125707692,2017-08-16,54,2132957,0


In [None]:
# Build the submission as a DataFrame with one row per test id.
#
# The previous version took df_test['id'] (a Series) and then executed
# df_sub['unit_sales'] = ..., which on a Series does not add a column. It also
# assigned the stacked df_test_pred, which has 2680240 values and cannot line
# up with the 3370464 rows of df_test.
#
# The padded frame df_test_submit is (16, 210654); flattened row by row (day by
# day) it has exactly one value per test row.
# NOTE(review): the assignment is positional. It assumes df_test is ordered by
# date and that position k within a day refers to the same store/item as
# column k of df_test_submit — confirm against how X_test was built.
# NOTE(review): if the models were trained on a transformed target (e.g.
# log1p of unit_sales), invert that transform here before writing — confirm
# against the training cells.
unit_sales = df_test_submit.to_numpy().ravel()
assert len(unit_sales) == len(df_test), (
    f"prediction count {len(unit_sales)} does not match test rows {len(df_test)}"
)

df_sub = df_test[['id']].copy()
df_sub['unit_sales'] = unit_sales
df_sub.to_csv('submission_xgb16_200select.csv', index=False)

In [None]:
def save_best_xgb_model(X_train, y_train, params, num_boost_rounds, n_days, items, features):
    """Train one XGBoost booster per forecast day and save each one to disk.

    Args:
        X_train: training feature table; indexed with ``features[i]`` to select
            the columns used by model ``i``.
        y_train: 2-D target array; column ``i`` is the label for model ``i``.
        params: parameter dict passed to ``xgb.train``.
        num_boost_rounds: maximum number of boosting rounds per model.
        n_days: number of times ``items["perishable"]`` is repeated to build
            the per-row weights; the result must be as long as ``X_train``.
        items: table with a ``"perishable"`` column; rows get weight
            ``perishable * 0.25 + 1``.
        features: per-model column selections, indexed 0..15.

    Side effects:
        Writes ``xgboost_model_model_day_<n>.model`` for n = 1..16 to the
        current working directory and prints progress messages.
    """
    # Initialize a dictionary to store the models
    models = {}

    # The sample weights do not depend on the day, so build them once instead
    # of concatenating the same Series for every one of the 16 models.
    sample_weight = pd.concat([items["perishable"]] * n_days) * 0.25 + 1

    # Iterate through 16 different models corresponding to 16 different days
    for i in range(16):
        key = f"model_day_{i + 1}"

        print("=" * 50)
        print("Step %d" % (i + 1))
        print("=" * 50)

        # Select features for the current model
        x_train = X_train[features[i]]

        # Prepare the data in XGBoost DMatrix format and set weights
        dtrain = xgb.DMatrix(x_train, label=y_train[:, i], weight=sample_weight)

        # NOTE(review): the watchlist holds only the training set, so early
        # stopping is driven by training RMSE rather than a held-out score —
        # confirm this is intended.
        watchlist = [(dtrain, 'train')]

        # Train the model with specified parameters and number of boosting rounds
        model = xgb.train(params, dtrain, num_boost_rounds,
                          watchlist, early_stopping_rounds=125, verbose_eval=50)

        # Save right away so a failure on a later day does not discard the
        # models that have already finished training.
        model.save_model(f'xgboost_model_{key}.model')

        # Store the trained model in the models dictionary
        models[key] = model
        print("model:", (i + 1), "is ok")

        # Drop the large per-day training objects before the next iteration
        del model, dtrain, x_train

    print("all models down")
