In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import matplotlib.pyplot as plt


import pandas as pd
from sklearn.preprocessing import OneHotEncoder



# STEP 2: Holidays Dataset
<hr/>

In [60]:
df_train = pd.read_csv('./Data/step1/df_train.csv')
df_test = pd.read_csv('./Data/step1/df_test.csv')
df_submission = pd.read_csv('./Data/step1/df_submission.csv')
holidays_events = pd.read_csv('./Data/kaggle/holidays_events.csv.gz')

### National Holidays - Feature Engineering and Analysis

In [61]:
holidays_events['date'] = pd.to_datetime(holidays_events['date'])
df_train['date'] = pd.to_datetime(df_train['date'])
df_test['date'] = pd.to_datetime(df_test['date'])
df_submission['date'] = pd.to_datetime(df_submission['date'])

holidays_events_fil = holidays_events[~holidays_events['transferred']]
df_train_hl = df_train.merge(holidays_events_fil, on='date', how='left')
df_train_hl = df_train_hl.fillna('No Holiday')

df_test_hl = df_test.merge(holidays_events_fil, on='date', how='left')
df_test_hl = df_test_hl.fillna('No Holiday')

df_submission_hl = df_submission.merge(holidays_events_fil, on='date', how='left')
df_submission_hl = df_submission_hl.fillna('No Holiday')

In [62]:
df_train_hl.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,year,month,day_of_week,in_store_list,day_off_store,type,locale,locale_name,description,transferred
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,2013,1,Tuesday,1,1.0,Holiday,National,Ecuador,Primer dia del ano,False
1,1,2013-01-01,1,BABY CARE,0.0,0,2013,1,Tuesday,0,1.0,Holiday,National,Ecuador,Primer dia del ano,False
2,2,2013-01-01,1,BEAUTY,0.0,0,2013,1,Tuesday,1,1.0,Holiday,National,Ecuador,Primer dia del ano,False
3,3,2013-01-01,1,BEVERAGES,0.0,0,2013,1,Tuesday,1,1.0,Holiday,National,Ecuador,Primer dia del ano,False
4,4,2013-01-01,1,BOOKS,0.0,0,2013,1,Tuesday,1,1.0,Holiday,National,Ecuador,Primer dia del ano,False


In [6]:
df_submission.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [7]:
df_test.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,2596374,2017-01-01,1,AUTOMOTIVE,0.0,0
1,2596375,2017-01-01,1,BABY CARE,0.0,0
2,2596376,2017-01-01,1,BEAUTY,0.0,0
3,2596377,2017-01-01,1,BEVERAGES,0.0,0
4,2596378,2017-01-01,1,BOOKS,0.0,0


- Merge holidays and training set for studing

In [63]:
def create_national_vars(df, locale):
    
    df[f'is_{locale}_holiday'] = (df['locale'] == locale).astype(int)
    date_list_national = df[df[f'is_{locale}_holiday'] == 1]['date'].unique()
    df['day_after_national_holiday'] = df['date'].apply(
        lambda x: (x - pd.Timedelta(days=1)) in date_list_national
    ).astype(int)

    df['day_before_national_holiday'] = df['date'].apply(
        lambda x: (x + pd.Timedelta(days=1)) in date_list_national
    ).astype(int)
    return df

In [64]:
df_train_hl = create_national_vars(df_train_hl, 'National')
df_test_hl = create_national_vars(df_test_hl, 'National')
df_submission_hl = create_national_vars(df_submission_hl, 'National')

- Create binary feature to track national holidays and the days near it: after/before

### Local and Regional Holidays - Feature Engineering and Analysis

In [65]:
df_train_hl['is_Local_holiday'] = (df_train_hl['locale'] == 'Local').astype(int)
df_train_hl['is_Regional_holiday'] = (df_train_hl['locale'] == 'Regional').astype(int)

df_test_hl['is_Local_holiday'] = (df_test_hl['locale'] == 'Local').astype(int)
df_test_hl['is_Regional_holiday'] = (df_test_hl['locale'] == 'Regional').astype(int)

df_submission_hl['is_Local_holiday'] = (df_submission_hl['locale'] == 'Local').astype(int)
df_submission_hl['is_Regional_holiday'] = (df_submission_hl['locale'] == 'Regional').astype(int)

- Create a binary variable for local and regional

In [66]:
from scipy.stats import ttest_ind
def check_holidays(df, locale):    
    results = []
    for store_id, store_data in df.groupby('store_nbr'):
        store_data = store_data.sort_values(by='date')
        print('store: ', store_id)
        # Identify holiday dates for the store
        holidays = holidays_events_fil.loc[holidays_events_fil['locale'] == locale, ['date', 'description']]
        
        for i, holiday in holidays.iterrows():
            holiday_date = holiday['date']
            holiday_description = holiday['description']
            # Get sales for the holiday, the day before, and the day after
            holiday_sales = store_data[(store_data['date'] == holiday_date)][['date', 'sales']]
            
            # Get non-holiday sales for the same month
            same_year = holiday_date.year
            same_month = holiday_date.month
            same_day_of_week = holiday_date.day_of_week
            
            
            non_holiday_sales = store_data[(store_data['date'].dt.month == same_month) &
                                        (store_data['date'].dt.year == same_year) &
                                        (store_data['date'].dt.day_of_week == same_day_of_week) &
                                        (~store_data[f'is_{locale}_holiday']) &
                                        (store_data['date'] != holiday_date)][['date', 'sales']]
            mean_holiday_sales = holiday_sales.groupby('date')['sales'].sum().mean()
            mean_non_holiday_sales = non_holiday_sales.groupby('date')['sales'].sum().mean()
            std_non_holiday_sales = non_holiday_sales.groupby('date')['sales'].sum().std()
                
            results.append({
                'store_id': store_id,
                'holiday_date': holiday_date,
                'holiday_description': holiday_description, 
                'mean_holiday_sales': mean_holiday_sales,
                'mean_non_holiday_sales': mean_non_holiday_sales,
                'std_non_holiday_sales': std_non_holiday_sales,
                'consider': mean_holiday_sales > mean_non_holiday_sales + std_non_holiday_sales,
                'locale': locale
            })
    

    results_df = pd.DataFrame(results)
    return results_df

- This function checks whether local and regional holidays have affected store sales. 
- This is done because we don't know which stores the local and regional holidays apply to. 
- The idea is to see if sales on the day of the holiday were 1 standard deviation above the average sales on the same days of the week in the same month.

In [67]:
# results_train_local = check_holidays(df_train_hl, 'Local')
# results_train_regional = check_holidays(df_train_hl, 'Regional')
# results_train = pd.concat([results_train_local, results_train_regional])

# results_grouped = results_train.groupby(['store_id', 'holiday_description', 'locale'])['consider'].max().reset_index()

# results_grouped.to_csv('./Data/step1/check_holidays.csv', index=False)

results_grouped = pd.read_csv('./Data/step2/check_holidays.csv')
results_grouped.head()

Unnamed: 0,store_id,holiday_description,locale,consider
0,1,Cantonizacion de Cayambe,Local,False
1,1,Cantonizacion de El Carmen,Local,True
2,1,Cantonizacion de Guaranda,Local,False
3,1,Cantonizacion de Latacunga,Local,True
4,1,Cantonizacion de Libertad,Local,False


In [68]:
df_train_hl_valid = df_train_hl.merge(results_grouped, left_on=['description', 'store_nbr', 'locale'], right_on=['holiday_description', 'store_id', 'locale'], how='left')
df_test_hl_valid = df_test_hl.merge(results_grouped, left_on=['description', 'store_nbr', 'locale'], right_on=['holiday_description', 'store_id', 'locale'], how='left')
df_submission_hl_valid = df_submission_hl.merge(results_grouped, left_on=['description', 'store_nbr', 'locale'], right_on=['holiday_description', 'store_id', 'locale'], how='left')

df_train_hl_valid['is_Local_holiday_real'] = (df_train_hl_valid['is_Local_holiday'] & df_train_hl_valid['consider']).astype(int)
df_train_hl_valid['is_Regional_holiday_real'] = (df_train_hl_valid['is_Regional_holiday'] & df_train_hl_valid['consider']).astype(int)

df_test_hl_valid['is_Local_holiday_real'] = (df_test_hl_valid['is_Local_holiday'] & df_test_hl_valid['consider']).astype(int)
df_test_hl_valid['is_Regional_holiday_real'] = (df_test_hl_valid['is_Regional_holiday'] & df_test_hl_valid['consider']).astype(int)


df_submission_hl_valid['is_Local_holiday_real'] = (df_submission_hl_valid['is_Local_holiday'] & df_submission_hl_valid['consider']).astype(int)
df_submission_hl_valid['is_Regional_holiday_real'] = (df_submission_hl_valid['is_Regional_holiday'] & df_submission_hl_valid['consider']).astype(int)

## Encoding

In [69]:

def encode_one_how(X_train, X_test, X_submission, categorical_cols):
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(X_train[categorical_cols])

    # Transform the training and test data
    train_encoded_array = encoder.transform(X_train[categorical_cols])
    test_encoded_array = encoder.transform(X_test[categorical_cols])
    submission_encoded_array = encoder.transform(X_submission[categorical_cols])
    

    # Create DataFrames with the encoded data
    train_encoded_df = pd.DataFrame(train_encoded_array, columns=encoder.get_feature_names_out(categorical_cols), index=X_train.index)
    test_encoded_df = pd.DataFrame(test_encoded_array, columns=encoder.get_feature_names_out(categorical_cols), index=X_test.index)
    submission_encoded_df = pd.DataFrame(submission_encoded_array, columns=encoder.get_feature_names_out(categorical_cols), index=X_submission.index)
    

    # Concatenate the original DataFrame (excluding the original categorical columns) with the encoded DataFrame
    X_train_encoded = pd.concat([X_train.drop(columns=categorical_cols), train_encoded_df], axis=1)
    X_test_encoded = pd.concat([X_test.drop(columns=categorical_cols), test_encoded_df], axis=1)
    X_submission_encoded = pd.concat([X_submission.drop(columns=categorical_cols), submission_encoded_df], axis=1)
    
    return X_train_encoded, X_test_encoded, X_submission_encoded
        

In [70]:
df_train_en_1 = df_train_hl_valid.groupby(['store_nbr', 'date', 'family'])[
    [
        'is_National_holiday', 
        'day_after_national_holiday',
       'day_before_national_holiday',
       'is_Local_holiday',
       'is_Regional_holiday',
       'is_Local_holiday_real', 
       'is_Regional_holiday_real'
       
    ]
].max().reset_index()

df_test_en_1 = df_test_hl_valid.groupby(['store_nbr', 'date', 'family'])[
    [
            'is_National_holiday', 
        'day_after_national_holiday',
       'day_before_national_holiday',
       'is_Local_holiday',
       'is_Regional_holiday',
       'is_Local_holiday_real', 
       'is_Regional_holiday_real'
    ]
].max().reset_index()

df_submission_en_1 = df_submission_hl_valid.groupby(['store_nbr', 'date', 'family'])[
    [
        'is_National_holiday', 
        'day_after_national_holiday',
       'day_before_national_holiday',
       'is_Local_holiday',
       'is_Regional_holiday',
       'is_Local_holiday_real', 
       'is_Regional_holiday_real'
    ]
].max().reset_index()

In [71]:
df_train = pd.concat([df_train, df_train_en_1.drop(columns=['store_nbr', 'date', 'family'])], axis=1)
df_test = pd.concat([df_test, df_test_en_1.drop(columns=['store_nbr', 'date', 'family'])], axis=1)
df_submission = pd.concat([df_submission, df_submission_en_1.drop(columns=['store_nbr', 'date', 'family'])], axis=1)

In [72]:
cols = [
    'store_nbr',
    'family', 
    'onpromotion', 
    'year', 
    'month', 
    'day_of_week', 
    'in_store_list', 
    'day_off_store', 
    'is_National_holiday', 
    'day_after_national_holiday',
    'day_before_national_holiday',
    'is_Local_holiday',
    'is_Regional_holiday',
    'is_Local_holiday_real', 
    'is_Regional_holiday_real'
]

In [73]:
X_train, y_train = df_train[cols], df_train['sales']
X_test, y_test = df_test[cols], df_test['sales']
X_submission = df_submission[cols + ['id']]

In [74]:
X_train_1, X_test_1, X_submission_1 = encode_one_how(X_train, X_test, X_submission, ['family', 'month', 'day_of_week'])

In [75]:
X_train_1.shape, X_test_1.shape, X_submission_1.shape

((2603502, 64), (404514, 64), (28512, 65))

In [76]:
X_train_1.isna().sum().sort_values(ascending=False)

store_nbr                     0
onpromotion                   0
family_LIQUOR,WINE,BEER       0
family_MAGAZINES              0
family_MEATS                  0
                             ..
family_HARDWARE               0
family_HOME AND KITCHEN I     0
family_HOME AND KITCHEN II    0
family_HOME APPLIANCES        0
day_of_week_Wednesday         0
Length: 64, dtype: int64

### Models Evaluation

In [77]:
# Train and evaluate model for each store
def rmsle(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)


#### Compare Models and Training Sets - One Store Prediction

In [78]:
# Define models
models = {
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': GradientBoostingRegressor(random_state=42)
}

sets = {
    'With National, Local and Regional Holidays': [
        X_train_1[X_train_1['store_nbr'] == 10].drop(columns='store_nbr'), 
        y_train[X_train_1['store_nbr'] == 10], 
        X_test_1[X_test_1['store_nbr'] == 10].drop(columns='store_nbr'), 
        y_test[X_test_1['store_nbr'] == 10]
    ]
    
}
def evaluate_models(models, sets):
    results = []

    for set_name, data in sets.items():
        X_train, y_train, X_test, y_test = data
        
        for model_name, model in models.items():
            # Train the model
            model.fit(X_train, y_train)
            
            # Predict on the test set
            y_pred = model.predict(X_test)
            y_pred[y_pred < 0] = 0
            # Calculate Mean Squared Error
            score = rmsle(y_test, y_pred)
            
            # Append results
            results.append({
                'Set': set_name,
                'Model': model_name,
                'RMSLE': score
            })
    return pd.DataFrame(results)

In [79]:
results = evaluate_models(models, sets)
results

Unnamed: 0,Set,Model,RMSLE
0,"With National, Local and Regional Holidays",RandomForest,0.573349
1,"With National, Local and Regional Holidays",XGBoost,1.68104


In [80]:
results.to_csv('./Data/step2/results.csv')

- Results got worst with holidays addition, when compared to step1 results


#### Train for all Stores 

In [82]:


def train_per_store(stores, X_train, y_train, X_test, y_test):
    all_y_true = []
    all_y_pred_rf = []
    models = {}

    for store_nbr in stores:
        print('Store', store_nbr)
        X_train_st = X_train[X_train['store_nbr'] == store_nbr].drop(columns=['store_nbr'])
        X_test_st = X_test[X_test['store_nbr'] == store_nbr].drop(columns=['store_nbr'])
        
        y_train_st = y_train[X_train['store_nbr'] == store_nbr]
        y_test_st = y_test[X_test['store_nbr'] == store_nbr]
        
        # Train Random Forest Regressor
        rf = RandomForestRegressor(random_state=42)
        rf.fit(X_train_st, y_train_st) 
        # Make predictions
        y_pred_rf = rf.predict(X_test_st)
        # Append true values and predictions to the lists
        all_y_true.extend(y_test_st)
        all_y_pred_rf.extend(y_pred_rf)
        
        
        models[store_nbr] = {
            'rf': rf
        }
    
    total_rmsle_rf = rmsle(np.array(all_y_true), np.array(all_y_pred_rf))

    return {
        'Random Forest': total_rmsle_rf,
        'models': models
    }
        
def predict_per_store(models, X_submission, model_name='rf'):
    stores = X_submission['store_nbr'].unique()
    predictions = {'id': [], 'sales': []}
    for store in stores:
        X_submission_st = X_submission[X_submission['store_nbr'] == store]
        ids = X_submission_st['id']
        X_submission_st = X_submission_st.drop(columns=['id', 'store_nbr'])
        y_pred = models[store][model_name].predict(X_submission_st)

        predictions['id'] += list(ids)
        predictions['sales'] += list(y_pred)
        
    predictions_df = pd.DataFrame(predictions)
    return predictions_df
        


In [83]:
stores = df_train['store_nbr'].unique()
result = train_per_store(stores, X_train_1, y_train, X_test_1, y_test)
print('RMLSE with One hot', result['Random Forest'])

Store 1


Store 10
Store 11
Store 12
Store 13
Store 14
Store 15
Store 16
Store 17
Store 18
Store 19
Store 2
Store 20
Store 21
Store 22
Store 23
Store 24
Store 25
Store 26
Store 27
Store 28
Store 29
Store 3
Store 30
Store 31
Store 32
Store 33
Store 34
Store 35
Store 36
Store 37
Store 38
Store 39
Store 4
Store 40
Store 41
Store 42
Store 43
Store 44
Store 45
Store 46
Store 47
Store 48
Store 49
Store 5
Store 50
Store 51
Store 52
Store 53
Store 54
Store 6
Store 7
Store 8
Store 9
RMLSE with One hot 0.7520622232681453


In [84]:
second_submission_df = predict_per_store(result['models'], X_submission_1)
second_submission_df.to_csv('./Data/submssions/submission_step2.csv', index=False)

## Save Models

In [81]:
df_train.to_csv('./Data/step2/df_train.csv', index=False)
df_test.to_csv('./Data/step2/df_test.csv', index=False)
df_submission.to_csv('./Data/step2/df_submission.csv', index=False)