## Gradient Booosting machine

In [3]:
!pip install numpy pandas matplotlib seaborn scikit-learn graphviz lightgbm xgboost lightgbm --quiet

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [5]:
ross_df = pd.read_csv('db/train.csv')
store_df = pd.read_csv('db/store.csv')
test_df = pd.read_csv('db/test.csv')
submission_df = pd.read_csv('db/sample_submission.csv')


  ross_df = pd.read_csv('db/train.csv')


In [6]:
merged_df = pd.merge(ross_df, store_df, on='Store', how='left')
merged_test_df = pd.merge(test_df, store_df, on='Store', how='left')

In [7]:
def split_date(df):
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['WeekOfYear'] = df['Date'].dt.isocalendar().week

In [8]:
split_date(merged_df)
split_date(merged_test_df)

In [9]:
merged_df = merged_df[merged_df['Open'] != 0].copy()

1. Competition

In [10]:
def comp_months(df):
    df['competitionOpen'] = 12 * (df.Year - df.CompetitionOpenSinceYear) + (df.Month - df.CompetitionOpenSinceMonth)
    df['competitionOpen'] = df['competitionOpen'].map(lambda x: 0 if x < 0 else x).fillna(0)

In [11]:
comp_months(merged_df)
comp_months(merged_test_df)

2. Promo

In [12]:
def check_promo_month(row):
    month2Str = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun',
                 7:'Jul', 8:'Aug', 9:'Sept', 10:'Oct', 11:'Nov', 12:'Dec'}
    try:
        pi = row.get('PromoInterval', None)
        if pd.isna(pi) or not row.get('Promo2Open', 0):
            return 0
        months = [m.strip() for m in str(pi).split(',')]
        return 1 if month2Str.get(int(row['Month'])) in months else 0
    except Exception:
        return 0

In [13]:
def promo_cols(df):
    df['Promo2Open'] = 12 * (df.Year - df.Promo2SinceYear) + (df.Month - df.Promo2SinceWeek // 4)
    df['Promo2Open'] = df['Promo2Open'].map(lambda x: 0 if x < 0 else x).fillna(0)
    df['isPromoMonth'] = df.apply(check_promo_month, axis=1) * df.Promo2

In [18]:
input_cols = ['Store','DayOfWeek','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment', 'CompetitionDistance','competitionOpen','Day','Month','Year','WeekOfYear','Promo2','Promo2Open','isPromoMonth']
target_col ='Sales'

In [19]:
promo_cols(merged_df)
promo_cols(merged_test_df)


In [32]:
inputs =merged_df[input_cols].copy()
targets = merged_df[target_col].copy()
test_inputs = merged_test_df[input_cols].copy()


In [26]:
numeric_cols = ['Store','Promo','SchoolHoliday', 'CompetitionDistance','competitionOpen','Day','Month','Year','WeekOfYear','Promo2','Promo2Open','isPromoMonth']
categorical_cols = ['DayOfWeek','StateHoliday','StoreType','Assortment']

In [27]:
max_distance  = inputs['CompetitionDistance'].max()
inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(max_distance *2 , inplace=True)
merged_test_df['CompetitionDistance'] = merged_test_df['CompetitionDistance'].fillna(max_distance *2 , inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(max_distance *2 , inplace=True)
  inputs['CompetitionDistance'] = inputs['CompetitionDistance'].fillna(max_distance *2 , inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_test_df['CompetitionD

$$Scaler 

In [33]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
inputs[numeric_cols] = scaler.fit_transform(inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])


In [36]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
for col in categorical_cols:
    inputs[col] = inputs[col].astype(str)
    test_inputs[col] = test_inputs[col].astype(str)

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(inputs[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))



In [41]:
from xgboost import XGBRegressor
model = XGBRegressor(random_state=42, n_jobs=-1, n_estimators=20, max_depth=4)

In [None]:
model.fit(inputs, targets)

$$KFold

In [25]:
from sklearn.model_selection import KFold


In [None]:
from sklearn.metrics import mean_squared_error

def rmse(a, b):
    return mean_squared_error(a, b, squared=False)

In [None]:
def train_and_evaluate(X_train, train_targets, X_val, val_targets, **params):
    model = XGBRegressor(random_state=42, n_jobs=-1, **params)
    model.fit(X_train, train_targets)
    train_rmse = rmse(model.predict(X_train), train_targets)
    val_rmse = rmse(model.predict(X_val), val_targets)
    return model, train_rmse, val_rmse

In [None]:
kfold = KFold(n_splits=5)

In [None]:
models = []

for train_idxs, val_idxs in kfold.split(X):
    X_train, train_targets = X.iloc[train_idxs], targets.iloc[train_idxs]
    X_val, val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]
    model, train_rmse, val_rmse = train_and_evaluate(X_train, 
                                                     train_targets, 
                                                     X_val, 
                                                     val_targets, 
                                                     max_depth=4, 
                                                     n_estimators=20)
    models.append(model)
    print('Train RMSE: {}, Validation RMSE: {}'.format(train_rmse, val_rmse))

function to average predication from the 5 different models 

In [None]:
import numpy as np

def predict_avg(models, inputs):
    return np.mean([model.predict(inputs) for model in models], axis=0)

Here's a helper function to test hyperparameters with K-fold cross validation.



In [None]:
def test_params_kfold(n_splits, **params):
    train_rmses, val_rmses, models = [], [], []
    kfold = KFold(n_splits)
    for train_idxs, val_idxs in kfold.split(X):
        X_train, train_targets = X.iloc[train_idxs], targets.iloc[train_idxs]
        X_val, val_targets = X.iloc[val_idxs], targets.iloc[val_idxs]
        model, train_rmse, val_rmse = train_and_evaluate(X_train, train_targets, X_val, val_targets, **params)
        models.append(model)
        train_rmses.append(train_rmse)
        val_rmses.append(val_rmse)
    print('Train RMSE: {}, Validation RMSE: {}'.format(np.mean(train_rmses), np.mean(val_rmses)))
    return models