In [107]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import set_config
from feature_engine.datetime import DatetimeFeatures
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
import seaborn as sns
import numpy as np
import warnings

In [108]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and library warnings
# are hidden as well; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Ask scikit-learn transformers to return pandas DataFrames from transform().
set_config(transform_output="pandas")

In [109]:
# Name of the label column; it is the column later split off as `y`.
target_column = "num_sold"

In [110]:
# Widen pandas' console rendering: show up to 200 columns, 150 characters wide.
pd.options.display.max_columns = 200
pd.options.display.width = 150

In [111]:
def drop_missing(data):
    """Return a new frame containing only the rows of ``data`` with no missing value."""
    return data.dropna()

In [112]:
# Read the raw files; only the training rows are filtered for missing values.
train = drop_missing(pd.read_csv("train.csv"))
test = pd.read_csv("test.csv")

# Stack train above test under one fresh 0..n-1 index so both parts go
# through the same feature engineering.
data = pd.concat([train, test], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319809 entries, 0 to 319808
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        319809 non-null  int64  
 1   date      319809 non-null  object 
 2   country   319809 non-null  object 
 3   store     319809 non-null  object 
 4   product   319809 non-null  object 
 5   num_sold  221259 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 14.6+ MB


In [113]:
# Parse the 'date' strings into datetime values so date parts can be extracted.
data['date'] = pd.to_datetime(data['date'])

In [114]:
# Re-inspect the frame to confirm the dtype of 'date' after parsing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319809 entries, 0 to 319808
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   id        319809 non-null  int64         
 1   date      319809 non-null  datetime64[ns]
 2   country   319809 non-null  object        
 3   store     319809 non-null  object        
 4   product   319809 non-null  object        
 5   num_sold  221259 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 14.6+ MB


In [115]:
def date_features(data):
    """Expand the ``date`` column into calendar features, then drop it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a datetime ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``DatetimeFeatures.fit_transform`` with an extra
        integer ``days_since_start`` column and without ``date``.
    """
    calendar_parts = [
        "month", "year", "week",
        "day_of_week", "day_of_month", "day_of_year",
        "weekend",
        "month_start", "month_end",
        "year_start", "year_end",
        "quarter_start", "quarter_end", "quarter",
    ]
    # drop_original=False keeps 'date' available for the elapsed-days feature below.
    dtf = DatetimeFeatures(features_to_extract=calendar_parts, drop_original=False)
    data = dtf.fit_transform(data)

    # Count days from the earliest date present rather than from whichever row
    # happens to be first: the result no longer depends on row order, and an
    # empty frame yields an empty column instead of raising on iloc[0].
    start = data["date"].min()
    data["days_since_start"] = (data["date"] - start).dt.days
    return data.drop(columns="date")

# Replace the combined frame with its feature-engineered version.
# NOTE: date_features drops the 'date' column it reads, so re-running this
# cell on the already-transformed frame will fail; re-run from the load cell.
data = date_features(data)

In [116]:
# Cast the three string identifier columns to pandas' category dtype.
for column in ("country", "store", "product"):
    data[column] = data[column].astype('category')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319809 entries, 0 to 319808
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   id                  319809 non-null  int64   
 1   country             319809 non-null  category
 2   store               319809 non-null  category
 3   product             319809 non-null  category
 4   num_sold            221259 non-null  float64 
 5   date_month          319809 non-null  int32   
 6   date_year           319809 non-null  int32   
 7   date_week           319809 non-null  int64   
 8   date_day_of_week    319809 non-null  int32   
 9   date_day_of_month   319809 non-null  int32   
 10  date_day_of_year    319809 non-null  int32   
 11  date_weekend        319809 non-null  int64   
 12  date_month_start    319809 non-null  int64   
 13  date_month_end      319809 non-null  int64   
 14  date_year_start     319809 non-null  int64   
 15  date_year_end    

In [117]:
# Rows with a known target are training rows; the rest are the rows to predict.
is_labelled = data[target_column].notna()
train_data = data[is_labelled]
test_data = data[~is_labelled].drop(columns=target_column)

# Keep the ids aside for the submission file, then remove them from both
# frames so they are not used as a feature.
test_id = test_data[['id']].copy()
train_data = train_data.drop(columns='id')
test_data = test_data.drop(columns='id')

In [118]:
# Label series first, then the feature matrix (every other training column).
y = train_data[target_column]
X = train_data.drop(columns=target_column)

In [None]:
# Six-split cross-validation splitter.
# NOTE(review): the variable is named `skf`, but the object is a
# TimeSeriesSplit; a name such as `tscv` would describe it better.
skf = TimeSeriesSplit(n_splits=6)

In [137]:
# Training parameters shared by the cross-validation runs in this cell.
params = {
    'categorical_feature': 'name:country,store,product',
    'objective': 'regression',
    'metric': 'l1',
    'n_jobs': -1,
    'seed': 42,
}

# Train one model per split from `skf` and collect its validation L1 score.
scores = []
for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fold-local names: binding these to `train_data` would replace the
    # training DataFrame with an lgb.Dataset and break a re-run of the cell
    # that builds X and y from it.
    fold_train_set = lgb.Dataset(X_train, label=y_train)
    # Tie the validation set to the training set it is evaluated against.
    fold_val_set = lgb.Dataset(X_val, label=y_val, reference=fold_train_set)

    model = lgb.train(
        params,
        fold_train_set,
        valid_sets=[fold_val_set],
    )

    # 'valid_0' is the default name of the first entry in valid_sets.
    scores.append(model.best_score['valid_0']['l1'])
print("Mean MAE score:", np.mean(scores))

Mean MAE score: 79.3284318179176


In [138]:
# Fit the final model on every labelled row. The Dataset gets its own name
# instead of being assigned to `train_data`, which an earlier cell uses for
# the training DataFrame.
full_train_set = lgb.Dataset(X, label=y)

model = lgb.train(
    params,
    full_train_set,
)

In [139]:
def getPrediction(estimator, test_id, test_data, target):
    """Build a submission frame from an estimator's predictions.

    Parameters
    ----------
    estimator : object
        Anything exposing ``predict(test_data)``.
    test_id : pandas.DataFrame
        Identifier column(s) for the rows being predicted. Left unmodified.
    test_data : pandas.DataFrame
        Feature rows handed to ``estimator.predict``.
    target : str
        Name of the prediction column to add.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test_id`` with the predictions stored under ``target``.
    """
    y_pred_submission = estimator.predict(test_data)

    # Copy first: a plain assignment would only alias the caller's frame, and
    # adding the prediction column would then mutate it as a side effect.
    test_submission = test_id.copy()
    test_submission[target] = y_pred_submission

    return test_submission

In [140]:
# Predict the unlabelled rows with the final model and write the result to
# CSV without the index column.
test_submission = getPrediction(model, test_id, test_data, target_column)
test_submission.to_csv("lgbm_gbdt_basic.csv", index=False)