### I- Importation des librairies

In [45]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import gc
import lightgbm as lgb
import joblib
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

### II- Importation des données

In [46]:
%%time
calendar = pd.read_csv(r"C:\Users\naouf\OneDrive\Bureau\Datas\calendar.csv")
train_eva = pd.read_csv(r"C:\Users\naouf\OneDrive\Bureau\Datas\sales_train_evaluation.csv")
sell_prices = pd.read_csv(r"C:\Users\naouf\OneDrive\Bureau\Datas\sell_prices.csv")
sample_sub = pd.read_csv(r"C:\Users\naouf\OneDrive\Bureau\Datas\sample_submission.csv")

Wall time: 8.69 s


In [47]:
for d in range(1942,1970):
    col = 'd_' + str(d)
    train_eva[col] = 0
    train_eva[col] = train_eva[col].astype(np.int16)

#### Fonction permettant le downcast des données

In [48]:
def downcast(df):
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df  

In [49]:
train_eva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1975 entries, id to d_1969
dtypes: int16(28), int64(1941), object(6)
memory usage: 454.5+ MB


In [50]:
%%time
print("Downcasting data")
train_eva = downcast(train_eva)
sell_prices = downcast(sell_prices)
calendar = downcast(calendar)

Downcasting data
Wall time: 3min 11s


In [51]:
train_eva.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30490 entries, 0 to 30489
Columns: 1975 entries, id to d_1969
dtypes: category(6), int16(1317), int8(652)
memory usage: 97.1 MB


In [52]:
%%time
print("Melting data")
df = pd.melt(frame=train_eva, 
             id_vars=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"],
             var_name="d", value_name="sold")

Melting data
Wall time: 7.83 s


In [53]:
%%time
print("Merging data")
df = pd.merge(left=df, right=calendar, how="left", on="d")
df = pd.merge(left=df, right=sell_prices, on=["store_id", "item_id", "wm_yr_wk"], how="left")

Merging data
Wall time: 34.1 s


In [54]:
%%time
print("Implement features")
df["snap"] = df["snap_CA"] + df["snap_TX"] + df["snap_WI"]
df["snap"] = np.where(df["snap"] >= 1, 1, 0).astype(np.int8)

df["d"] = df["d"].str[2:].astype(np.int16)

df["sell_price"] = df['sell_price'].fillna(df.groupby('id')['sell_price'].transform('median'))

df["weekend"] = np.where(df["wday"] < 3, 1, 0).astype(np.int8)
df = df.drop(["date", "weekday", "wm_yr_wk", "event_name_2", "event_type_2", "snap_CA", "snap_TX", "snap_WI"], axis=1)

Implement features
Wall time: 24.7 s


In [55]:
print("Label Encoding")
d_id = dict(zip(df["id"].cat.codes, df["id"]))
df["id"] = df["id"].cat.codes
df["item_id"] = df["item_id"].cat.codes
df["dept_id"] = df["dept_id"].cat.codes
df["cat_id"] = df["cat_id"].cat.codes
df["store_id"] = df["store_id"].cat.codes
df["state_id"] = df["state_id"].cat.codes
df["event_name_1"] = df["event_name_1"].cat.codes
df["event_type_1"] = df["event_type_1"].cat.codes

Label Encoding


In [56]:
%%time
print("Mean Encoding")
df["state_mean"] = df.groupby("state_id")["sold"].transform("mean").astype(np.float16)
df["store_mean"] = df.groupby("store_id")["sold"].transform("mean").astype(np.float16)
df["cat_mean"] = df.groupby("cat_id")["sold"].transform("mean").astype(np.float16)
df["dept_mean"] = df.groupby("dept_id")["sold"].transform("mean").astype(np.float16)
df["state_cat_mean"] = df.groupby(["state_id", "cat_id"])["sold"].transform("mean").astype(np.float16)
df["state_dept_mean"] = df.groupby(["state_id", "dept_id"])["sold"].transform("mean").astype(np.float16)
df["store_cat_mean"] = df.groupby(["store_id", "cat_id"])["sold"].transform("mean").astype(np.float16)
df["store_cat_mean"] = df.groupby(["dept_id", "cat_id"])["sold"].transform("mean").astype(np.float16)
df["item_id_mean"] = df.groupby("item_id")["sold"].transform("mean").astype(np.float16)
df["item_state_mean"] = df.groupby(["item_id", "state_id"])["sold"].transform("mean").astype(np.float16)
df["item_store_mean"] = df.groupby(["item_id", "store_id"])["sold"].transform("mean").astype(np.float16)

Mean Encoding
Wall time: 40.7 s


In [57]:
%%time
print("Calulating Lags")
lags = [29,30,31,32,33,34,35,40,55,60,65,180]
for lag in lags:
    df['sold_lag_'+str(lag)] = df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],as_index=False)['sold'].shift(lag).astype(np.float16)

Calulating Lags
Wall time: 2min 21s


In [58]:
df = df[df['d']>max(lags)]

In [59]:
df.to_pickle('data.pkl')
del df, calendar, sell_prices, train_eva
gc.collect()

0

In [60]:
# Build model GBM
def lightgbm_model(X_train, y_train, X_valid, y_valid):
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=224,
        min_child_weight=300
    )
        
    model.fit(X_train, y_train, 
          eval_set=[(X_train,y_train),(X_valid,y_valid)], 
          verbose=20, 
          early_stopping_rounds=20,
          eval_metric='rmse')
    
    return model

def xgboost_model(X_train, y_train, X_valid, y_valid):
    model = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        min_child_weight=300
    )

    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=20,
              early_stopping_rounds=20,
              eval_metric='rmse')

    return model

def catboost_model(X_train, y_train, X_valid, y_valid):
    
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bylevel=0.8,
        depth=8,
        l2_leaf_reg=224,
        min_child_samples=300,
        eval_metric='RMSE'
    )
    
    model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
              verbose=20, early_stopping_rounds=20)
    
    return model

In [61]:
%%time
data = pd.read_pickle('data.pkl')
valid = data[(data['d']>=1914) & (data['d']<1942)][['id','d','sold']]
test = data[data['d']>=1942][['id','d','sold']]

Wall time: 2.32 s


In [62]:
%%time
for i in range(10):
    df = data[data["store_id"] == i]    
    #Create train set
    X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
    X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
    X_test = df[df["d"] >= 1942].drop("sold", axis=1)
       
    # Create model
    print(f"Train model for store {i}")
    print("--------")
    
    model = lightgbm_model(X_train, y_train, X_valid, y_valid)
    #model = xgboost_model(X_train, y_train, X_valid, y_valid)
    #model = catboost_model(X_train, y_train, X_valid, y_valid)
    
    print("--------")
    print(f"Predicting for store {i}")    
    # Validation predict
    pred_val = model.predict(X_valid)
    valid.loc[X_valid.index, "sold"] = pred_val
    pred_eva = model.predict(X_test)
    test.loc[X_test.index, "sold"] = pred_eva   
    print("--------")
    
    print("Saving model and clear memories")  
    print("--------")
    filename = f'model_store_{i}.pkl'
    # Save model and Clear memmory
    joblib.dump(model, filename)
    del model, X_train, y_train, X_valid, y_valid, X_test
    gc.collect()

Train model for store 0
--------
[20]	training's rmse: 2.50232	training's l2: 6.26161	valid_1's rmse: 2.29141	valid_1's l2: 5.25057
[40]	training's rmse: 2.48014	training's l2: 6.15111	valid_1's rmse: 2.29437	valid_1's l2: 5.26414
[60]	training's rmse: 2.45214	training's l2: 6.013	valid_1's rmse: 2.28593	valid_1's l2: 5.22547
--------
Predicting for store 0
--------
Saving model and clear memories
--------
Train model for store 1
--------
[20]	training's rmse: 1.88199	training's l2: 3.54189	valid_1's rmse: 2.09477	valid_1's l2: 4.38808
[40]	training's rmse: 1.85842	training's l2: 3.45374	valid_1's rmse: 2.08817	valid_1's l2: 4.36046
--------
Predicting for store 1
--------
Saving model and clear memories
--------
Train model for store 2
--------
[20]	training's rmse: 3.56822	training's l2: 12.7322	valid_1's rmse: 2.81766	valid_1's l2: 7.93922
--------
Predicting for store 2
--------
Saving model and clear memories
--------
Train model for store 3
--------
[20]	training's rmse: 1.43526	

In [63]:
sample_sub = sample_sub[["id"]]
f_col = [f"F{i}" for i in range(1,29)]
f_col.insert(0, "id")

valid["id"] = valid["id"].map(d_id)
valid = valid.pivot(index="id", columns="d", values="sold").reset_index()
valid["id"] = valid["id"].str.replace("evaluation", "validation")
out_val = pd.merge(left=sample_sub[:30490], right=valid, on="id")
out_val.columns=f_col

test["id"] = test["id"].map(d_id)
test = test.pivot(index="id", columns="d", values="sold").reset_index()
out_eva = pd.merge(left=sample_sub[30490:], right=test, on="id")
out_eva.columns=f_col

submit = pd.concat([out_val,out_eva], ignore_index=True)

In [64]:
submit

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.786135,0.728051,0.527789,0.406478,0.463081,0.540610,0.575017,0.483143,0.384837,...,0.432239,0.617943,1.107375,0.606690,0.510896,0.421307,0.472486,0.788749,0.776618,0.753751
1,HOBBIES_1_002_CA_1_validation,0.503977,0.407383,0.359599,0.376526,0.367939,0.547303,0.429715,0.275735,0.280935,...,0.231927,0.345218,0.281747,0.230204,0.184587,0.242087,0.310907,0.334031,0.408080,0.370977
2,HOBBIES_1_003_CA_1_validation,0.319535,0.234652,0.227216,0.235839,0.336530,0.263868,0.270713,0.304734,0.207785,...,0.385486,0.629181,0.555395,0.563939,0.445006,0.484341,0.526423,0.501844,0.483560,0.390987
3,HOBBIES_1_004_CA_1_validation,2.822900,2.327582,1.802434,1.908198,1.634016,3.275526,2.752721,1.742862,1.747454,...,2.126269,3.017091,2.932041,2.217398,2.064761,1.876366,1.635029,2.111130,3.148377,3.708240
4,HOBBIES_1_005_CA_1_validation,1.020217,0.964287,0.731538,0.811685,1.260453,1.875267,1.699542,1.467668,0.911569,...,1.178146,1.383973,1.368659,0.992045,0.805172,0.904184,0.893554,0.947279,1.446944,1.448625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.448699,0.399818,0.531386,0.414390,0.549958,0.759865,0.710720,0.436329,0.506145,...,0.534572,0.650440,0.678134,0.599150,0.520929,0.603879,0.528376,0.596878,0.610829,1.019984
60976,FOODS_3_824_WI_3_evaluation,0.260568,0.348292,0.360092,0.368051,0.471454,0.585071,0.600402,0.330000,0.420239,...,0.240936,0.283336,0.448888,0.264552,0.385552,0.251743,0.219005,0.365051,0.326021,0.417642
60977,FOODS_3_825_WI_3_evaluation,0.687490,0.626359,0.720826,0.694032,0.666852,0.822224,1.023392,0.728530,0.819418,...,1.234014,1.084901,1.134413,1.103861,1.121025,0.875806,0.597211,0.684677,0.917394,1.143785
60978,FOODS_3_826_WI_3_evaluation,0.739777,0.761018,0.884794,0.725942,1.116817,0.862807,1.135681,0.716268,0.749625,...,0.717093,1.099658,0.713096,0.657718,0.673710,0.828179,0.704058,0.837016,1.302728,0.843008


In [65]:
submit.to_csv('submission.csv',index=False)