In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
import lightgbm as lgb


In [4]:
data = pd.read_csv("../data/new/feature_dataset.csv")

data = data.sort_values(
    ["shop_id", "item_id", "date_block_num"]
).reset_index(drop=True)

print(data.shape)
data.head()


(7971687, 16)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_category_id_x,item_category_id_y,lag_1,lag_2,lag_3,item_avg,item_avg_lag_1,shop_month_avg,shop_month_avg_lag_1,cat_month_avg,cat_month_avg_lag_1,month_of_year
0,20,2,1,0.0,76,76,0.0,0.0,0.0,0.02381,0.02381,0.137869,0.143014,0.062254,0.061585,8
1,21,2,1,0.0,76,76,0.0,0.0,0.0,0.02381,0.02381,0.138933,0.137869,0.062988,0.062254,9
2,3,2,27,0.0,19,19,0.0,0.0,1.0,0.056834,0.056834,0.071823,0.09046,0.688007,1.093738,3
3,4,2,27,0.0,19,19,0.0,0.0,0.0,0.056834,0.056834,0.066315,0.071823,0.672761,0.688007,4
4,5,2,27,0.0,19,19,0.0,0.0,0.0,0.056834,0.056834,0.095305,0.066315,0.839524,0.672761,5


In [6]:
FEATURES = [
    "lag_1", "lag_2", "lag_3",
    "item_avg_lag_1",
    "shop_month_avg_lag_1",
    "cat_month_avg_lag_1",
    "month_of_year"
]

TARGET = "item_cnt_month"


In [9]:
#dividimos en set de train y de validación

train_df = data[data["date_block_num"] < 33]
valid_df = data[data["date_block_num"] == 33]

#separamos las variables independientes y la dependiente
X_train = train_df[FEATURES]
y_train = train_df[TARGET]

X_valid = valid_df[FEATURES]
y_valid = valid_df[TARGET]

print(X_train.shape, X_valid.shape)


(7791586, 7) (180101, 7)


In [12]:
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


In [14]:
model.fit(X_train, y_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.177792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 840
[LightGBM] [Info] Number of data points in the train set: 7791586, number of used features: 7
[LightGBM] [Info] Start training from score 0.246113


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [17]:
y_pred = model.predict(X_valid)
y_pred = np.clip(y_pred, 0, 20)


In [19]:
rmse_lgbm = np.sqrt(mean_squared_error(y_valid, y_pred))
rmse_lgbm


np.float64(0.6409438475737749)

In [20]:
importances = pd.Series(
    model.feature_importances_,
    index=FEATURES
).sort_values(ascending=False)

importances


cat_month_avg_lag_1     1863
item_avg_lag_1          1525
month_of_year           1390
lag_1                   1366
shop_month_avg_lag_1    1203
lag_2                    892
lag_3                    761
dtype: int32

In [22]:
import joblib

joblib.dump(model, "../modelos/lightgbm_model.joblib")
print("Modelo guardado")


Modelo guardado
