In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.3-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.3


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

In [4]:
import seaborn as sns

# Load data

In [5]:
# Seed NumPy's global RNG so that stochastic steps are repeatable.
np.random.seed(42)

# Parquet files holding the engineered features for the train and test periods.
data_path = './competitive-data-science-predict-future-sales/transformed_data/work_df_after_feature_engineering__train.parquet'
test_path = './competitive-data-science-predict-future-sales/transformed_data/work_df_after_feature_engineering__test.parquet'

# Load the training table and drop sub-category 25 and shop 9 in one chained step.
data = pd.read_parquet(data_path).loc[
    lambda frame: (frame.sub_cat != 25) & (frame.shop_id != 9)
]
print(len(data))
data.columns

FileNotFoundError: ./competitive-data-science-predict-future-sales/transformed_data/work_df_after_feature_engineering__train.parquet

In [None]:
# `sample` is the submission template whose item_cnt_month column make_sabmission overwrites;
# `test` holds the rows make_sabmission left-joins the test features onto (by shop_id, item_id).
sample = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')

# Constants and cols

In [None]:
# Month-block window used to slice the data; `months` covers start_month .. end_month - 1.
start_month, end_month = (24, 32)
predict_month = 28
# Number of months in each training window of the time-series cross-validation.
train_size = 6
months = list(range(start_month,end_month))
# Column positions passed to LightGBM as `categorical_feature`.
cat_cols = [1,2,3]

# Feature columns selected by prepare_num_data / make_sabmission.
# The closing bracket sits on its own line: written as `[#num cols]` the comment
# swallows the `]` and the cell fails with a SyntaxError.
num_cols = [
    # num cols
]


# Functions (time split, cross-validation, grid search, data preparation, submission)



In [None]:
def root_mean_squared_error(true, pred):
    """Return the root mean squared error between ``true`` and ``pred``.

    The ``squared=False`` flag of ``mean_squared_error`` was deprecated in
    scikit-learn 1.4 and later removed, so the square root is taken here
    instead. The per-output MSE is rooted first and then averaged, which
    matches what ``squared=False`` used to return (for a single target this
    is simply ``sqrt(MSE)``).

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    pred : array-like
        Predicted target values, same length as ``true``.

    Returns
    -------
    float
        The RMSE (uniform average over outputs when targets are 2-D).
    """
    per_output_mse = mean_squared_error(true, pred, multioutput='raw_values')
    return np.sqrt(per_output_mse).mean()

### Cross-validation

In [None]:
from ctypes import ArgumentError
class TimeSeriesSplitter:
    """Generate train/test row indices for walk-forward validation over dates.

    Each fold trains on a block of consecutive entries of ``dates`` and tests
    on the entry that immediately follows the block.

    Parameters
    ----------
    X : array indexable as ``X[:, date_index]``
        Feature matrix whose ``date_index`` column holds each row's date value.
    dates : sequence
        Ordered date values that define the folds.
    date_index : int
        Column of ``X`` containing the date value.
    min_train_size : int
        Number of dates in the first training block (must be >= 1).
    rolling : bool, default True
        If True the training block slides forward keeping ``min_train_size``
        dates; if False it always starts at ``dates[0]`` and grows.
    """

    def __init__ (self, X, dates, date_index, min_train_size, rolling = True):
        self.X = X
        self.dates = dates
        self.date_index = date_index
        self.min_train_size = min_train_size
        self.rolling = rolling

    def split(self):
        """Yield ``(train_indices, test_indices)`` arrays, one pair per fold.

        Raises ``ArgumentError`` (on first iteration) when ``min_train_size < 1``.
        """
        if self.min_train_size < 1:
            raise ArgumentError()
        fold_count = len(self.dates) - self.min_train_size
        for offset in range(fold_count):
            # Position in `dates` of the date held out for testing in this fold.
            held_out = offset + self.min_train_size
            first = offset if self.rolling else 0
            date_values = self.X[:, self.date_index]
            in_train = np.isin(date_values, self.dates[first : held_out])
            in_test = date_values == self.dates[held_out]
            yield np.nonzero(in_train)[0], np.nonzero(in_test)[0]

In [None]:
def cross_val_score(X, y, model, cat_fet, months, train_size,  metric=root_mean_squared_error, plot = False, transformer = None, trial = None):
    """Evaluate ``model`` fold by fold with ``TimeSeriesSplitter`` and return the errors.

    For every fold the model is refitted on the training rows, predictions on
    the held-out rows are floored at zero, and ``metric`` is recorded.

    Parameters
    ----------
    X, y : numpy arrays
        Features and targets; column 0 of ``X`` is used as the date column.
    model : estimator with ``fit(X, y)`` and ``predict(X)``
    cat_fet : unused
    months : sequence
        Ordered date values handed to the splitter.
    train_size : int
        Number of dates per training window.
    metric : callable ``(y_true, y_pred) -> float``
    plot : bool
        When True, draw the per-fold error curve.
    transformer : optional object with ``fit_transform`` / ``transform``
        Refitted on each fold's training rows and applied to both parts.
    trial : unused

    Returns
    -------
    list
        One metric value per fold.
    """
    fold_scores = []
    splitter = TimeSeriesSplitter(X, months, 0, min_train_size=train_size)
    for train_rows, test_rows in splitter.split():
        X_fit, y_fit = X[train_rows], y[train_rows]
        X_eval, y_eval = X[test_rows], y[test_rows]
        # The index arrays are no longer needed once the slices exist.
        del train_rows, test_rows
        if transformer:
            X_fit = transformer.fit_transform(X_fit)
            X_eval = transformer.transform(X_eval)
        model.fit(X_fit, y_fit)
        # Negative predictions are replaced by zero before scoring.
        floored = [0 if value < 0 else value for value in model.predict(X_eval)]
        fold_scores.append(metric(y_eval, floored))
    if plot:
        plt.title('Cross-validation errors')
        plt.plot(list(range(len(fold_scores))), fold_scores)
        plt.ylabel('RMSE')
        plt.xlabel('Folds')
        plt.show()
    return fold_scores

In [None]:
def validate(experiment, X, y, cat_cols, months, loss, train_size):
        """Select an MLflow experiment, run ``train_mlflow`` and print error statistics.

        Prints the errors returned by ``train_mlflow`` followed by their standard
        deviation, mean and median, then calls ``lgb.plot_metric`` on the model.

        Parameters: ``experiment`` is passed to ``mlflow.set_experiment``; ``X``,
        ``y``, ``cat_cols``, ``months`` and ``train_size`` are forwarded to
        ``train_mlflow``; ``loss`` is accepted but not used in this body.

        NOTE(review): ``mlflow`` is not imported and ``train_mlflow`` is not defined
        in the visible notebook, and ``model`` / ``transformer`` are read from
        notebook globals (``transformer`` is never assigned in view) — confirm
        these exist before relying on this function after a kernel restart.
        """
        mlflow.set_experiment(experiment)
        # `model` and `transformer` come from the notebook's global namespace, not from the arguments.
        errors = train_mlflow(model, X, y, cat_cols, months, train_size, transformer)
        print('ERRORS:  ', errors)
        print('STD       MEAN       MEDIAN')
        print(np.std(errors), np.mean(errors), np.median(errors))
        lgb.plot_metric(model)

### Data processing

In [None]:
def prepare_num_data(data, min_block, max_block, cols, transformer=None):
    """Slice a month window out of ``data`` and return ``(X, y)`` as numpy arrays.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date_block_num``, ``item_cnt_month`` and every name in ``cols``.
    min_block, max_block : int
        Inclusive bounds on ``date_block_num``.
    cols : list
        Feature columns, in the order they should appear in ``X``.
    transformer : optional object with ``fit`` and ``transform``
        When given, it is fitted on the selected feature matrix and the
        transformed matrix is what gets returned. (Previously the transformed
        result was computed and then thrown away, so callers always received
        the raw features.)

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is ``item_cnt_month`` with negatives replaced by 0.
    """
    window = data[(data.date_block_num >= min_block) & (data.date_block_num <= max_block)]
    # np.maximum builds a fresh array rather than writing into the buffer returned by
    # to_numpy(), which can be a read-only view of the frame (pandas copy-on-write).
    y = np.maximum(window['item_cnt_month'].to_numpy(), 0)
    X = window[cols].to_numpy()
    if transformer:
        # Fit on the numpy matrix so later transform() calls on numpy input match.
        transformer.fit(X)
        X = transformer.transform(X)
    return (X, y)

In [None]:
def make_sabmission(model, num_cols, subm_name, transformer = None):
    """Predict on the test set, write the submission CSV and return the frame.

    Reads the test feature table from the global ``test_path``, left-joins it
    onto the global ``test`` frame by ``shop_id`` / ``item_id``, predicts with
    ``model`` and stores the non-positive-to-zero predictions in the global
    ``sample`` frame (which is modified in place).

    Parameters
    ----------
    model : fitted estimator with ``predict``
    num_cols : list
        Feature columns fed to the model, in order.
    subm_name : str
        Path of the CSV file to write.
    transformer : optional fitted object with ``transform``
        Applied to the feature matrix before prediction.

    Returns
    -------
    pandas.DataFrame
        The updated ``sample`` frame.
    """
    features = pd.read_parquet(test_path)
    test_matrix = test.merge(features, on = ['shop_id', 'item_id'], how = 'left')[num_cols].to_numpy()
    if transformer:
        test_matrix = transformer.transform(test_matrix)

    sample['item_cnt_month'] = model.predict(test_matrix)
    # Anything that is not strictly positive becomes 0.
    sample['item_cnt_month'] = [value if value > 0 else 0 for value in sample['item_cnt_month']]
    sample.to_csv(subm_name, sep = ',', index = False)
    return sample

In [None]:
class CombinationModel():
    """Route rows to one of three sub-models based on the value in one column.

    Rows whose ``city_index`` column equals 7 go to ``internet_model``, rows
    equal to 27 go to ``cifr_model``, and rows whose value is not listed in
    ``internet_cols`` go to ``model``.

    NOTE(review): a row whose value is in ``internet_cols`` but is neither 7
    nor 27 is handled by no sub-model and keeps a prediction of 0 — this
    presumably assumes ``internet_cols == [7, 27]``; confirm with the caller.

    Parameters
    ----------
    internet_model, cifr_model, model : estimators
        Each must provide ``fit(X, y, categorical_feature=...)`` and ``predict(X)``.
    internet_cols : sequence
        Values of the routing column that are excluded from the general model.
    city_index : int
        Column of ``X`` holding the routing value.
    """

    def __init__(self, internet_model, cifr_model, model, internet_cols, city_index):
        self.i_model = internet_model
        self.c_model = cifr_model
        self.model = model
        self.i_cols = internet_cols
        self.city_index = city_index
        self.internet_cols = internet_cols

    def _routes(self, X):
        """Return ``(sub_model, row_mask)`` pairs in the order they are applied."""
        routing_values = X[:, self.city_index]
        return (
            (self.i_model, routing_values == 7),
            (self.c_model, routing_values == 27),
            (self.model, np.logical_not(np.isin(routing_values, self.internet_cols))),
        )

    def fit(self, X, y, categorical_feature='auto'):
        """Fit every sub-model on its own subset of rows and return ``self``.

        ``categorical_feature`` is forwarded unchanged to each sub-model. It
        defaults to ``'auto'`` (LightGBM's own default) so the estimator can be
        driven by code that calls ``fit(X, y)`` with two arguments only.
        """
        for estimator, mask in self._routes(X):
            estimator.fit(X[mask], y[mask], categorical_feature=categorical_feature)
        return self

    def predict(self, X):
        """Return one prediction per row of ``X``, assembled from the sub-models.

        Sub-models with no matching rows are skipped, so a batch that lacks
        one of the routed groups does not trigger a prediction on empty input.
        """
        prediction = np.zeros(len(X))
        for estimator, mask in self._routes(X):
            if mask.any():
                prediction[mask] = estimator.predict(X[mask])
        return prediction



### Param tuning

In [None]:
def suggest_params(trial):
    """Draw one hyper-parameter set from ``trial``.

    ``trial`` must expose ``suggest_int(name, low, high, step=...)`` and
    ``suggest_float(name, low, high, step=...)``. Parameters are requested in
    the order listed below and returned as a dict keyed by parameter name.
    """
    as_int, as_float = trial.suggest_int, trial.suggest_float
    # (sampler, name, low, high, extra keyword arguments)
    search_space = (
        (as_int, "n_estimators", 50, 300, {"step": 10}),
        (as_float, "learning_rate", 0.1, 0.4, {}),
        (as_int, "num_leaves", 15, 200, {}),
        (as_int, "max_depth", 4, 10, {}),
        (as_int, "min_child_samples", 50, 370, {"step": 20}),
        (as_float, "reg_alpha", 1, 5, {"step": 0.5}),
        (as_float, "reg_lambda", 1, 5, {"step": 0.5}),
        (as_float, "bagging_fraction", 0.2, 0.95, {"step": 0.15}),
        (as_float, "feature_fraction", 0.2, 0.95, {"step": 0.1}),
    )
    return {
        name: sampler(name, low, high, **extra)
        for sampler, name, low, high, extra in search_space
    }

# Train cycle

In [None]:
# (X, y) feed cross-validation and tuning; (X_test, y_test) are built from the
# same month window and are what the final model is fitted on before submission.
X, y = prepare_num_data(
    data, min_block=start_month, max_block=end_month, cols=num_cols, transformer=None
)
X_test, y_test = prepare_num_data(
    data, min_block=start_month, max_block=end_month, cols=num_cols, transformer=None
)
# Rebind the name so the large DataFrame is no longer referenced from here.
data = 0

In [None]:
# Baseline LightGBM regressor with a Huber objective; one setting per line for easy diffing.
model = LGBMRegressor(
    objective='huber',
    n_estimators=180,
    learning_rate=0.17,
    num_leaves=200,
    min_child_samples=100,
    bagging_fraction=0.8,
    feature_fraction=0.7,
    reg_alpha=3.5,
    reg_lambda=4,
)

In [None]:
# Time-series cross-validation with the per-fold error plot enabled; the returned
# list of fold errors is the cell's displayed output.
# NOTE(review): `comb_model` is not assigned anywhere in the visible notebook —
# presumably a CombinationModel built in a removed cell; confirm before Restart & Run All.
cross_val_score(X, y, comb_model, None, months, train_size, plot=True)

In [None]:
# Positional arguments reused below and by validate(): (X, y, cat_cols, months, loss, train_size).
params = (X, y, cat_cols, months, 'regression_l1', 4)
# NOTE(review): `tune_params` is not defined in the visible notebook; the leading 15 is
# presumably the number of tuning trials — confirm against its definition.
best_params = tune_params(15, *params)
# Apply the tuned hyper-parameters to the existing `model` object.
model.set_params(**best_params)

In [None]:
# Run validate() under the experiment name 'lgbm', reusing the argument tuple built for tuning.
validate('lgbm', *params)

In [None]:
# Final fit on the (X_test, y_test) arrays, treating the cat_cols positions as categorical,
# then write the submission file; the returned frame is the cell's displayed output.
print(len(X_test))
model.fit(X_test, y_test, categorical_feature=cat_cols)
make_sabmission(model, num_cols, 'comb_lgbm.csv', transformer=None)

## Functions

In [None]:
def category_errors(cats, cat_col, X, y, model, months, train_size):
    """Collect per-category RMSE for every cross-validation fold.

    Parameters
    ----------
    cats : sequence of iterables
        ``cats[i]`` lists the category values to report for ``cat_col[i]``.
    cat_col : sequence of int
        Column positions in ``X`` to break the error down by.
    X, y : numpy arrays
        Features and targets; column 0 of ``X`` is used as the date column.
    model : estimator with ``fit(X, y, categorical_feature=...)`` and ``predict``
    months : sequence
        Ordered date values handed to ``TimeSeriesSplitter``.
    train_size : int
        Number of dates per training window.

    Returns
    -------
    list of dict
        One dict per entry of ``cat_col`` (the original always returned three),
        mapping category value -> list with one RMSE per fold. A fold in which
        the category has no test rows contributes ``nan`` so that all lists
        stay aligned with the fold order.
    """
    # One result dict per analysed column instead of a hard-coded three.
    cats_errors = [{cat: [] for cat in cats[i]} for i in range(len(cat_col))]
    cv = TimeSeriesSplitter(X, months, 0, min_train_size=train_size).split()
    for train_idx, test_idx in cv:
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        # Progress marker: date value of the fold being evaluated.
        print(X_test[0,0])
        # NOTE(review): the model is told about the notebook-global `cat_cols`,
        # not the `cat_col` argument — confirm this is intended.
        model.fit(X_train, y_train, categorical_feature=cat_cols)
        # Negative predictions are floored at zero before scoring.
        pr = np.maximum(np.asarray(model.predict(X_test)), 0)
        for i, col in enumerate(cat_col):
            for cat in cats[i]:
                mask = X_test[:, col] == cat
                if mask.any():
                    fold_error = root_mean_squared_error(y_test[mask], pr[mask])
                else:
                    # The metric cannot be computed on zero samples.
                    fold_error = np.nan
                cats_errors[i][cat].append(fold_error)
    return cats_errors

In [None]:
class CategoryDecoder():
    """Translate encoded category / sub-category / city indices back to their labels.

    Each CSV is read with its ``index`` column as the key and its ``0`` column
    as the label, giving three ``{index: label}`` dictionaries.
    """

    def __init__(self,cats_path, sub_cats_path, cities_path):
        self.cats = self._load_mapping(cats_path)
        self.sub_cats = self._load_mapping(sub_cats_path)
        self.cities = self._load_mapping(cities_path)

    @staticmethod
    def _load_mapping(path):
        """Read one lookup CSV and return it as an ``{index: label}`` dict."""
        return pd.read_csv(path, index_col='index')['0'].to_dict()

    def _decode(self, col, index):
        """Return the label stored under ``index`` in the lookup selected by ``col``.

        ``col`` must be ``'city'``, ``'cat'`` or ``'sub_cat'``; anything else
        raises ``ValueError`` (the previous code referenced an undefined
        exception name and so failed with ``NameError``). An ``index`` missing
        from the lookup raises ``KeyError``.
        """
        lookups = {'city': self.cities, 'cat': self.cats, 'sub_cat': self.sub_cats}
        if col not in lookups:
            raise ValueError(
                f"unknown column {col!r}; expected one of {sorted(lookups)}"
            )
        return lookups[col][index]

    def decode(self, d, col):
        """Decode every key of dict ``d`` (cast to ``int``) and return the labels as a list."""
        return [self._decode(col, int(x)) for x in d.keys()]