In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import catboost
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics
from sklearn.model_selection import KFold
import pickle

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

In [2]:
# Load the 'Monthly' and 'Quarterly' sheets of the training workbook, in that
# order. `.drop(0)` removes the row labelled 0 from each sheet (presumably a
# non-data row such as units or descriptions — TODO confirm against the file).
train_month_df, train_quart_df = (
    pd.read_excel('data/train.xlsx', sheet_name=sheet).drop(0)
    for sheet in ('Monthly', 'Quarterly')
)

In [3]:
def add_calendar_columns(frame):
    """Add 'month', 'year' and 'month_num' columns to ``frame`` in place.

    The values come from the string labels in the 'Unnamed: 0' column: the
    first four characters are parsed as the year and the last two as the
    month. (The exact label format is not visible here — presumably something
    like 'YYYY-MM'; confirm against the data.) The label column is dropped
    once the numeric columns have been derived.
    """
    labels = frame['Unnamed: 0'].tolist()
    frame['month'] = [int(label[-2:]) for label in labels]
    frame['year'] = [int(label[:4]) for label in labels]

    # Re-base the years so that the earliest year in *this* frame becomes 0.
    # Each frame is shifted by its own minimum, independently of the other.
    frame['year'] -= frame['year'].min()

    # Running month counter: month of the year plus 12 per elapsed year.
    frame['month_num'] = frame['month'] + frame['year'] * 12

    # The raw label column is not needed after the numeric columns exist.
    frame.drop('Unnamed: 0', axis=1, inplace=True)


add_calendar_columns(train_month_df)
add_calendar_columns(train_quart_df)

In [4]:
from utils import classes

In [5]:
# Standardise every monthly column listed in `classes` (iterated as groups of
# column names): subtract the column mean and divide by the column standard
# deviation (np.std's default, ddof=0).
# NOTE(review): the mean and std are not stored anywhere in this cell, so the
# scaling cannot be undone later from here — confirm that is intended.
for group in classes:
    for col_name in group:
        series = train_month_df[col_name]
        centred = series.values - np.mean(series)
        train_month_df[col_name] = centred / np.std(series)

In [6]:
from utils import make_features_month_reg

def get_features(values, min_history=37):
    """Build one-step-ahead training samples from a single series.

    For every position ``i`` from ``min_history`` up to the end of ``values``
    one sample is produced: the feature row is
    ``make_features_month_reg(values[:i])`` (everything strictly before
    position ``i``) and the target is the first difference
    ``values[i] - values[i - 1]``.

    Parameters
    ----------
    values : sequence of numbers
        The series, in chronological order. It is converted to a NumPy array.
    min_history : int, default 37
        Number of leading observations that must precede the first sample.
        The default keeps the previously hard-coded value (presumably the
        shortest history ``make_features_month_reg`` can work with —
        confirm in ``utils``).

    Returns
    -------
    tuple of (list, list)
        ``(features, target)``, two parallel lists with one entry per sample.
        Both are empty when ``len(values) <= min_history``.

    Raises
    ------
    ValueError
        If ``min_history`` is smaller than 1; with ``i == 0`` the expression
        ``values[i - 1]`` would silently wrap around to the last element.
    """
    if min_history < 1:
        raise ValueError(f'min_history must be >= 1, got {min_history}')

    values = np.array(values)

    features, target = [], []

    for i in range(min_history, len(values)):
        # Features only see the past; the target is the change at step i.
        features.append(make_features_month_reg(values[:i]))
        target.append(values[i] - values[i - 1])

    return features, target

In [7]:
# Train one ensemble of CatBoost regressors per group of columns in `classes`
# and pickle the result. Every column of a group contributes its own samples
# (built by get_features); the samples of a group are pooled into one training
# set, and each ensemble member is fitted on that same set with a different
# random seed.
ENSEMBLE_SEEDS = range(5)   # one model per seed
LEARNING_RATE = 0.03

models = []  # models[k] is the list of fitted models for classes[k]

for class_idx, class_cols in enumerate(classes):

    all_features, all_target = [], []

    for col in class_cols:
        # NOTE(review): dropna() removes every NaN, including gaps in the
        # middle of a series, so neighbouring values after the drop are not
        # guaranteed to be consecutive months — confirm the data has only
        # leading/trailing NaNs.
        series = train_month_df[col].dropna().tolist()

        features, target = get_features(series)
        if not features:
            # Series too short to yield a single sample. An empty list turns
            # into a shape-(0,) array, which np.concatenate refuses to join
            # with the blocks of the other columns (assuming each feature row
            # is a 1-D vector), so skip the column instead of crashing.
            continue
        all_features.append(features)
        all_target.append(target)

    if not all_features:
        # Skipping the group would break the index alignment between
        # `models` and `classes`, so fail with an explicit message.
        raise ValueError(
            f'classes[{class_idx}] has no column long enough to build training samples'
        )

    all_features = np.concatenate(all_features, axis=0)
    all_target = np.concatenate(all_target, axis=0)

    cur_models = []
    for seed in ENSEMBLE_SEEDS:
        model = catboost.CatBoostRegressor(
            random_seed=seed,
            learning_rate=LEARNING_RATE
        )
        # verbose=100 prints the training loss every 100 iterations.
        model.fit(all_features, all_target, verbose=100)

        cur_models.append(model)

    models.append(cur_models)

# Persist all ensembles in one file, as a list parallel to `classes`.
with open('models_month_reg.pkl', 'wb') as f:
    pickle.dump(models, f)

0:	learn: 0.5164131	total: 63.5ms	remaining: 1m 3s
100:	learn: 0.3136487	total: 344ms	remaining: 3.06s
200:	learn: 0.2762444	total: 688ms	remaining: 2.74s
300:	learn: 0.2491797	total: 1s	remaining: 2.33s
400:	learn: 0.2272320	total: 1.27s	remaining: 1.9s
500:	learn: 0.2075945	total: 1.61s	remaining: 1.6s
600:	learn: 0.1913329	total: 1.89s	remaining: 1.25s
700:	learn: 0.1742203	total: 2.17s	remaining: 927ms
800:	learn: 0.1584464	total: 2.46s	remaining: 612ms
900:	learn: 0.1447856	total: 2.8s	remaining: 307ms
999:	learn: 0.1329301	total: 3.17s	remaining: 0us
0:	learn: 0.5162454	total: 6.56ms	remaining: 6.55s
100:	learn: 0.3137189	total: 409ms	remaining: 3.64s
200:	learn: 0.2770139	total: 819ms	remaining: 3.25s
300:	learn: 0.2495301	total: 1.26s	remaining: 2.93s
400:	learn: 0.2264388	total: 1.68s	remaining: 2.51s
500:	learn: 0.2079383	total: 2.12s	remaining: 2.11s
600:	learn: 0.1890310	total: 2.54s	remaining: 1.68s
700:	learn: 0.1718090	total: 2.92s	remaining: 1.24s
800:	learn: 0.1572078	