# Автор: Серегин М.С.


# Общее описание

**Основная идея — для каждого ряда использовать тот алгоритм, который показал наименьшую ошибку на валидации**


# Загрузка данных

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML
# Inject a CSS rule that forces the `.container` element to 45% width.
display(HTML("<style>.container { width:45% !important; }</style>"))
import random
import numpy as np
import tqdm
from sklearn.linear_model import Lasso
from statsmodels.tsa.ar_model import AutoReg

In [None]:
# Read the two input files from the working directory: the training table
# (parquet) and the submission table (CSV) that is filled in further below.
tr = pd.read_parquet("train.parquet")
sub = pd.read_csv("submission.csv")

In [None]:
# Split every index label of `tr` on "_" into a list of its parts.
index_split = tr.index.str.split('_')

In [None]:
# Unpack the split index labels into columns: the first part is the row
# number, the second the row type, the last one the (integer) timestamp.
tr["row_num"] = index_split.str.get(0)
tr["row_type"] = index_split.str.get(1)
tr["timestamp"] = index_split.str.get(-1).astype(int)
# A "unique" series is identified by the (row_num, row_type) pair.
tr["row_id"] = tr["row_num"] + "_" + tr["row_type"]
tr

In [None]:
# Reshape to wide form: one row per row_id, one column per timestamp,
# cells taken from the 'Values' column.
X = tr.pivot(index='row_id', columns='timestamp', values='Values')

In [None]:
# Turn the timestamp column labels into strings prefixed with "tr_".
X.columns = [f"tr_{col}" for col in X.columns]

In [None]:
# https://www.kaggle.com/competitions/aim-2023/code
def calculate_weights(data):
    """
    Compute one weight per series (row) of ``data``.

    The weight of a row is the reciprocal of the mean absolute difference
    between its consecutive values.

    Parameters
    ----------
    data : 2-D table exposing ``.values`` and ``.shape``
        (the notebook passes a pandas DataFrame); one series per row.

    Returns
    -------
    numpy.ndarray
        1-D array holding one weight per row, in row order.
    """
    table = data.values
    return np.array(
        [1 / np.mean(np.abs(np.diff(table[row, :]))) for row in range(data.shape[0])]
    )


def calculate_mase(y_true, y_pred, weight):
    """
    Compute the weighted error of one forecast (the notebook's MASE metric).

    The result is ``sum(weight * |y_true - y_pred|) / len(y_true)``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Forecast values, aligned element-wise with ``y_true``.
    weight : float
        Scaling weight of the series.

    Returns
    -------
    float
        The weighted mean absolute error.
    """
    abs_errors = np.abs(y_true - y_pred)
    weighted_total = np.sum(weight * abs_errors)
    return weighted_total / len(y_true)

In [None]:
# One weight per series, computed over all columns of X.
weights = calculate_weights(X)

In [None]:
# Build the validation and training sets: the last 100 columns are held out
# for validation, all columns before them form the training part.
X_test = X.loc[:, X.columns[-100:]]
X_train = X.loc[:, X.columns[:-100]]

# Первая модель - AutoRegression

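1. Обучаем модель на обучающей части каждого ряда (без последних 100 значений) и вычисляем ошибку на валидационной выборке для каждого ряда.
2. Затем обучаем модель на полных данных и строим предсказания для всех рядов.
3. Сезонный период модели установлен равным одним суткам (288 наблюдений).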



In [None]:
# Containers filled by the AutoReg validation loop, keyed by the positional
# index of the series: validation error and validation forecast.
dct, dct_preds = {}, {}

In [None]:
# Validation pass: fit AutoReg on the training part of every series and score
# its 100-step forecast against the held-out columns.
index_all = X_train.index
for i in tqdm.tqdm(range(len(X))):
    # Only the most recent 15000 training observations are used for fitting.
    vals = X_train.iloc[i].values[-15000:]
    res = AutoReg(vals, lags=300, seasonal=True, period=288).fit()
    # Out-of-sample forecast for the 100 steps right after the fitted window.
    preds = res.model.predict(res.params, start=len(vals), end=len(vals) + 99)
    true = X_test.iloc[i].values

    dct_preds[i] = preds
    dct[i] = calculate_mase(true, preds, weight=weights[i])

In [None]:
# Final pass: refit AutoReg on the full history of every series and forecast
# the 100 steps that follow it.
index_all = X.index
autoreg_preds = {}
for i in tqdm.tqdm(range(len(X))):
    # Only the most recent 15000 observations are used for fitting.
    vals = X.iloc[i].values[-15000:]
    res = AutoReg(vals, lags=300, seasonal=True, period=288).fit()
    preds_ = res.model.predict(res.params, start=len(vals), end=len(vals) + 99)

    autoreg_preds[i] = preds_

In [None]:
# er_df holds the per-series validation errors of the different models;
# it starts with the AutoReg errors in the "error" column.
er_df = pd.DataFrame(
    {
        "index": list(dct.keys()),
        "error": list(dct.values()),
    }
)

er_df.sort_values("error")

# Вторая модель — линейная регрессия (Lasso)

Признаки учитывают как значения, непосредственно предшествующие моменту предсказания, так и значения ряда в окрестности той же позиции 2016, 4032 и т. д. отсчётов назад (то есть ровно одну, две и т. д. недели назад).


In [None]:
BEFORE_ACTION = 20


def train_dataset(vals, n_epochs=4, n_surrounders=20, before_action=BEFORE_ACTION,
                  n_estimations=100, period=2016):
    """
    Build the lagged-feature matrices and targets for one series.

    For a position ``p`` the seasonal features are, for every
    ``epoch = 1..n_epochs``, the window
    ``vals[p - epoch*period - n_surrounders : p - epoch*period + n_surrounders]``.
    Training rows additionally end with the ``before_action`` values that
    immediately precede ``p``; forecast rows do not, because those values are
    not known in advance and must be appended by the caller.

    Parameters
    ----------
    vals : sequence of numbers
        The observed series (list or 1-D array).
    n_epochs : int
        How many periods back to look.
    n_surrounders : int
        Half-width of the window taken around each seasonal lag.
    before_action : int
        Number of immediately preceding values appended to training rows.
    n_estimations : int
        Number of future positions to build forecast rows for. It used to be
        ignored in favour of a hard-coded 100; the default keeps that value.
    period : int
        Seasonal lag, in observations.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_features, test_features, targets)``: one training row per
        position from ``n_epochs*period + n_surrounders`` to ``len(vals) - 1``,
        one forecast row per position from ``len(vals)`` to
        ``len(vals) + n_estimations - 1``, and the training targets
        ``vals[n_epochs*period + n_surrounders:]``.
    """

    def get_features(start, stop, with_recent):
        # One feature row per position in [start, stop).
        rows = []
        for pos in range(start, stop):
            row = []
            for epoch in range(1, n_epochs + 1):
                centre = pos - epoch * period
                row.extend(vals[centre - n_surrounders:centre + n_surrounders])
            if with_recent:
                row.extend(vals[pos - before_action:pos])
            rows.append(row)
        return np.array(rows)

    # First position for which every seasonal window starts at index >= 0.
    train_init = n_epochs * period + n_surrounders

    train_features = get_features(train_init, len(vals), True)
    test_features = get_features(len(vals), len(vals) + n_estimations, False)
    targets = np.array(vals[train_init:])

    return train_features, test_features, targets
    
    
    

In [None]:
def validate(N_EPOCHS, N_SURROUNDERS, BEFORE_ACTION):
    """
    Validate the Lasso model on the held-out columns of every series.

    For each row of ``X_train`` a Lasso regression is fitted on the features
    from ``train_dataset`` and 100 steps are forecast recursively: each
    prediction is appended to the history, so later steps take their
    "immediately preceding" features from earlier predictions. The forecast is
    scored against the matching row of ``X_test`` with ``calculate_mase``.

    The mean error over all series is printed together with the three
    hyper-parameters.

    Parameters
    ----------
    N_EPOCHS : int
        Passed to ``train_dataset`` as ``n_epochs``.
    N_SURROUNDERS : int
        Passed to ``train_dataset`` as ``n_surrounders``.
    BEFORE_ACTION : int
        Passed to ``train_dataset`` as ``before_action``; also the number of
        trailing history values appended to every forecast row.

    Returns
    -------
    tuple of dict
        ``(lin_preds, lin_error)``, both keyed by the positional index of the
        series: the forecast array and its validation error.
    """
    lin_error = {}
    lin_preds = {}
    n_est = 100

    for j in tqdm.tqdm(range(len(X_train))):
        vals = X_train.iloc[j].values
        train, test, target = train_dataset(
            vals,
            n_epochs=N_EPOCHS,
            n_surrounders=N_SURROUNDERS,
            before_action=BEFORE_ACTION,
            n_estimations=n_est,
        )

        reg = Lasso(max_iter=40000)
        reg.fit(train, target)

        preds = []
        just_before = list(vals)
        for i in range(n_est):
            # Slice from an explicit start index: `[-BEFORE_ACTION:]` would
            # return the whole history when BEFORE_ACTION is 0.
            additional = just_before[len(just_before) - BEFORE_ACTION:]
            single_test = np.concatenate((test[i], np.array(additional)))
            # Take element 0 explicitly; float() on a 1-element array is
            # deprecated in recent NumPy releases.
            pr = float(reg.predict(single_test.reshape(1, -1))[0])

            preds.append(pr)
            just_before.append(pr)

        true = X_test.iloc[j].values

        lin_preds[j] = np.array(preds)
        lin_error[j] = calculate_mase(true, lin_preds[j], weight=weights[j])

    er_df = pd.DataFrame(
        {"index": list(lin_error.keys()), "error": list(lin_error.values())}
    )
    print(er_df.error.mean(), N_EPOCHS, N_SURROUNDERS, BEFORE_ACTION)

    return lin_preds, lin_error

        

In [None]:
# Validate the Lasso model with the chosen hyper-parameters.
my_reg, my_reg_error = validate(N_EPOCHS=4, N_SURROUNDERS=15, BEFORE_ACTION=20)

In [None]:
def predict(N_EPOCHS, N_SURROUNDERS, BEFORE_ACTION):
    """
    Forecast 100 steps ahead for every series with the Lasso model.

    For each row of ``X`` a Lasso regression is fitted on the features from
    ``train_dataset`` and 100 steps are forecast recursively: each prediction
    is appended to the history, so later steps take their "immediately
    preceding" features from earlier predictions.

    Parameters
    ----------
    N_EPOCHS : int
        Passed to ``train_dataset`` as ``n_epochs``.
    N_SURROUNDERS : int
        Passed to ``train_dataset`` as ``n_surrounders``.
    BEFORE_ACTION : int
        Passed to ``train_dataset`` as ``before_action``; also the number of
        trailing history values appended to every forecast row.

    Returns
    -------
    dict
        Forecast array for every series, keyed by its positional index.
    """
    lin_preds = {}
    n_est = 100

    for j in tqdm.tqdm(range(len(X))):
        vals = X.iloc[j].values
        train, test, target = train_dataset(
            vals,
            n_epochs=N_EPOCHS,
            n_surrounders=N_SURROUNDERS,
            before_action=BEFORE_ACTION,
            n_estimations=n_est,
        )

        reg = Lasso(max_iter=40000)
        reg.fit(train, target)

        preds = []
        just_before = list(vals)
        for i in range(n_est):
            # Slice from an explicit start index: `[-BEFORE_ACTION:]` would
            # return the whole history when BEFORE_ACTION is 0.
            additional = just_before[len(just_before) - BEFORE_ACTION:]
            single_test = np.concatenate((test[i], np.array(additional)))
            # Take element 0 explicitly; float() on a 1-element array is
            # deprecated in recent NumPy releases.
            pr = float(reg.predict(single_test.reshape(1, -1))[0])

            preds.append(pr)
            just_before.append(pr)

        lin_preds[j] = np.array(preds)

    return lin_preds

        

In [None]:
# Final Lasso forecasts, using the same hyper-parameters as in validation.
my_reg_predict = predict(N_EPOCHS=4, N_SURROUNDERS=15, BEFORE_ACTION=20)

# Наивный алгоритм: медиана значений в той же позиции на предыдущих неделях

In [None]:
def simple(X, horizon=100, period=2016):
    """
    Seasonal-naive forecast: median of the values one, two, ... periods back.

    For forecast step ``s`` (0-based) the prediction of a series is the median
    of its values in the columns at positions ``s - period * k`` counted from
    the end, for ``k = 1 .. X.shape[1] // period``.

    Parameters
    ----------
    X : pandas.DataFrame
        One series per row, observations in chronological column order.
    horizon : int
        Number of steps to forecast.
    period : int
        Seasonal period, in observations.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), horizon)`` with one forecast row per series.

    Raises
    ------
    ValueError
        If ``X`` has fewer than ``period`` columns (no full period to look
        back at) or if ``horizon`` exceeds ``period`` (the column offset
        would stop being negative and address the wrong columns).
    """
    n_periods = X.shape[1] // period
    if n_periods == 0:
        raise ValueError("X must contain at least `period` columns")
    if horizon > period:
        raise ValueError("`horizon` must not exceed `period`")

    forecasts = []
    for step in range(horizon):
        # The same phase of the cycle, one column per available past period.
        same_phase = [
            X.iloc[:, step - period * (back + 1)].values for back in range(n_periods)
        ]
        forecasts.append(np.median(np.array(same_phase), axis=0))

    return np.stack(forecasts).T




In [None]:
# Validation: naive forecast computed from the training columns only.
res_val = simple(X_train)

In [None]:
# Validation error of the naive forecast for every series.
naive = [
    calculate_mase(X_test.values[i], res_val[i], weights[i])
    for i in range(len(X))
]

In [None]:
# Naive forecast computed from the full history.
res=simple(X)

In [None]:
# Fill the submission with the naive forecast for every series; the cells
# below overwrite the series for which another model validated better.
# NOTE(review): rows are matched by Id prefix, so a row_id that is itself a
# prefix of another row_id would also match that one's rows — confirm that
# the Id format rules this out.
for i in range(len(X)):
    inds = sub.loc[sub["Id"].str.startswith(index_all[i])].index
    sub.loc[inds, "Expected"] = res[i]

In [None]:
# Add the validation errors of the naive algorithm and of the linear regression.
my_reg_error_lst = list(my_reg_error.values())
er_df = er_df.assign(naive=naive, my_reg=my_reg_error_lst)

In [None]:
# For every series determine the best algorithm: the one whose validation
# error is strictly smaller than that of both other models.

my_better = er_df.loc[
    (er_df["my_reg"] < er_df["naive"]) & (er_df["my_reg"] < er_df["error"])
].index

naive_better = er_df.loc[
    (er_df["naive"] < er_df["my_reg"]) & (er_df["naive"] < er_df["error"])
].index

error_better = er_df.loc[
    (er_df["error"] < er_df["my_reg"]) & (er_df["error"] < er_df["naive"])
].index

In [None]:
# Series where the linear regression validated best: write its forecast
# into the matching submission rows.
for i in my_better:
    inds = sub.loc[sub["Id"].str.startswith(index_all[i])].index
    sub.loc[inds, "Expected"] = my_reg_predict[i]

In [None]:
# Series where AutoReg validated best: write its forecast into the matching
# submission rows.
for i in error_better:
    inds = sub.loc[sub["Id"].str.startswith(index_all[i])].index
    sub.loc[inds, "Expected"] = autoreg_preds[i]

In [None]:
# Write the final submission without the index column.
sub.to_csv("submissionKaggle.csv", index=False)