# Time Structured Data Analysis. Project #2
### **Authors:** Petrov Stanislav, Telesheva Elina
21.05.2023

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.metrics import make_scorer, mean_absolute_error as MAE

from Model import Machinery, pnl_score
from ChangePointDetection import ChangePointDetector

In [2]:
# --- Configuration -----------------------------------------------------------
DATE_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
TRAIN_END = "2020-12-31"
TEST_START, TEST_END = "2021-01-01", "2021-03-31"

# --- Rates -------------------------------------------------------------------
# The first CSV column is consumed as the index; dict() then needs every
# remaining row to hold exactly two values (key, value).
rates = dict(pd.read_csv("./data/input_rates.csv", index_col=0).values)

# --- Cash-flow data ----------------------------------------------------------
# `date_parser=` is deprecated in pandas >= 2.0, so the column is loaded as-is
# and converted explicitly with the same format string.
df = pd.read_excel("data/Project 2_2023.xlsx", sheet_name="Data")
df["Date"] = pd.to_datetime(df["Date"], format=DATE_FORMAT)
df = df.set_index("Date")  # the index keeps the name "Date"

income, outcome = df["Income"], df["Outcome"]

# The target at date t is the net flow (income - outcome) of the *next* row;
# the final row has no successor, so it is dropped.
target = (income - outcome).shift(-1).iloc[:-1]

# --- Train / test split ------------------------------------------------------
train_dates = df.loc[:TRAIN_END].index
# The last date of the test window is excluded
# (presumably because `target` has no value for the final row - TODO confirm).
test_dates = df.loc[TEST_START:TEST_END].index[:-1]

# --- Model -------------------------------------------------------------------
pnl_scorer = make_scorer(pnl_score, greater_is_better=True, rates=rates)

change_point_detector = ChangePointDetector()

machine = Machinery(score=pnl_score, scorer=pnl_scorer, k_features=10)
machine.finetune(
    income.loc[train_dates],
    outcome.loc[train_dates],
    target.loc[train_dates],
)

In [3]:
# `tqdm` was used here without ever being imported. Import it locally and fall
# back to a pass-through so the loop still runs when tqdm is not installed.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        return iterable

# Experiment switch: when True, a change point detected in income or outcome
# fewer than `machine.finetune_every` days before `date` forces a full finetune
# on the history up to (but excluding) `date` before predicting.
# Off by default; remember to change OUTPUT_PATH when toggling it.
REFIT_ON_CHANGEPOINT = False
OUTPUT_PATH = "experiment_k10_no_finetune.csv"

all_preds = []
all_targets = []
for date in tqdm(test_dates):
    if REFIT_ON_CHANGEPOINT:
        recent_change = False
        for series in (income, outcome):
            last_chp_date = change_point_detector.detect_changepoint(series.loc[:date])
            if last_chp_date and (date - last_chp_date).days < machine.finetune_every:
                recent_change = True
                break
        if recent_change:
            machine.finetune_count = machine.finetune_every
            machine.finetune(
                income.loc[:date].iloc[:-1],
                outcome.loc[:date].iloc[:-1],
                target.loc[:date].iloc[:-1],
            )

    # Label slices on the date index include `date` itself.
    income_hist = income.loc[:date]
    outcome_hist = outcome.loc[:date]

    prediction = machine.predict(income_hist, outcome_hist)
    all_preds.append(prediction)
    all_targets.append(target.loc[date])

    # After the prediction is recorded, the model is calibrated on history that
    # includes the target for `date`. A full `machine.finetune(...)` on the same
    # arguments was the alternative tried here in an earlier experiment.
    machine.calibrate_model(income_hist, outcome_hist, target.loc[:date])

# Keep the first element of every prediction; this also works when
# `machine.predict` returns a plain scalar instead of a length-1 array.
output = pd.DataFrame(
    {
        "prediction": [np.ravel(pred)[0] for pred in all_preds],
        "fact": all_targets,
    },
    index=test_dates,
)
output.to_csv(OUTPUT_PATH)

In [4]:
# Mean absolute error between forecasts and realised values over the test
# window (the metric is symmetric, so argument order does not affect it).
MAE(output["prediction"], output["fact"])

0.27645912119320915


In [5]:
# PnL-based score of the forecasts over the test window.
# NOTE(review): `pnl_scorer` is built with `rates=rates`, but this direct call
# passes no rates - confirm that `pnl_score` has a suitable default.
# NOTE(review): if `pnl_score` follows sklearn's (y_true, y_pred) order, the
# arguments here are swapped - verify against Model.pnl_score.
pnl_score(output["prediction"], output["fact"])

-0.0009424470372840411