In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm_notebook as tqdm
from AutoRegression import AutoRegression
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA, ARMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from ARIMA import myARIMA

from sklearn.preprocessing import PolynomialFeatures

In [2]:
def smape(satellite_predicted_values, satellite_true_values): 
    # the division, addition and subtraction are pointwise 
    return np.mean(np.abs((satellite_predicted_values - satellite_true_values) 
        / (np.abs(satellite_predicted_values) + np.abs(satellite_true_values))))


def delete_duplicates(df, eps=10):
    """
        Returns df without "duplicates" - objects within each sat_id,
        which were recorded at almost same time
    """
    
    for sat_id in df['sat_id'].unique():
        d_t = df[df['sat_id'] == sat_id].epoch.apply(lambda x: time.mktime(datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f').timetuple())).values
        df = df.drop(index=df[df['sat_id'] == sat_id].index[0] + np.where((np.roll(d_t, -1)[:-1] - d_t[:-1]) < eps)[0])
    return df

In [16]:
from metrics import smape_idao, idao_score, smape_new_vector_norm

def print_metrics(model_name, pred):
    print(f"\n{model_name}\n")
    print(f"IDAO score:  {idao_score(pred[target_columns], df_test_ans[target_columns])}")    
    print(f"SMAPE IDAO:  {smape_idao(pred[target_columns], df_test_ans[target_columns])}")    
    print(f"SMAPE new:   {smape_new_vector_norm(pred, df_test_ans)}")    


# IDAO DATA

In [17]:
df_train = pd.read_csv("data/train.csv", index_col="id")
df_test = pd.read_csv("data/test.csv", index_col="id")
df_test_ans = pd.read_csv("data/ans.csv", index_col="id")
df_test_ans['sat_id'] = df_test['sat_id']

n_sat = len(pd.unique(df_train["sat_id"]))

df_train = delete_duplicates(df_train)
full_test_wout_dup = delete_duplicates(df_test.copy())

In [23]:
width_list = [48,  24]  ## widths of window, which will be used for prediction
coefs_for_ensemble = [0.8, 0.2] ## coefs for ensemblin models


# predict for test dataset without duplicates:
target_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
for col in target_columns:
    full_test_wout_dup[col] = 0.0

for sat_id in tqdm(full_test_wout_dup.sat_id.unique()):
    for width, coef in zip(width_list, coefs_for_ensemble):
        df = df_train[df_train.sat_id == sat_id]
        test_satid_mask = full_test_wout_dup.sat_id == sat_id
        for col in target_columns:
            model = AutoRegression(width=width)
            model.fit(df[col].values)
            full_test_wout_dup.loc[full_test_wout_dup[test_satid_mask].index, col] +=\
                coef * model.predict(len(full_test_wout_dup[test_satid_mask]))


## after this we need to predict for deleted objects (duplicates) from test,
 ## so we decided to fill predicts with values of nearest objects in time domain
final_df = pd.concat([df_test, full_test_wout_dup[target_columns]], axis=1)
final_df.fillna(method='ffill', inplace=True)

# for column in target_columns:
#     final_df[column] += full_test[column + "_sim"].values
## saving predictions
#final_df[target_columns].to_csv('testing1.csv', index_label='id')
## check to prevent submit errors
print(final_df[target_columns].isna().sum())

HBox(children=(IntProgress(value=0, max=600), HTML(value='')))


x     0
y     0
z     0
Vx    0
Vy    0
Vz    0
dtype: int64


In [24]:
print_metrics('Ensembled AR', final_df)


Ensembled AR

IDAO score:  97.3468792352761
SMAPE IDAO:  0.02653120764723891
SMAPE new:   0.05270116348768325


In [19]:
print_metrics('Ensembled AR', final_df)


Ensembled AR

IDAO score:  97.21670572063574
SMAPE IDAO:  0.02783294279364254
SMAPE new:   0.056036577373006685


# LPC_RP DATA

In [26]:
df_train = pd.read_csv("data/train_LPC_RP.csv", index_col="id")
df_test = pd.read_csv("data/test_LPC_RP.csv", index_col="id")
df_test_ans = pd.read_csv("data/ans_LPC_RP.csv", index_col="id")
df_test_ans['sat_id'] = df_test['sat_id']

n_sat = len(pd.unique(df_train["sat_id"]))

df_train = delete_duplicates(df_train)
full_test_wout_dup = delete_duplicates(df_test.copy())

In [27]:
width_list = [48,  24]  ## widths of window, which will be used for prediction
coefs_for_ensemble = [0.8, 0.2] ## coefs for ensemblin models


# predict for test dataset without duplicates:
target_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
for col in target_columns:
    full_test_wout_dup[col] = 0.0

for sat_id in tqdm(full_test_wout_dup.sat_id.unique()):
    for width, coef in zip(width_list, coefs_for_ensemble):
        df = df_train[df_train.sat_id == sat_id]
        test_satid_mask = full_test_wout_dup.sat_id == sat_id
        for col in target_columns:
            model = AutoRegression(width=width)
            model.fit(df[col].values)
            full_test_wout_dup.loc[full_test_wout_dup[test_satid_mask].index, col] +=\
                coef * model.predict(len(full_test_wout_dup[test_satid_mask]))


## after this we need to predict for deleted objects (duplicates) from test,
 ## so we decided to fill predicts with values of nearest objects in time domain
final_df = pd.concat([df_test, full_test_wout_dup[target_columns]], axis=1)
final_df.fillna(method='ffill', inplace=True)

# for column in target_columns:
#     final_df[column] += full_test[column + "_sim"].values
## saving predictions
#final_df[target_columns].to_csv('testing1.csv', index_label='id')
## check to prevent submit errors
print(final_df[target_columns].isna().sum())

HBox(children=(IntProgress(value=0, max=225), HTML(value='')))


x     0
y     0
z     0
Vx    0
Vy    0
Vz    0
dtype: int64


In [28]:
print_metrics('Ensembled AR', final_df)


Ensembled AR

IDAO score:  84.33080112870671
SMAPE IDAO:  0.15669198871293286
SMAPE new:   0.11743396965482611
