In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm_notebook as tqdm
from AutoRegression import AutoRegression
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA, ARMA
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from ARIMA import myARIMA

from sklearn.preprocessing import PolynomialFeatures

In [2]:
def smape(satellite_predicted_values, satellite_true_values):
    """Mean of |pred - true| / (|pred| + |true|).

    All arithmetic is element-wise, so both arguments must be
    broadcast-compatible (NumPy arrays, pandas Series/DataFrames, ...).
    The denominator is the plain sum of absolute values (no division by 2),
    and the expression is symmetric in its two arguments.

    The final reduction is delegated to ``np.mean``; its return type
    therefore depends on the input type.

    Note: a position where both values are 0 evaluates 0 / 0.
    """
    abs_error = np.abs(satellite_predicted_values - satellite_true_values)
    magnitude_sum = np.abs(satellite_predicted_values) + np.abs(satellite_true_values)
    return np.mean(abs_error / magnitude_sum)


def delete_duplicates(df, eps=10):
    """Return a copy of ``df`` without near-duplicate observations.

    Within each ``sat_id`` (rows taken in their existing order) a row is
    treated as a duplicate when the *next* row of the same satellite follows
    it by fewer than ``eps`` seconds; the earlier row of such a pair is the
    one removed, so the last row of every satellite is always kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``sat_id`` column and an ``epoch`` column holding
        strings in the ``'%Y-%m-%dT%H:%M:%S.%f'`` format.
    eps : float, default 10
        Threshold in seconds; gaps strictly smaller than this mark a
        duplicate.

    Returns
    -------
    pandas.DataFrame
        A new frame with the duplicate rows removed; ``df`` itself is not
        modified and the surviving rows keep their index labels.
    """
    # Parse once for the whole frame. Timestamps keep their sub-second part
    # and do not depend on the machine's local timezone / DST rules.
    epochs = pd.to_datetime(df['epoch'], format='%Y-%m-%dT%H:%M:%S.%f')

    # Timestamp of the following row *of the same satellite* (NaT for the
    # last row of each satellite).
    next_epochs = epochs.groupby(df['sat_id'], sort=False).shift(-1)
    gap_seconds = (next_epochs - epochs).dt.total_seconds()

    # NaN gaps compare False, so the last row per satellite is never dropped.
    # Selecting with a boolean mask (instead of computing labels as
    # "first label + position") works for any index, not only for
    # consecutive integer ids.
    is_duplicate = gap_seconds < eps
    return df[~is_duplicate]

In [3]:
# Folder with the input CSV files. The trailing slash is significant because
# file names are appended to this string by plain concatenation.
PATH_TO_DATA = os.path.join('data/')
os.listdir(PATH_TO_DATA)

# Read train and test (in that order), both indexed by their 'id' column.
# (Loading of 'pred_sgp.csv' is disabled in this notebook.)
full_train, full_test = (
    pd.read_csv(PATH_TO_DATA + csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv')
)

# Strip near-duplicate rows. The train frame is replaced by its cleaned
# version, while the test frame is kept intact and a cleaned copy is stored
# separately.
full_train = delete_duplicates(full_train)
full_test_wout_dup = delete_duplicates(full_test.copy())

In [19]:
from catboost import CatBoostRegressor

# For every satellite and every target column, fit a small CatBoost model
# that maps the simulated value (`<col>_sim`) to the true value (`<col>`) on
# the train rows of that satellite, then predict the de-duplicated test rows.
target_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
for col in target_columns:
    full_test_wout_dup[col] = 0.0

for sat_id in tqdm(full_test_wout_dup.sat_id.unique()):
    tr = full_train[full_train.sat_id == sat_id]
    # Build the mask from the de-duplicated frame itself. A mask taken from
    # `full_test` has a different length/index and only works through
    # implicit reindexing.
    test_satid_mask = full_test_wout_dup.sat_id == sat_id
    te = full_test_wout_dup[test_satid_mask]
    for col in target_columns:
        sim_col = col + '_sim'
        model = CatBoostRegressor(iterations=4,
                                  learning_rate=0.1,
                                  depth=2, verbose=False)
        # Pass explicit (n_samples, 1) feature matrices: a 1-D array is
        # ambiguous between "n samples of one feature" and "one sample of
        # n features".
        model.fit(tr[[sim_col]].values, tr[col].values)
        full_test_wout_dup.loc[test_satid_mask, col] = model.predict(te[[sim_col]].values)


# Rows removed from the test set as near-duplicates still need predictions:
# re-attach the predictions to the full test frame (missing rows become NaN)
# and fill them from neighbouring rows of the SAME satellite, so values never
# leak across satellite boundaries.
final_df = pd.concat([full_test, full_test_wout_dup[target_columns]], axis=1)
final_df[target_columns] = final_df.groupby('sat_id', sort=False)[target_columns].ffill()
# Fallback for a satellite whose first row(s) were removed: nothing precedes
# them, so take the next available prediction instead.
final_df[target_columns] = final_df.groupby('sat_id', sort=False)[target_columns].bfill()
# NOTE(review): delete_duplicates drops the earlier row of each close pair,
# so a backward fill would use the temporally closer neighbour - confirm
# before switching the primary fill direction.

# Save the predictions.
final_df[target_columns].to_csv('testing.csv', index_label='id')
# Sanity check before submitting: every count printed here should be 0.
print(final_df[target_columns].isna().sum())

HBox(children=(IntProgress(value=0, max=600), HTML(value='')))


x     0
y     0
z     0
Vx    0
Vy    0
Vz    0
dtype: int64


In [20]:
# Reference answers from the data folder, then the predictions that were
# saved to 'testing.csv' earlier in this notebook. Both are read with the
# default integer index.
test_ans = pd.read_csv(os.path.join(PATH_TO_DATA, 'ans.csv'))
pred = pd.read_csv('testing.csv')

In [21]:
# SMAPE for each target column. Arguments follow the (predicted, true) order
# of the signature; the metric is symmetric, so the values are unchanged.
smape(pred[target_columns], test_ans[target_columns])

x     0.656801
y     0.658765
z     0.659067
Vx    0.751258
Vy    0.752290
Vz    0.752018
dtype: float64

In [22]:
# Overall score: 100 * (1 - mean of the per-column SMAPE values).
column_errors = smape(pred[target_columns], test_ans[target_columns])
100 * (1 - np.mean(column_errors))

29.496663635551577