# Solution

In [1]:
%matplotlib inline

## Preparation

In [2]:
import pandas as pd

# Shared keyword arguments for every CSV read: parse `epoch` as a datetime.
io_params = dict(parse_dates=['epoch'])

train = pd.read_csv('data/train.csv', **io_params)

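Remove the duplicates: within each satellite, a row whose epoch is less than 60 seconds after the previous one is treated as a duplicate, and the non-simulated columns are shifted up by one row from that point on.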

In [3]:
import datetime as dt

import numpy as np
import tqdm

# Remember the original dtypes: shifting rows introduces NaN/NaT, which can
# change column dtypes, so they are restored once the cleaning is done.
dtypes = train.dtypes.to_dict()

# Every column except the simulated ones is shifted.
cols_to_shift = train.columns.difference(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'])

train_sats = []

# A single groupby pass replaces one full-frame `query` per satellite.
# `sort=False` keeps the satellites in order of first appearance, which is the
# order `train['sat_id'].unique()` would give.
for sat_id, g in tqdm.tqdm(train.groupby('sat_id', sort=False), position=0):

    g = g.copy()
    # Rows recorded less than 60 seconds after the previous row are duplicates.
    dups = g[g['epoch'].diff() < dt.timedelta(seconds=60)].index

    # Process duplicates from last to first: from each duplicate onwards, pull
    # the non-simulated columns up by one row.
    for i in reversed(dups):
        g.loc[i:, cols_to_shift] = g.loc[i:, cols_to_shift].shift(-1)

    # Rows emptied by the shifts have a null `x`; drop them.
    g = g.drop(g[g['x'].isnull()].index)
    # Relative position of each remaining row within the satellite, in (0, 1].
    # Plain numpy is used because the `pd.np` alias was removed in pandas 2.0.
    g['percent'] = np.arange(1, len(g) + 1) / len(g)

    train_sats.append(g)

train = pd.concat(train_sats).astype(dtypes)

100%|██████████| 600/600 [00:10<00:00, 56.62it/s]


Merge train and test.

In [4]:
test = pd.read_csv('data/Track 1/test.csv', **io_params)

# Stack both sets. Rows with a non-null `x` are flagged as training rows
# (test rows are presumably missing `x` -- confirm against the test file).
data = pd.concat((train, test), sort=False)
data['is_train'] = data['x'].notnull()
data = data.sort_values(['sat_id', 'epoch'])

# A satellite is part of track 1 when at least one of its rows is a non-training row.
track_1_sats = data.loc[~data['is_train'], 'sat_id'].unique()
data['is_track_1'] = data['sat_id'].isin(track_1_sats)

Implement SMAPE.

In [5]:
import numpy as np

def smape(y_true, y_pred):
    """Return the mean of |y_pred - y_true| / (|y_pred| + |y_true|).

    Both arguments must support element-wise arithmetic (scalars, numpy
    arrays or pandas Series). No special handling is applied where both
    values are zero.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_pred) + np.abs(y_true)
    return np.mean(np.abs(abs_error / scale))

In [6]:
# Position of each row within its satellite's series, modulo 24; the models
# below are fitted separately for every (sat_id, cycle_index) pair.
# `cumcount` numbers the rows of each group in their current order, so this
# does not rely on `data` being sorted by `sat_id` and, unlike
# `groupby.apply`, cannot come back as a 2-D frame when all satellites happen
# to have the same number of rows. `.to_numpy()` makes the assignment positional.
data['cycle_index'] = data.groupby('sat_id').cumcount().to_numpy() % 24
# Epoch as whole seconds: the integer view of the datetimes divided by 1e9
# (assumes nanosecond-resolution datetimes -- confirm on newer pandas versions).
# Plain numpy is used because the `pd.np` alias was removed in pandas 2.0.
data['unix'] = data['epoch'].astype(np.int64) // 10 ** 9

## Local validation

In [7]:
from numpy import polynomial
import tqdm
import functools

preds = []

train = data.query('is_train')

# Rows in the first 60% of each satellite's series (`percent < .6`) are used
# for fitting; the remaining rows are held out for validation.
groups = train.assign(is_fit=train.eval('percent < .6')).groupby(['sat_id', 'cycle_index'])

for (sat, cycle_index), g in tqdm.tqdm(groups, position=0):

    fit = g.query('is_fit')
    val = g.query('not is_fit')

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Degree-2 polynomial fitted on the last 20 fitting points of this
        # (satellite, cycle index) series. `np.polyfit` returns the
        # coefficients with the highest power first.
        # Plain numpy is used because the `pd.np` alias was removed in pandas 2.0.
        poly = np.polyfit(
            x = fit['unix'][-20:],
            y = fit[var][-20:],
            deg = 2
        )

        # Evaluate the polynomial on the validation timestamps by summing
        # coeff * t ** power, starting from the constant term (hence `reversed`).
        pred = functools.reduce(
            np.add,
            (coeff * val['unix'] ** i for i, coeff in enumerate(reversed(poly)))
        )

        # Long format: one row per (epoch, variable).
        preds.append(pd.DataFrame({
            'sat_id': sat,
            'epoch': val['epoch'],
            'y_true': val[var],
            'y_pred': pred,
            'variable': var
        }))

preds = pd.concat(preds)









100%|██████████| 14400/14400 [08:04<00:00, 29.69it/s]


In [8]:
# Order the validation predictions by satellite, then chronologically.
preds = preds.sort_values(by=['sat_id', 'epoch'])

In [9]:
# One SMAPE per (satellite, variable) pair, then their unweighted mean.
per_sat_variable = preds.groupby(['sat_id', 'variable'])
smapes = per_sat_variable.apply(lambda g: smape(g['y_true'], g['y_pred']))
mean_smape = smapes.mean()
# Displayed as the cell output: the mean SMAPE expressed as a percentage score.
100 * (1 - mean_smape)

97.06443405110346

Save the validation SMAPEs for further comparison and blending with other methods.

In [10]:
smapes.rename('smape').to_csv('results/cr_val_scores.csv', header=True)
!head results/cr_val_scores.csv

sat_id,variable,smape
0,Vx,0.0004091898686914077
0,Vy,0.002835100808407995
0,Vz,0.0016378676925001023
0,x,0.0016415028743984842
0,y,0.0016552806226684978
0,z,0.001892994154613214
1,Vx,0.0029829622442696557
1,Vy,0.002032868901332287
1,Vz,0.002135296465048415


## Track 1

Make predictions for the test set.

In [None]:
from numpy import polynomial
import tqdm
import functools

preds = []

data_track_1 = data.query('is_track_1')

for (sat, cycle_index), g in tqdm.tqdm(data_track_1.groupby(['sat_id', 'cycle_index']), position=0):

    # Deliberately not named `train` / `test`: those names hold the
    # notebook-level frames, and overwriting them with one group's slice
    # would break re-running earlier cells.
    known = g.query('is_train')
    to_predict = g.query('not is_train')

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Degree-2 polynomial fitted on the last 20 training points of this
        # (satellite, cycle index) series. `np.polyfit` returns the
        # coefficients with the highest power first.
        # Plain numpy is used because the `pd.np` alias was removed in pandas 2.0.
        poly = np.polyfit(
            x = known['unix'][-20:],
            y = known[var][-20:],
            deg = 2
        )

        # Evaluate the polynomial on the test timestamps by summing
        # coeff * t ** power, starting from the constant term (hence `reversed`).
        pred = functools.reduce(
            np.add,
            (coeff * to_predict['unix'] ** i for i, coeff in enumerate(reversed(poly)))
        )

        # Long format: one row per (id, variable).
        preds.append(pd.DataFrame({
            'sat_id': sat,
            'epoch': to_predict['epoch'],
            'id': to_predict['id'],
            'y_pred': pred,
            'variable': var
        }))

preds = pd.concat(preds)

100%|██████████| 7200/7200 [03:10<00:00, 37.71it/s]


The predictions are in long format (one row per variable), so we pivot them back to wide format (one column per variable).

In [None]:
# Per satellite, turn the long predictions into one row per (id, epoch) with
# one column per variable; `reset_index` brings `sat_id`, `id` and `epoch`
# back as regular columns.
preds = (
    preds
    .groupby('sat_id')
    .apply(lambda g: g.pivot_table(values='y_pred', index=['id', 'epoch'], columns='variable'))
    .reset_index()
)
preds.head()

In [None]:
# Order the wide predictions by row id.
preds = preds.sort_values(by=['id'])

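Take into account the shifts: at every epoch that is less than 60 seconds after the previous one, shift the predictions down by one row and forward-fill the gap. This is the inverse of the de-duplication applied to the training data.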

In [None]:
cols_to_shift = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
min_gap = dt.timedelta(seconds=60)

shifted_groups = []

for _, sat_preds in tqdm.tqdm(preds.groupby('sat_id'), position=0):

    sat_preds = sat_preds.copy()
    # Rows whose epoch is less than 60 seconds after the previous row.
    too_close = sat_preds['epoch'].diff() < min_gap

    # In order, push the predictions one row down from each such row onwards;
    # the row itself is left empty by the shift.
    for label in sat_preds[too_close].index:
        tail = sat_preds.loc[label:, cols_to_shift]
        sat_preds.loc[label:, cols_to_shift] = tail.shift()

    # Fill the emptied rows with the prediction of the row before them.
    sat_preds[cols_to_shift] = sat_preds[cols_to_shift].ffill()

    shifted_groups.append(sat_preds)

correct_preds = pd.concat(shifted_groups)

Save the predictions for track 1. 

In [None]:
correct_preds[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('results/cr_track_1.csv', index=False)
!head -5 results/cr_track_1.csv

## Track 2

#### Work in progress