# Solution

In [22]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this notebook — confirm the magic is still needed.
%matplotlib inline

## Preparation

In [23]:
import pandas as pd

# Keyword arguments reused for the train and test CSV reads:
# the `epoch` column is parsed into datetimes at load time.
io_params = dict(parse_dates=['epoch'])

train = pd.read_csv('data/train.csv', **io_params)

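Remove the duplicates: within each satellite, a row whose epoch is less than 60 seconds after the previous row is treated as a duplicate.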

In [24]:
import datetime as dt
import tqdm

# `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0; use numpy directly.
import numpy as np

# Remember the original column dtypes: shifting introduces NaN, which upcasts
# integer columns, so the dtypes are restored after the rows are dropped.
dtypes = train.dtypes.to_dict()

# Every column except the simulated ones is shifted up when a duplicate is removed.
cols_to_shift = train.columns.difference(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'])

train_sats = []

for sat_id in tqdm.tqdm(train['sat_id'].unique(), position=0):

    g = train.query('sat_id == @sat_id').copy()

    # A row is a duplicate when it follows the previous row by less than 60 seconds.
    dups = g[g['epoch'].diff() < dt.timedelta(seconds=60)].index

    # Overwrite each duplicate with the rows that follow it (shift up by one).
    # Duplicates are processed last-to-first, presumably so that the labels of
    # earlier duplicates still point at unshifted rows.
    for i in reversed(dups):
        g.loc[i:, cols_to_shift] = g.loc[i:, cols_to_shift].shift(-1)

    # Each shift leaves one trailing row without values; drop those rows.
    g = g.drop(g[g['x'].isnull()].index)

    # Relative position of each observation within its satellite, in (0, 1].
    g['percent'] = np.arange(1, len(g) + 1) / len(g)

    train_sats.append(g)

train = pd.concat(train_sats).astype(dtypes)

100%|██████████| 600/600 [00:05<00:00, 117.62it/s]


Merge train and test.

In [25]:
test = pd.read_csv('data/Track 1/test.csv', **io_params)

# Stack train and test; rows with a known `x` are flagged as training rows.
data = pd.concat((train, test), sort=False)
data = data.assign(is_train=data['x'].notnull()).sort_values(['sat_id', 'epoch'])

# A satellite belongs to track 1 when it has at least one non-training row.
track_1_sat_ids = data.query('not is_train')['sat_id'].unique()
data['is_track_1'] = data['sat_id'].isin(track_1_sat_ids)

Implement SMAPE.

In [26]:
import numpy as np

def smape(y_true, y_pred):
    """Return the mean of |y_pred - y_true| / (|y_pred| + |y_true|).

    The result lies in [0, 1] for finite inputs; an element where both values
    are zero yields NaN (0 / 0), which propagates through ``np.mean``.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = np.abs(y_pred) + np.abs(y_true)
    return np.mean(abs_error / scale)

In [27]:
# Position of each observation within its satellite, modulo 24.
# `cumcount` numbers the rows of every group in their current order, so the
# result is attached row by row and does not depend on `data` being sorted by
# `sat_id` (the previous positional assignment did).
data['cycle_index'] = data.groupby('sat_id').cumcount().to_numpy() % 24

# Epoch as whole seconds (datetime64[ns] -> integer nanoseconds -> seconds).
data['unix'] = data['epoch'].astype(np.int64) // 10 ** 9

## Local validation

In [28]:
from numpy import polynomial  # not referenced in this cell
import tqdm
import functools

preds = []

train = data.query('is_train')

# The first 60% of each satellite's observations are used for fitting,
# the remainder for validation.
train_flagged = train.assign(is_fit=train.eval('percent < .6'))

# One quadratic per (satellite, cycle position, variable).
for (sat, cycle_index), g in tqdm.tqdm(train_flagged.groupby(['sat_id', 'cycle_index']), position=0):

    fit = g.query('is_fit')
    val = g.query('not is_fit')

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Fit on the 20 most recent fitting points only.
        # `pd.np` was removed in pandas 2.0, so numpy is called directly.
        # NOTE(review): `unix` holds raw epoch seconds, so the fit may be poorly
        # conditioned; centring the time axis might help — TODO confirm.
        poly = np.polyfit(
            x = fit['unix'][-20:],
            y = fit[var][-20:],
            deg = 2
        )

        # Evaluate the polynomial on the validation epochs; `np.polyfit`
        # returns the highest-degree coefficient first, hence `reversed`.
        pred = functools.reduce(
            np.add,
            (coeff * val['unix'] ** i for i, coeff in enumerate(reversed(poly)))
        )

        # Long ("melted") layout: one row per (epoch, variable).
        preds.append(pd.DataFrame({
            'sat_id': sat,
            'epoch': val['epoch'],
            'y_true': val[var],
            'y_pred': pred,
            'variable': var
        }))

preds = pd.concat(preds)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [29]:
# Order the validation predictions by satellite, then chronologically.
preds = preds.sort_values(by=['sat_id', 'epoch'])

In [30]:
def _group_smape(group):
    """SMAPE between the true and predicted values of one (satellite, variable) group."""
    return smape(group['y_true'], group['y_pred'])


# One SMAPE per (satellite, variable), then the overall score as a percentage.
smapes = preds.groupby(['sat_id', 'variable']).apply(_group_smape)
mean_smape = smapes.mean()
100 * (1 - mean_smape)

97.06443405110346

Save the validation SMAPEs for further comparison and blending with other methods.

In [31]:
smapes.rename('smape').to_csv('results/cr_val_scores.csv', header=True)
!head results/cr_val_scores.csv

sat_id,variable,smape
0,Vx,0.00040918986869140734
0,Vy,0.0028351008084079947
0,Vz,0.0016378676925001016
0,x,0.0016415028743984846
0,y,0.0016552806226684984
0,z,0.0018929941546132146
1,Vx,0.002982962244269659
1,Vy,0.002032868901332287
1,Vz,0.002135296465048417


## Use positions to estimate speeds

In [32]:
def _wide_by_satellite(long_frame, value_col):
    """Pivot a melted frame to one row per (sat_id, epoch) and one column per variable."""
    def pivot_one(sat_rows):
        return sat_rows.pivot_table(index=['epoch'], columns='variable', values=value_col)

    return long_frame.groupby('sat_id').apply(pivot_one).reset_index()


# Both frames are built from the melted `preds` before it is overwritten.
truth_val = _wide_by_satellite(preds, 'y_true')
preds = _wide_by_satellite(preds, 'y_pred')
preds.head()

variable,sat_id,epoch,Vx,Vy,Vz,x,y,z
0,0,2014-01-19 14:08:39.284999936,-1.526557,0.053883,-3.521702,3368.145826,29827.743875,6898.613352
1,0,2014-01-19 14:55:22.286000128,-1.57412,-1.214126,-3.594427,-1019.0794,28256.233757,-3172.228857
2,0,2014-01-19 15:42:05.286000128,-1.400235,-2.600162,-3.153003,-5248.107781,22903.676903,-12768.012787
3,0,2014-01-19 16:28:48.287000064,-0.966743,-3.74341,-2.119939,-8621.561703,13904.950721,-20284.345625
4,0,2014-01-19 17:15:31.287000064,-0.366273,-4.26288,-0.72,-10508.44928,2512.221109,-24302.541487


In [33]:
speed_preds = preds.copy()

# Forward time step t[i+1] - t[i] in seconds, computed within each satellite.
# The position step below is also a forward difference, so both sides of the
# ratio now refer to the same interval (previously the time step was the
# backward one, t[i] - t[i-1]).
forward_seconds = -preds.groupby('sat_id')['epoch'].diff(-1).dt.total_seconds()

# The last row of a satellite has no successor; reuse its previous step.
preds["time_delta"] = forward_seconds.groupby(preds['sat_id']).ffill()

for var in ('x', 'y', 'z'):
    # Forward position step x[i+1] - x[i], never crossing a satellite boundary.
    forward_step = -preds.groupby('sat_id')[var].diff(-1)

    # A zero time step would give +/-inf; treat it as missing instead.
    speed = (forward_step / forward_seconds).replace([np.inf, -np.inf], np.nan)

    # Fill missing values from the previous row of the same satellite, and fall
    # back to the fitted velocity when the satellite has no usable step at all.
    speed = speed.groupby(preds['sat_id']).ffill()
    speed_preds[f"V{var}"] = speed.fillna(speed_preds[f"V{var}"])

speed_preds.head()

variable,sat_id,epoch,Vx,Vy,Vz,x,y,z
0,0,2014-01-19 14:08:39.284999936,-1.565189,-0.560653,-3.592879,3368.145826,29827.743875,6898.613352
1,0,2014-01-19 14:55:22.286000128,-1.50875,-1.909581,-3.423397,-1019.0794,28256.233757,-3172.228857
2,0,2014-01-19 15:42:05.286000128,-1.203515,-3.210391,-2.681532,-5248.107781,22903.676903,-12768.012787
3,0,2014-01-19 16:28:48.287000064,-0.673167,-4.064476,-1.433534,-8621.561703,13904.950721,-20284.345625
4,0,2014-01-19 17:15:31.287000064,-0.064067,-4.224379,-0.024213,-10508.44928,2512.221109,-24302.541487


In [34]:
# SMAPE of the position-derived speeds, one single-row frame per (satellite, variable).
score_frames = []
for sat, sat_speeds in speed_preds.groupby('sat_id'):
    sat_truth = truth_val[truth_val['sat_id'] == sat]
    # Values are compared positionally, so both frames must share the same row order.
    score_frames.extend(
        pd.DataFrame({
            'sat_id': [sat],
            'variable': [var],
            'smape': [smape(sat_truth[var].values, sat_speeds[var].values)]
        })
        for var in ('Vx', 'Vy', 'Vz')
    )
list_smape = pd.concat(score_frames)
list_smape.head()

Unnamed: 0,sat_id,variable,smape
0,0,Vx,0.13993
0,0,Vy,0.16322
0,0,Vz,0.153033
0,1,Vx,0.15407
0,1,Vy,0.149159


In [35]:
list_smape.to_csv('results/cr_speed_val_scores.csv', header=True, index=False)
!head results/ar_speed_val_scores.csv

sat_id,variable,smape
0,Vx,0.1405427885947127
0,Vy,0.16408502260307983
0,Vz,0.15547919421757386
1,Vx,0.15328904162060672
1,Vy,0.14969053004583935
1,Vz,0.1665692143367779
2,Vx,0.1543306255096227
2,Vy,0.15838883756807873
2,Vz,0.16928327067205654


## Track 1

Make predictions for the test set.

In [36]:
from numpy import polynomial  # not referenced in this cell
import tqdm
import functools

preds = []

data_track_1 = data.query('is_track_1')

# One quadratic per (satellite, cycle position, variable), fitted on the
# training rows and evaluated on the test rows of the same group.
for (sat, cycle_index), g in tqdm.tqdm(data_track_1.groupby(['sat_id', 'cycle_index']), position=0):

    train = g.query('is_train')

    test = g.query('not is_train')

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Fit on the 20 most recent training points only.
        # `pd.np` was removed in pandas 2.0, so numpy is called directly.
        poly = np.polyfit(
            x = train['unix'][-20:],
            y = train[var][-20:],
            deg = 2
        )

        # Evaluate the polynomial on the test epochs; `np.polyfit` returns the
        # highest-degree coefficient first, hence `reversed`.
        pred = functools.reduce(
            np.add,
            (coeff * test['unix'] ** i for i, coeff in enumerate(reversed(poly)))
        )

        # Long ("melted") layout: one row per (id, variable).
        preds.append(pd.DataFrame({
            'sat_id': sat,
            'epoch': test['epoch'],
            'id': test['id'],
            'y_pred': pred,
            'variable': var
        }))

preds = pd.concat(preds)

100%|██████████| 7200/7200 [01:26<00:00, 83.40it/s]


The predictions are melted, so we unmelt them.

In [37]:
def _pivot_satellite(sat_rows):
    """Turn one satellite's melted predictions into one column per variable."""
    return sat_rows.pivot_table(index=['id', 'epoch'], columns='variable', values='y_pred')


preds = preds.groupby('sat_id').apply(_pivot_satellite).reset_index()
preds.head()

variable,sat_id,id,epoch,Vx,Vy,Vz,x,y,z
0,1,3927,2014-02-01 00:01:45.162,2.614692,-1.303693,1.087024,-24789.05028,-10910.896514,6571.488458
1,1,3928,2014-02-01 00:22:57.007,3.219616,-0.994266,0.895641,-21084.266313,-12384.384855,7840.844138
2,1,3929,2014-02-01 00:44:08.852,3.878331,-0.539306,0.601573,-16576.379677,-13378.903431,8806.632131
3,1,3930,2014-02-01 01:05:20.697,4.583375,0.157665,0.134369,-11198.551494,-13654.939244,9298.664429
4,1,3931,2014-02-01 01:26:32.542,5.251987,1.294445,-0.654109,-4929.284203,-12794.30095,9014.54363


In [38]:
# Order the predictions by row id.
preds = preds.sort_values(by='id')

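Take the shifts into account: wherever two consecutive epochs of a satellite are less than 60 seconds apart, the predictions from that row onward are moved down by one row and the gap is forward-filled.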

In [39]:
correct_preds = []

cols_to_shift = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

# Two consecutive epochs closer than this are treated as a duplicate.
min_gap = dt.timedelta(seconds=60)

for _, sat_preds in tqdm.tqdm(preds.groupby('sat_id'), position=0):

    sat_preds = sat_preds.copy()
    is_duplicate = sat_preds['epoch'].diff() < min_gap
    duplicate_labels = sat_preds[is_duplicate].index

    # From each duplicate onward, move the predicted values down by one row.
    for label in duplicate_labels:
        tail = sat_preds.loc[label:, cols_to_shift]
        sat_preds.loc[label:, cols_to_shift] = tail.shift()

    # Each shift leaves a NaN at the duplicate row; fill it from the row above.
    sat_preds[cols_to_shift] = sat_preds[cols_to_shift].ffill()

    correct_preds.append(sat_preds)

correct_preds = pd.concat(correct_preds)

100%|██████████| 300/300 [00:01<00:00, 243.36it/s]


Save the predictions for track 1. 

In [40]:
correct_preds[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('results/cr_track_1.csv', index=False)
!head -5 results/cr_track_1.csv

id,x,y,z,Vx,Vy,Vz
3927,-24789.05027998984,-10910.896513611078,6571.48845821619,2.6146917225269135,-1.3036927995926817,1.0870237536037166
3928,-21084.266313269734,-12384.384855419397,7840.844138085842,3.2196161400788696,-0.994265988618281,0.8956414519652753
3929,-16576.379676550627,-13378.903430730104,8806.632130682468,3.878330961902975,-0.5393058004374325,0.6015725682773336
3930,-11198.55149435997,-13654.939244061708,9298.664428770542,4.583375454589259,0.15766528075982933,0.1343691758265777


In [41]:
speed_preds = correct_preds.copy()

# Forward time step t[i+1] - t[i] in seconds, computed within each satellite.
# The position step below is also a forward difference, so both sides of the
# ratio now refer to the same interval (previously the time step was the
# backward one, t[i] - t[i-1], which is tiny at a near-duplicate epoch).
forward_seconds = -correct_preds.groupby('sat_id')['epoch'].diff(-1).dt.total_seconds()

# The last row of a satellite has no successor; reuse its previous step.
correct_preds["time_delta"] = forward_seconds.groupby(correct_preds['sat_id']).ffill()

for var in ('x', 'y', 'z'):
    # Forward position step x[i+1] - x[i], never crossing a satellite boundary.
    forward_step = -correct_preds.groupby('sat_id')[var].diff(-1)

    # A zero time step would give +/-inf; treat it as missing instead.
    speed = (forward_step / forward_seconds).replace([np.inf, -np.inf], np.nan)

    # Fill missing values from the previous row of the same satellite, and fall
    # back to the fitted velocity when the satellite has no usable step at all.
    speed = speed.groupby(correct_preds['sat_id']).ffill()
    speed_preds[f"V{var}"] = speed.fillna(speed_preds[f"V{var}"])

speed_preds.head()

variable,sat_id,id,epoch,Vx,Vy,Vz,x,y,z
0,1,3927,2014-02-01 00:01:45.162,2.912921,-1.158544,0.998043,-24789.05028,-10910.896514,6571.488458
1,1,3928,2014-02-01 00:22:57.007,3.544368,-0.78195,0.75936,-21084.266313,-12384.384855,7840.844138
2,1,3929,2014-02-01 00:44:08.852,4.228368,-0.217036,0.386865,-16576.379677,-13378.903431,8806.632131
3,1,3930,2014-02-01 01:05:20.697,4.92927,0.676685,-0.223393,-11198.551494,-13654.939244,9298.664429
4,1,3931,2014-02-01 01:26:32.542,5.426827,2.187555,-1.292843,-4929.284203,-12794.30095,9014.54363


In [42]:
speed_preds[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('results/cr_speed_track_1.csv', index=False)
!head -5 results/ar_speed_track_1.csv

id,x,y,z,Vx,Vy,Vz
3927,-24791.216496424422,-10910.678757601636,6570.591142685116,2.9124540854274974,-1.1587281062144,0.9981347743824968
3928,-21087.026330143886,-12384.40130584989,7840.063864809623,3.5438241515280344,-0.7823588968926579,0.7595780636097552
3929,-16579.831302143713,-13379.440557068332,8806.129427121372,4.227794213355123,-0.21741349460546422,0.3871414559495984
3930,-11202.732370859067,-13655.956823114819,9298.513352163589,4.928669237259334,0.6757960556273876,-0.22268368038897257
