# Solution

In [1]:
%matplotlib inline

## Preparation

In [2]:
import pandas as pd

# Keyword arguments shared by every CSV read in this notebook:
# the 'epoch' column is parsed into datetimes at load time.
io_params = dict(parse_dates=['epoch'])

train = pd.read_csv('data/train.csv', **io_params)

Remove the duplicates.

In [3]:
import datetime as dt

import numpy as np
import tqdm

# Remember the original column dtypes: the shifting below introduces NaNs,
# so the dtypes are restored once all the satellites have been processed.
dtypes = train.dtypes.to_dict()

# Every column except the simulated ones is moved when a duplicate is found
cols_to_shift = train.columns.difference(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'])

train_sats = []

for sat_id in tqdm.tqdm(train['sat_id'].unique(), position=0):

    g = train.query('sat_id == @sat_id').copy()

    # A row is a duplicate when it comes less than 60 seconds after the previous one
    dups = g[g['epoch'].diff() < dt.timedelta(seconds=60)].index

    # Go through the duplicates from last to first; each time, pull the
    # non-simulated columns up by one row from the duplicate onwards.
    for i in reversed(dups):
        g.loc[i:, cols_to_shift] = g.loc[i:, cols_to_shift].shift(-1)

    # Shifting up leaves rows without values at the bottom, so drop them
    g = g.drop(g[g['x'].isnull()].index)

    # Relative position of each row within its satellite, in (0, 1].
    # Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0
    # and removed in pandas 2.0.
    g['percent'] = np.arange(1, len(g) + 1) / len(g)

    train_sats.append(g)

train = pd.concat(train_sats).astype(dtypes)

100%|██████████| 600/600 [00:17<00:00, 35.14it/s]


Merge train and test.

In [4]:
test = pd.read_csv('data/Track 1/test.csv', **io_params)

# Stack both sets, flag the rows that have an `x` value as training rows,
# and order everything by satellite and then by time.
data = (
    pd.concat((train, test), sort=False)
    .assign(is_train=lambda frame: frame['x'].notnull())
    .sort_values(['sat_id', 'epoch'])
)

# A satellite belongs to track 1 when at least one of its rows is not a training row
data['is_track_1'] = data['sat_id'].isin(data.loc[~data['is_train'], 'sat_id'].unique())

Implement SMAPE.

In [5]:
import numpy as np

def smape(y_true, y_pred):
    """Return the mean of |y_pred - y_true| / (|y_pred| + |y_true|).

    Both arguments are array-likes of the same shape. No special handling
    is applied where both values are zero.
    """
    abs_error = np.abs(y_pred - y_true)
    magnitude = np.abs(y_pred) + np.abs(y_true)
    return np.mean(abs_error / magnitude)

Implement a generic auto-regressive model.

In [6]:
class ARModel:
    """Generic auto-regressive forecaster of order ``p``.

    The wrapped ``model`` is any object exposing ``fit(X, y)`` and
    ``predict(X)``. It is trained to predict a value of a series from the
    ``p`` values that precede it, and forecasting feeds each prediction back
    in as the most recent observation.

    Parameters
    ----------
    p : int
        Number of past values used as features.
    model : object
        Regressor with ``fit(X, y)`` and ``predict(X)`` methods.
    """

    def __init__(self, p, model):
        self.p = p
        self.model = model

    def fit(self, path):
        """Fit the wrapped model on a 1-D series.

        Parameters
        ----------
        path : array-like of shape (n,)
            The observed series, oldest value first. Must contain more than
            ``p`` values.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``path`` is not one-dimensional or has ``p`` values or fewer.
        """

        path = np.asarray(path)

        if path.ndim != 1:
            raise ValueError(f'path must be one-dimensional, got {path.ndim} dimensions')
        if len(path) <= self.p:
            raise ValueError(f'path must contain more than p={self.p} values, got {len(path)}')

        # Work in floating point: with an integer history, the forecasts
        # written back into the window would be silently truncated.
        if not np.issubdtype(path.dtype, np.floating):
            path = path.astype(float)

        # Row i of X is path[i:i + p] and its target is path[i + p].
        # Plain index arithmetic builds the same matrix as a strided view,
        # without ever describing memory outside of the buffer.
        lags = np.arange(len(path) - self.p)[:, None] + np.arange(self.p)
        X = path[lags]
        Y = path[self.p:]

        # Save the most recent history for later usage
        # Conceptually history is a list, but we give it an extra dimension because sklearn eats matrices
        # The copy keeps it independent from the caller's array
        self.history = path[-self.p:].reshape(1, -1).copy()

        self.model.fit(X, Y)

        return self

    def forecast(self, steps):
        """Forecast the next ``steps`` values after the fitted series.

        The stored history is not modified, so the method can be called
        repeatedly and always starts from the end of the fitted series.

        Returns
        -------
        numpy.ndarray of shape (steps,)
        """

        history = self.history.copy()
        predictions = np.empty(steps)

        for i in range(steps):

            y_pred = self.model.predict(history)[0]
            predictions[i] = y_pred

            # Shift forward (faster than np.roll)
            history[0, :-1] = history[0, 1:]
            history[0, -1] = y_pred

        return predictions

## Local validation

In [7]:
import sklearn
from sklearn import compose
from sklearn import linear_model
from sklearn import pipeline
from sklearn import preprocessing
import tqdm

# Accumulates one DataFrame of validation forecasts per (satellite, variable);
# the list is concatenated into a single frame at the end of this cell.
preds = []


class Pipeline:
    """Barebones implementation with less overhead than sklearn.

    Every step but the last is a transformer exposing ``fit_transform`` and
    ``transform``; the last step is an estimator exposing ``fit`` and
    ``predict``.
    """

    def __init__(self, *steps):
        self.steps = steps

    def fit(self, X, y):
        """Fit each transformer in turn, then the final estimator; return self."""
        n_transformers = len(self.steps) - 1
        for position in range(n_transformers):
            X = self.steps[position].fit_transform(X, y)
        self.steps[-1].fit(X, y)
        return self

    def predict(self, X):
        """Apply the fitted transformers in turn, then predict with the final estimator."""
        n_transformers = len(self.steps) - 1
        for position in range(n_transformers):
            X = self.steps[position].transform(X)
        return self.steps[-1].predict(X)


class StandardScaler(preprocessing.StandardScaler):
    """Barebones implementation with less overhead than sklearn.

    Fitting is inherited unchanged; only ``transform`` is replaced by a bare
    NumPy expression. This assumes the default ``with_mean=True`` and
    ``with_std=True`` configuration.
    """

    def transform(self, X):
        """Standardise ``X`` with the statistics learned during ``fit``.

        The divisor is ``scale_`` rather than ``var_ ** .5``: sklearn stores
        the standard deviation in ``scale_`` with zeros replaced by one, so a
        constant feature maps to zero instead of dividing by zero.
        """
        return (X - self.mean_) / self.scale_
    
    
class LinearRegression(linear_model.LinearRegression):
    """Barebones implementation with less overhead than sklearn.

    Fitting is inherited unchanged; only ``predict`` is replaced by a bare
    NumPy expression.
    """

    def predict(self, X):
        """Return the linear prediction for ``X`` from the fitted coefficients.

        ``coef_`` is transposed so that a multi-target fit, where ``coef_``
        has shape (n_targets, n_features), is handled as well. For a single
        target ``coef_`` is one-dimensional and the transpose is a no-op.
        """
        return np.dot(X, self.coef_.T) + self.intercept_

    
# Forecaster shared by the rest of the notebook: each value of a series is
# regressed on the 48 values that precede it, after standardising them.
# NOTE: this single instance is re-fitted for every satellite and variable.
model = ARModel(
    p=48,
    model=Pipeline(
        StandardScaler(),
        LinearRegression()
    )
)

# Only the training rows have ground truth to validate against
train = data[data['is_train']]

for sat, g in tqdm.tqdm(
    train.assign(is_fit=train['percent'] < .6).groupby('sat_id'),
    position=0
):

    # Rows whose `percent` is below 0.6 are used for fitting, the rest is held out
    fit = g[g['is_fit']]
    val = g[~g['is_fit']]

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Fit on the first part of the series and forecast the held-out part
        model.fit(fit[var].to_numpy())
        pred = model.forecast(len(val))

        preds.append(pd.DataFrame({
            'sat_id': sat,
            'epoch': val['epoch'],
            'y_true': val[var],
            'y_pred': pred,
            'variable': var
        }))

preds = pd.concat(preds)

100%|██████████| 600/600 [00:40<00:00, 14.99it/s]


In [8]:
# One SMAPE per (satellite, variable) pair
smapes = (
    preds
    .groupby(['sat_id', 'variable'])
    .apply(lambda pair: smape(pair['y_true'], pair['y_pred']))
)
mean_smape = smapes.mean()

# Score out of 100, higher is better
100 * (1 - mean_smape)

97.57179531257634

Save the validation SMAPEs for further comparison and blending with other methods.

In [9]:
# The series is named 'smape' so that the value column gets a header;
# the (sat_id, variable) index levels are written as the leading columns.
smapes.rename('smape').to_csv('results/ar_val_scores.csv', header=True)
!head results/ar_val_scores.csv

sat_id,variable,smape
0,Vx,0.000597070177374541
0,Vy,0.00036367390081294725
0,Vz,0.0001728272296867911
0,x,0.0017613854434870993
0,y,0.00025717133886025544
0,z,0.0006687014398263775
1,Vx,0.0031092783186345996
1,Vy,0.0009305197946845225
1,Vz,0.003728780888035956


## Track 1

Make predictions for the test set.

In [10]:
preds = []

# Training and test rows of the satellites that appear in the track 1 test set
train_sats = data[data['is_train'] & data['is_track_1']]
test_sats = data[~data['is_train'] & data['is_track_1']]

for sat in tqdm.tqdm(test_sats['sat_id'].unique(), position=0):

    train = train_sats[train_sats['sat_id'] == sat]
    test = test_sats[test_sats['sat_id'] == sat]

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Fit on the whole training series and forecast one value per test row
        model.fit(train[var].to_numpy())
        pred = model.forecast(len(test))

        # Long format: one row per test row and variable
        preds.append(test[['sat_id', 'id', 'epoch']].assign(y_pred=pred, variable=var))

preds = pd.concat(preds)
preds.head()

100%|██████████| 300/300 [00:34<00:00,  8.65it/s]


Unnamed: 0,sat_id,id,epoch,y_pred,variable
0,1,3927,2014-02-01 00:01:45.162,-24791.216496,x
1,1,3928,2014-02-01 00:22:57.007,-21087.02633,x
2,1,3929,2014-02-01 00:44:08.852,-16579.831302,x
3,1,3930,2014-02-01 01:05:20.697,-11202.732371,x
4,1,3931,2014-02-01 01:26:32.542,-4934.229045,x


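The predictions are in long ("melted") format, with one row per test row and variable, so we pivot them back to wide format with one column per variable.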

In [11]:
# Pivot satellite by satellite: one row per (id, epoch), one column per variable
preds = (
    preds
    .groupby('sat_id')
    .apply(lambda sat_preds: sat_preds.pivot_table(index=['id', 'epoch'], columns='variable', values='y_pred'))
    .reset_index()
)
preds.head()

variable,sat_id,id,epoch,Vx,Vy,Vz,x,y,z
0,1,3927,2014-02-01 00:01:45.162,2.61425,-1.303894,1.087091,-24791.216496,-10910.678758,6570.591143
1,1,3928,2014-02-01 00:22:57.007,3.219142,-0.994539,0.89578,-21087.02633,-12384.401306,7840.063865
2,1,3929,2014-02-01 00:44:08.852,3.87777,-0.539693,0.601826,-16579.831302,-13379.440557,8806.129427
3,1,3930,2014-02-01 01:05:20.697,4.582811,0.157078,0.134822,-11202.732371,-13655.956823,9298.513352
4,1,3931,2014-02-01 01:26:32.542,5.251436,1.293476,-0.653268,-4934.229045,-12796.448989,9015.294227


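Take into account the shifts: rows that come less than 60 seconds after the previous one were removed from the training data, so wherever such a row appears in the test set the predictions are shifted down by one row and the gap is filled with the previous prediction.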

In [12]:
# Re-align the forecasts with the test rows, one satellite at a time.
correct_preds = []

# Only the predicted columns move; id and epoch stay where they are
cols_to_shift = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

for _, g in tqdm.tqdm(preds.groupby('sat_id'), position=0):
    
    g = g.copy()
    # Rows that come less than 60 seconds after the previous row
    dups = g[g['epoch'].diff() < dt.timedelta(seconds=60)].index
    
    # From each such row onwards, push the predictions down by one row.
    # This leaves NaNs at that row, and the last prediction falls off the end.
    for i in dups:
        g.loc[i:, cols_to_shift] = g.loc[i:, cols_to_shift].shift()
    # Fill each gap with the prediction of the row just before it
    g[cols_to_shift] = g[cols_to_shift].ffill()
    
    correct_preds.append(g)
    
correct_preds = pd.concat(correct_preds)

100%|██████████| 300/300 [00:02<00:00, 107.25it/s]


Save the predictions for track 1. 

In [13]:
# Keep only the row id and the six predicted columns; the DataFrame index is not written
correct_preds[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('results/ar_track_1.csv', index=False)
!head -5 results/ar_track_1.csv

id,x,y,z,Vx,Vy,Vz
3927,-24791.2164964244,-10910.678757601678,6570.591142685114,2.6142500911099793,-1.3038936610685328,1.087090864077235
3928,-21087.026330143868,-12384.401305849937,7840.063864809641,3.21914249547442,-0.9945388414207295,0.8957800688453443
3929,-16579.83130214371,-13379.440557068334,8806.129427121368,3.8777700588719823,-0.5396932773126031,0.6018263508983139
3930,-11202.732370859037,-13655.956823114855,9298.513352163576,4.5828110631897845,0.1570784377916751,0.13482178502176748


## Track 2

In [14]:
import copy

# Fitted models, indexed first by satellite and then by variable
models = {}

train_sats = data[data['is_train'] & ~data['is_track_1']]  # is_track_2 = not is_track_1

for sat, g in tqdm.tqdm(train_sats.groupby('sat_id'), position=0):

    for col in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        path = g[col].to_numpy()
        model.fit(path)

        # The shared model is re-fitted at every iteration, so store an independent copy
        models.setdefault(sat, {})[col] = copy.deepcopy(model)

100%|██████████| 300/300 [00:06<00:00, 48.08it/s]


Save the models and the histories.

In [15]:
import joblib

# Each stored ARModel carries its fitted regressor and its `history` attribute,
# so the file is enough to forecast later without the training data.
joblib.dump(models, 'track_2/ar_models.pkl')
!du -h track_2/ar_models.pkl

4.8M	track_2/ar_models.pkl
