In [1]:
# Input: training set that is read into `train`.
PATH_TRAIN = '../../data/train_LPC_RP.csv'
# Input: test set that is read into `test`.
PATH_TEST = '../../data/test_LPC_RP_updated_sgp4.csv'
# Output: where the track 1 predictions are written.
PATH_SAVE = '../../data/sub_LPC_RP_data_o_plomo_updated_sgp4.csv'

# Solution

In [2]:
%matplotlib inline

## Preparation

In [3]:
import pandas as pd

# Keyword arguments shared by the CSV reads of this notebook: 'epoch' is parsed as a datetime.
io_params = dict(parse_dates=['epoch'])

train = pd.read_csv(PATH_TRAIN, **io_params)

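Remove the duplicates: within each satellite, a row whose epoch is less than 60 seconds after the previous row is treated as a duplicate.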

In [4]:
import datetime as dt
import tqdm

# `pd.np` was deprecated in pandas 1.0 and removed in pandas 2.0, so numpy is imported directly.
import numpy as np

# Remember the original dtypes; they are re-applied after the shifting below.
dtypes = train.dtypes.to_dict()

# Every column except the simulated ones is shifted when a duplicate is removed.
cols_to_shift = train.columns.difference(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'])

train_sats = []

for sat_id in tqdm.tqdm(train['sat_id'].unique(), position=0):

    g = train.query('sat_id == @sat_id').copy()
    # A row that comes less than 60 seconds after the previous one is flagged as a duplicate.
    dups = g[g['epoch'].diff() < dt.timedelta(seconds=60)].index

    # Duplicates are processed from the last one to the first one; each pass moves the
    # non-simulated columns up by one row from the duplicate onward.
    for i in reversed(dups):
        g.loc[i:, cols_to_shift] = g.loc[i:, cols_to_shift].shift(-1)

    # The shifts leave rows without an 'x' value; drop them.
    g = g.drop(g[g['x'].isnull()].index)
    # Relative position of each row inside its satellite's series, in (0, 1].
    g['percent'] = np.arange(1, len(g) + 1) / len(g)

    train_sats.append(g)

train = pd.concat(train_sats).astype(dtypes)

100%|██████████| 225/225 [00:01<00:00, 136.82it/s]


Merge train and test.

In [5]:
test = pd.read_csv(PATH_TEST, **io_params)

# Stack train and test, flag the rows that carry a non-null 'x' as training rows,
# and order everything chronologically within each satellite.
data = (
    pd.concat((train, test), sort=False)
    .assign(is_train=lambda df: df['x'].notnull())
    .sort_values(['sat_id', 'epoch'])
)

# A satellite belongs to track 1 when at least one of its rows is not a training row.
data['is_track_1'] = data['sat_id'].isin(data.loc[~data['is_train'], 'sat_id'].unique())

Implement SMAPE.

In [6]:
import numpy as np

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in [0, 1].

    Computes ``mean(|y_pred - y_true| / (|y_pred| + |y_true|))``.

    Terms where both the truth and the prediction are zero have a zero
    denominator; they are counted as a zero error instead of producing NaN.

    Parameters
    ----------
    y_true, y_pred : array-like
        Values that broadcast against each other. They are converted to float
        arrays, so pandas objects are compared by position, not by index label.

    Returns
    -------
    float
        The mean of the per-element errors.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_pred - y_true)
    den = np.abs(y_pred) + np.abs(y_true)

    # Only divide where the denominator is non-zero; the remaining terms stay at 0.
    ratio = np.zeros_like(num)
    nonzero = den != 0
    ratio[nonzero] = num[nonzero] / den[nonzero]

    return np.mean(ratio)

Implement a generic auto-regressive model.

In [7]:
class ARModel:
    """Generic auto-regressive forecaster built on top of any regressor.

    The wrapped ``model`` must expose ``fit(X, y)`` and ``predict(X)``. Each
    training sample is a window of ``p`` consecutive values and its target is
    the value that immediately follows the window.

    Parameters
    ----------
    p : int
        Number of lagged values used as features.
    model : object
        Regressor with ``fit(X, y)`` and ``predict(X)`` methods.
    """

    def __init__(self, p, model):
        self.p = p
        self.model = model

    def fit(self, path):
        """Fit the wrapped model on all the length-``p`` windows of ``path``.

        Parameters
        ----------
        path : array-like, 1-dimensional
            The series to learn from. It is converted to a float array so that
            forecasts fed back into the history are never truncated to integers.

        Returns
        -------
        ARModel
            ``self``.

        Raises
        ------
        ValueError
            If ``path`` is not 1-dimensional, if ``p`` is smaller than 1, or if
            ``path`` has ``p`` values or fewer (no window/target pair exists).
        """
        path = np.asarray(path, dtype=float)

        if path.ndim != 1:
            raise ValueError(f'path must be 1-dimensional, got {path.ndim} dimensions')

        n_samples = path.shape[0] - self.p
        if self.p < 1 or n_samples < 1:
            raise ValueError(
                f'need p >= 1 and more than p observations, got p={self.p} and {path.shape[0]} observations'
            )

        # Row i of X is the window path[i:i + p]. The view is sized so that it
        # never reaches past the end of the buffer of ``path``.
        n = path.strides[0]
        X = np.lib.stride_tricks.as_strided(path, shape=(n_samples, self.p), strides=(n, n))
        Y = path[self.p:]

        # Save the most recent history for later usage
        # Conceptually history is a list, but we give it an extra dimension because sklearn eats matrices
        # It is copied so that later changes to ``path`` cannot alter it.
        self.history = path[-self.p:].reshape(1, -1).copy()

        self.model.fit(X, Y)

        return self

    def forecast(self, steps):
        """Forecast ``steps`` values past the end of the fitted series.

        Each prediction is appended to the history and used as an input of the
        following ones. The stored history is left untouched, so successive
        calls always start from the end of the fitted series.

        Parameters
        ----------
        steps : int
            Number of values to forecast.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(steps,)``.
        """
        history = self.history.copy()
        predictions = np.empty(steps)

        for i in range(steps):

            y_pred = self.model.predict(history)[0]
            predictions[i] = y_pred

            # Shift forward (faster than np.roll)
            history[0, :-1] = history[0, 1:]
            history[0, -1] = y_pred

        return predictions

## Local validation

In [8]:
import sklearn
from sklearn import compose
from sklearn import linear_model
from sklearn import pipeline
from sklearn import preprocessing
import tqdm

# Accumulator for the validation loop of this cell: one long-format DataFrame
# is appended per satellite and per variable.
preds = []


class Pipeline:
    """Minimal chain of transformers followed by a final estimator.

    Every step but the last one must provide ``fit_transform(X, y)`` and
    ``transform(X)``; the last one must provide ``fit(X, y)`` and
    ``predict(X)``. Meant as a lighter stand-in for the sklearn pipeline.
    """

    def __init__(self, *steps):
        self.steps = steps

    def fit(self, X, y):
        """Fit each transformer in turn, then fit the final estimator on the transformed data."""
        features = X
        for step in self.steps[:-1]:
            features = step.fit_transform(features, y)
        final = self.steps[-1]
        final.fit(features, y)
        return self

    def predict(self, X):
        """Run ``X`` through the fitted transformers and return the final estimator's prediction."""
        features = X
        for step in self.steps[:-1]:
            features = step.transform(features)
        final = self.steps[-1]
        return final.predict(features)


class StandardScaler(preprocessing.StandardScaler):
    """Barebones implementation with less overhead than sklearn.

    Only ``transform`` is overridden; fitting is inherited from sklearn.
    """

    def transform(self, X):
        """Center ``X`` with the fitted mean and divide it by the fitted scale.

        The fitted ``scale_`` is used rather than ``var_ ** .5``: sklearn
        replaces the scale of zero-variance features so that a constant column
        does not cause a division by zero.
        """
        return (X - self.mean_) / self.scale_
    
    
class LinearRegression(linear_model.LinearRegression):
    """sklearn linear regression whose ``predict`` is reduced to a bare dot product."""

    def predict(self, X):
        """Return ``X . coef_ + intercept_`` using the fitted coefficients."""
        weighted_sum = np.dot(X, self.coef_)
        return weighted_sum + self.intercept_

    
# Auto-regression on the 48 previous values: standardise the window, then fit a linear regression.
model = ARModel(p=48, model=Pipeline(StandardScaler(), LinearRegression()))

train = data.query('is_train')

# Rows with 'percent' below 0.6 are used for fitting; the remaining rows are held out.
train_flagged = train.assign(is_fit=train.eval('percent < .6'))

for sat, g in tqdm.tqdm(train_flagged.groupby('sat_id'), position=0):

    fit, val = g.query('is_fit'), g.query('not is_fit')

    # One independent model per variable; the same `model` object is refitted each time.
    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        model.fit(fit[var].to_numpy())
        pred = model.forecast(len(val))

        preds.append(
            pd.DataFrame({
                'sat_id': sat,
                'epoch': val['epoch'],
                'y_true': val[var],
                'y_pred': pred,
                'variable': var
            })
        )

preds = pd.concat(preds)

100%|██████████| 225/225 [00:24<00:00, 13.89it/s]


In [9]:
# SMAPE of every (satellite, variable) validation series.
smapes = (
    preds
    .groupby(['sat_id', 'variable'])
    .apply(lambda grp: smape(grp['y_true'], grp['y_pred']))
)
mean_smape = smapes.mean()

# Score on a 0-100 scale, where 100 corresponds to a mean SMAPE of zero.
100 * (1 - mean_smape)

95.49000249924914

Save the validation SMAPEs for further comparison and blending with other methods.

In [10]:
# smapes.rename('smape').to_csv('results/ar_val_scores.csv', header=True)
# !head results/ar_val_scores.csv

Use the estimated positions to predict the speed.

In [11]:
def _unmelt(long_df, values):
    """Pivot long-format rows into one column per variable, satellite by satellite."""
    return (
        long_df
        .groupby('sat_id')
        .apply(lambda grp: grp.pivot_table(index=['epoch'], columns='variable', values=values))
        .reset_index()
    )


# Ground truth first, because `preds` is overwritten by its own wide version right after.
truth_val = _unmelt(preds, 'y_true')
preds = _unmelt(preds, 'y_pred')
preds.head()

variable,sat_id,epoch,Vx,Vy,Vz,x,y,z
0,0,2014-01-19 14:21:39.739,-1.803148,6.961875,-0.007593,8117.091074,1505.94912,-8.346646
1,0,2014-01-19 14:27:27.252,-3.752581,6.278008,-0.005178,7147.706091,3824.592547,-10.585737
2,0,2014-01-19 14:33:14.765,-5.446056,4.991029,-0.002184,5538.620135,5799.043349,-11.877244
3,0,2014-01-19 14:39:02.278,-6.66592,3.214951,0.001075,3417.685281,7236.455748,-12.071341
4,0,2014-01-19 14:44:49.791,-7.261348,1.155668,0.004216,978.403977,8000.542636,-11.141814


In [12]:
speed_preds = preds.copy()

# Seconds between each observation and the NEXT observation of the same satellite.
# The position difference below is a forward difference (x[i+1] - x[i]), so it has to be
# divided by the matching forward time difference (t[i+1] - t[i]).
next_epoch = preds.groupby('sat_id')['epoch'].shift(-1)
time_delta = (next_epoch - preds['epoch']).dt.total_seconds()
# The last observation of a satellite has no successor: reuse the previous interval.
# Filling is done per satellite so that values never leak from one satellite to another.
preds["time_delta"] = time_delta.groupby(preds['sat_id']).ffill()

for var in ('x', 'y', 'z'):
    # Forward difference of the position inside each satellite; the last row reuses the previous step.
    step = preds.groupby('sat_id')[var].shift(-1) - preds[var]
    step = step.groupby(preds['sat_id']).ffill()
    speed_preds[f"V{var}"] = step / preds['time_delta']

speed_preds.head()

variable,sat_id,epoch,Vx,Vy,Vz,x,y,z
0,0,2014-01-19 14:21:39.739,-2.789493,6.672106,-0.006443,8117.091074,1505.94912,-8.346646
1,0,2014-01-19 14:27:27.252,-4.63029,5.68166,-0.003716,7147.706091,3824.592547,-10.585737
2,0,2014-01-19 14:33:14.765,-6.103181,4.136284,-0.000559,5538.620135,5799.043349,-11.877244
3,0,2014-01-19 14:39:02.278,-7.019252,2.198729,0.002675,3417.685281,7236.455748,-12.071341
4,0,2014-01-19 14:44:49.791,-7.279612,0.101019,0.005605,978.403977,8000.542636,-11.141814


In [13]:
# One single-row frame per (satellite, velocity component), concatenated at the end.
speed_scores = []
for sat, g in speed_preds.groupby('sat_id'):
    truth = truth_val[truth_val['sat_id'] == sat]
    for var in ('Vx', 'Vy', 'Vz'):
        # Compared by position: both frames are expected to list the epochs in the same order.
        score = smape(truth[var].values, g[var].values)
        speed_scores.append(
            pd.DataFrame({'sat_id': [sat], 'variable': [var], 'smape': [score]})
        )
list_smape = pd.concat(speed_scores)
list_smape.head()

Unnamed: 0,sat_id,variable,smape
0,0,Vx,0.155504
0,0,Vy,0.156138
0,0,Vz,0.223534
0,1,Vx,0.155355
0,1,Vy,0.155419


In [14]:
# list_smape.to_csv('results/ar_speed_val_scores.csv', header=True, index=False)
# !head results/ar_speed_val_scores.csv

## Track 1

Make predictions for the test set.

In [15]:
preds = []

train_sats = data.query('is_train and is_track_1')
test_sats = data.query('not is_train and is_track_1')

for sat in tqdm.tqdm(test_sats['sat_id'].unique(), position=0):

    train = train_sats[train_sats['sat_id'] == sat]
    test = test_sats[test_sats['sat_id'] == sat]

    for var in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

        # Refit on the satellite's full training series, then forecast one value per test row.
        model.fit(train[var].to_numpy())
        pred = model.forecast(len(test))

        # Long format: one row per test observation, tagged with the forecasted variable.
        preds.append(
            test[['sat_id', 'id', 'epoch']].assign(y_pred=pred, variable=var)
        )

preds = pd.concat(preds)
preds.head()

100%|██████████| 225/225 [01:09<00:00,  3.23it/s]


Unnamed: 0,sat_id,id,epoch,y_pred,variable
0,0,7708,2014-02-01 00:03:49.583,3185.223167,x
1,0,7709,2014-02-01 00:09:37.096,5068.188573,x
2,0,7710,2014-02-01 00:15:24.609,6682.304208,x
3,0,7711,2014-02-01 00:21:12.122,7929.525066,x
4,0,7712,2014-02-01 00:26:59.635,8719.965909,x


In [16]:
# Number of rows of the melted predictions (one row per test observation and per variable).
len(preds)

2555820

The predictions are melted, so we unmelt them.

In [17]:
# Back to a wide layout: one row per (sat_id, id, epoch) and one column per variable.
preds = (
    preds
    .groupby('sat_id')
    .apply(lambda grp: grp.pivot_table(index=['id', 'epoch'], columns='variable', values='y_pred'))
    .reset_index()
)
preds.head()

variable,sat_id,id,epoch,Vx,Vy,Vz,x,y,z
0,0,7708,2014-02-01 00:03:49.583,5.707164,2.095878,-0.012979,3185.223167,-9207.744328,7.432801
1,0,7709,2014-02-01 00:09:37.096,5.080057,3.420768,-0.013756,5068.188573,-8245.736243,2.767372
2,0,7710,2014-02-01 00:15:24.609,4.162317,4.597993,-0.013815,6682.304208,-6847.304887,-2.044874
3,0,7711,2014-02-01 00:21:12.122,2.972323,5.567031,-0.013102,7929.525066,-5074.098038,-6.744811
4,0,7712,2014-02-01 00:26:59.635,1.539694,6.260491,-0.011573,8719.965909,-3009.968876,-11.056034


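Take into account the shifts: wherever a test epoch is less than 60 seconds after the previous one, the predictions from that row onward are pushed down by one row, and the gap this creates is forward-filled.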

In [18]:
correct_preds = []

cols_to_shift = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

for _, sat_preds in tqdm.tqdm(preds.groupby('sat_id'), position=0):

    shifted = sat_preds.copy()

    # Rows that come less than 60 seconds after the previous row of the same satellite.
    too_close = shifted['epoch'].diff() < dt.timedelta(seconds=60)

    # From each such row onward, move the predictions one row down; the row itself becomes NaN.
    for i in shifted.index[too_close.to_numpy()]:
        shifted.loc[i:, cols_to_shift] = shifted.loc[i:, cols_to_shift].shift()

    # Fill every gap with the last prediction that precedes it.
    shifted[cols_to_shift] = shifted[cols_to_shift].ffill()

    correct_preds.append(shifted)

correct_preds = pd.concat(correct_preds)

100%|██████████| 225/225 [00:01<00:00, 172.91it/s]


In [19]:
# Preview of the predictions after the shift correction.
correct_preds.head()

variable,sat_id,id,epoch,Vx,Vy,Vz,x,y,z
0,0,7708,2014-02-01 00:03:49.583,5.707164,2.095878,-0.012979,3185.223167,-9207.744328,7.432801
1,0,7709,2014-02-01 00:09:37.096,5.080057,3.420768,-0.013756,5068.188573,-8245.736243,2.767372
2,0,7710,2014-02-01 00:15:24.609,4.162317,4.597993,-0.013815,6682.304208,-6847.304887,-2.044874
3,0,7711,2014-02-01 00:21:12.122,2.972323,5.567031,-0.013102,7929.525066,-5074.098038,-6.744811
4,0,7712,2014-02-01 00:26:59.635,1.539694,6.260491,-0.011573,8719.965909,-3009.968876,-11.056034


In [20]:
# Sanity check: number of rows that repeat an already seen (sat_id, id) pair.
correct_preds.duplicated(['sat_id', 'id']).sum()

0

Save the predictions for track 1. 

In [21]:
# Only the identifier and the six predicted state variables are exported, without the index.
submission_cols = ['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']
correct_preds.loc[:, submission_cols].to_csv(PATH_SAVE, index=False)


# ––––––––––––––––––

In [22]:
# speed_preds = correct_preds.copy()

# #Get the time difference between two observations in seconds
# correct_preds["time_delta"] = correct_preds.groupby('sat_id').epoch.diff().bfill()
# correct_preds["time_delta"] = correct_preds['time_delta'].apply(lambda t: t.total_seconds())

# for var in ('x', 'y', 'z'):
#     speed_preds[f"V{var}"] = correct_preds.groupby('sat_id')[var].diff().shift(-1).ffill() / correct_preds.time_delta

# speed_preds.head()

In [23]:
# speed_preds[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv('results/ar_speed_track_1.csv', index=False)
# !head -5 results/ar_speed_track_1.csv

## Track 2

In [24]:
# import copy

# models = {}

# train_sats = data.query('is_train and not is_track_1')  # is_track_2 = not is_track_1 

# for sat, g in tqdm.tqdm(train_sats.groupby('sat_id'), position=0):
    
#     models[sat] = {}

#     for col in ('x', 'y', 'z', 'Vx', 'Vy', 'Vz'):

#         path = g[col].to_numpy()
#         model.fit(path)
#         models[sat][col] = copy.deepcopy(model)

Save the models and the histories.

In [25]:
# import joblib

# joblib.dump(models, 'track_2/ar_models.pkl')
# !du -h track_2/ar_models.pkl