In [138]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.interpolate import splrep, splev
import datetime as dt
import matplotlib.pyplot as plt
from fbprophet import Prophet
from scipy.interpolate import InterpolatedUnivariateSpline
%matplotlib inline
plt.rcParams.update({'font.size': 18})
import utils
import json
from LinearAlignment import LinearAlignment
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn_pandas import DataFrameMapper
from lightgbm import LGBMRegressor
from numpy import random


Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.3) compiler.
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.



In [2]:
def smape(satellite_predicted_values, satellite_true_values):
    # the division, addition and subtraction are pointwise

    return np.mean(np.abs((satellite_predicted_values - satellite_true_values)/
                (np.abs(satellite_predicted_values) + np.abs(satellite_true_values))))


def drop_close(t, x, eps=10**9):
    '''
    t = time array, x = data array, eps is in nanoseconds
    Returns entries in t,x with corresponding consecutive times > eps
    
    '''
    t = np.array(t) #if not already np array, convert
    x = np.array(x) #if t or x are pandas Series, will have dimension mismatch
    far = np.concatenate([(t[1:] - t[:-1]) > eps, [True]])
    return t[far], x[far]


def resample(t, x, step=10 * 10**9, t_new=None, return_t=False):
    '''
    t: time array (or series); 
    x: data array (or series); 
    t_new: new time scale from start to end of t with step size step;
    step: = 10 seconds by default; 
    return_t: by default, do not return resampled times
    
    resample time series or array by 10 (default) sec intervals and 
    return new time series (if t_new=True) and spline approximation series for data
    
    
    '''
    t, x = drop_close(t, x)
    if t_new is None:
        t_new = np.arange(t[0], t[-1], step)
    try:
        spl = splrep(t, x)
        x_new = splev(t_new, spl)
    except:
        raise ValueError(f'interpolation error, x length = {len(x)}, \
        t_new length = {len(t_new)}')

    return (t_new, x_new) if return_t else x_new


def get_peaks(array):
    '''
    returns index of "sharp" peaks, excluding first and last values of array
    
    index of "smooth peaks", e.g. 1 2 3 9 9 3 2 1, is not returned
    '''
    return np.where((array[1:-1] > array[2:]) & (array[1:-1] > array[:-2]))[0] + 1


def get_satellite_data(data, sat_id):
    '''
    returns all data for particular satellite by id
    '''
    return data[data['sat_id'] == sat_id]

In [3]:
def insert_previous_and_shift(df,col_name,ind):
    '''
    input a data frame (df), column name (col_name), and index (ind)
    insert previous value of df[col_name] at index and shift the rest 
    of df[col_name] from ind by +1;
    This is used for remove_time_jumps_fast
    '''
    shifted_series = df[col_name].shift(1)
    df[col_name].iloc[ind] = df[col_name].iloc[ind-1]
    df[col_name].iloc[ind+1:] = shifted_series.iloc[ind+1:]
    return df

In [4]:
def remove_time_jumps_fast(data, features_list=
                           ('x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'),
                           threshold = 0.000001):
    #time_threshold 0.00003 sufficient for test and train
    #time_threshold 0.00002 will throw errors
    '''
    removes time jumps in the simulation for a single satellite
    for train and test data, sufficient to set time_threshold at default
    s_data = satellite data
    the features are replaced by the correction
    note that threshold here is not the same as in remove_time_jumps
    '''
    epoch_ind = data.columns.get_loc('epoch')
    data['t'] = ((pd.to_datetime(data['epoch']) - pd.to_datetime(data.iloc[0,epoch_ind])) /
                               np.timedelta64(1, 'D')).astype(float)
    data['dt'] = data['t'].diff(1)

    index_for_correction = data[data['dt'] < threshold].index 
    #print(index_for_correction)
    if list(index_for_correction): #if non empty
        for feature in features_list:
            for i in index_for_correction:
                j = data.index.get_loc(i)
                data = insert_previous_and_shift(data,feature,j)
    return data

In [5]:
def stretch_simulated_feats(data, stretch,true_feats = 'position',
                           suffix="_stretch"):
    '''
    stretch time scale for simulation to better match true data,
    for a single sattelite;
    use predetermined stretch coefficient (depends on the satellite)
    
    '''
    if true_feats == 'position':
        true_feats_list = ['x','y','z']
    elif true_feats == 'velocity':
        true_feats_list = ['Vx','Vy','Vz']
    elif true_feats == 'all':
        true_feats_list = ['x','y','z'] + ['Vx','Vy','Vz']
    else:
        true_feats_list = true_feats
    
    for feature in true_feats_list:
        spl = splrep(stretch*data['t'],data[feature+'_sim'])
        test_stretch = splev(data['t'], spl) #np array
        data[feature+suffix] = test_stretch
    
    return data

def amp_sim_feats(data, amp_stretch,feats = ['Vx_sim','Vy_sim','Vz_sim'],
                           suffix="_stretch_amp"):
    '''
    vary amplitude for simulation to better match true data,
    for a single sattelite;
    use predetermined amp_stretch coefficient (depends on the satellite)
    
    '''
    if feats == 'position':
        feats_list = ['x_sim','y_sim','z_sim']
    elif feats == 'velocity':
        feats_list = ['Vx_sim','Vy_sim','Vz_sim']
    elif feats == 'all':
        feats_list = ['x_sim','y_sim','z_sim'] + ['Vx_sim','Vy_sim','Vz_sim']
    else:
        feats_list = feats
        
    for feature in feats_list:
        data[feature+suffix] = amp_stretch*data[feature]
    
    return data

In [6]:
def train_test_split_X(df,col='sat_id',ratio = 0.8,discard = 0):
    '''
    train test split for our train data only, no targets, 
    default, 80% train for each satellite, 20% test
    '''
    sat_list = df[col].unique()
    X_train = pd.DataFrame([])
    X_test = pd.DataFrame([])
    for sat in sat_list:
        sat_df = get_satellite_data(df,sat)
        m = int(discard*sat_df.shape[0])
        n = int(ratio*sat_df.shape[0])
        X_train = X_train.append(sat_df[m:n])
        X_test = X_test.append(sat_df[n:])
    return X_train, X_test

In [7]:
target_list = ['x','y','z','Vx','Vy','Vz']
feature_list = [t+'_sim' for t in target_list]

Load data

In [114]:
train = pd.read_csv('data/train.csv', index_col = 'id')
train['time'] = train['epoch']
train['epoch'] = pd.to_datetime(train['epoch']).values.astype(float)
test = pd.read_csv('data/test.csv', index_col = 'id')
test['time'] = test['epoch']
test['epoch'] = pd.to_datetime(test['epoch']).values.astype(float)

In [115]:
print('train:',train.shape)

train: (649912, 15)


Track 1 data

In [10]:
sat_t1 = test['sat_id'].unique()
train_t1 = train[train['sat_id'].isin(sat_t1)]
train_test = pd.concat([train,test],sort=False)

Load transformed data, from train (January) and test (February) data sets

In [46]:
train_tf = pd.read_csv('train_transformed.csv')
train_tf.drop('id',axis=1,inplace=True)
train_tf.columns

Index(['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim', 'sat_id',
       'epoch'],
      dtype='object')

In [89]:
test_tf = pd.read_csv('submission_2020-02-03_12-12-21.csv',index_col='id')
test_epoch = pd.read_csv('data/test.csv',index_col='id')['epoch']
test_sat_id = pd.read_csv('data/test.csv',index_col='id')['sat_id']
test_tf = pd.concat([test_sat_id,test_tf,test_epoch],axis=1)
test_tf['epoch'] = pd.to_datetime(test_tf['epoch']).values.astype(float)

In [90]:
rename_dict = {c:c_n for (c,c_n) in list(zip(target_list,feature_list))}
test_tf = test_tf.rename(rename_dict,axis=1)

In [91]:
start = train.iloc[0,train.columns.get_loc('epoch')]
day = (train['epoch'].max() - train['epoch'].min())/31

Feature engineering

In [92]:
train_tf = train_tf[test_tf.columns]
df = pd.concat([train_tf,test_tf])

In [139]:
pd.options.mode.chained_assignment = None

In [140]:
features = ['epoch', 'sat_id','x_sim', 'y_sim',
       'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim', 'time']

In [141]:
train_tf = train_tf[test_tf.columns]
df = pd.concat([train_tf,test_tf])
df['r'] = df['x_sim']**2 + df['y_sim']**2 + df['z_sim']**2
df['Vr'] = df['Vx_sim']**2 + df['Vy_sim']**2 + df['Vz_sim']**2
df['r_ratio_Vr'] = df['r']/df['Vr']
df['epoch'] = (df['epoch']-start)/day
for col in df.columns:
    if col not in ['sat_id','time']:
        df[col+'_d1'] = df[col].diff(1)
    if col not in ['sat_id','epoch','time']:
        df[col+'_d2'] = df[col].diff(2)
        df[col+'_d3'] = df[col].diff(3)
        df[col+'_d4'] = df[col].diff(4)
        df[col+'_d5'] = df[col].diff(5)
        df[col+'_shift1'] = df[col].shift(1)
        for i in range(2,6):
            df[col+'_shift{}'.format(i)] = df[col+'_shift{}'.format(i-1)]
df = df.fillna(0)

In [142]:
df.shape

(599254, 102)

In [143]:
train_fe = df[:train_t1.shape[0]]
test_fe = df[train_t1.shape[0]:]

In [144]:
test_fe.shape

(284071, 102)

Train test split: Discard first 60% of data, then 50/50

In [145]:
X_train = pd.DataFrame([])
X_test = pd.DataFrame([])
for sat in train_fe['sat_id'].unique():
    train,test = train_test_split_X(get_satellite_data(train_fe,sat),ratio = 0.8,discard=0.6)
    X_train = X_train.append(train)
    X_test = X_test.append(test)

In [146]:
print(X_train.shape,X_test.shape)

(63040, 102) (63158, 102)


In [147]:
train_t1.columns

Index(['epoch', 'sat_id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz', 'x_sim', 'y_sim',
       'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim', 'time'],
      dtype='object')

In [148]:
y_train = pd.DataFrame([])
y_test = pd.DataFrame([])
for sat in train_fe['sat_id'].unique():
    train,test = train_test_split_X(get_satellite_data(train_t1,sat),ratio = 0.8, discard=0.6)
    y_train = y_train.append(train)
    y_test = y_test.append(test)

In [149]:
y_train = y_train[target_list]

In [150]:
print(y_train.shape,y_test.shape)

(63040, 6) (63158, 15)


Random Forest

In [151]:
%%time
rf = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=50)
rf.fit(X_train.fillna(0), y_train['Vx'])

CPU times: user 9min 25s, sys: 2.78 s, total: 9min 27s
Wall time: 2min 53s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=50, verbose=0, warm_start=False)

In [152]:
pred = rf.predict(X_test)
print('smape for RF, Vx:',smape(pred,y_test['Vx']))
print('smape for ground truth and simluation, Vx:',smape(X_test['Vx_sim'],y_test['Vx']))

smape for RF, Vx: 0.07736737939508413
smape for ground truth and simluation, Vx: 0.6774338934855161


In [None]:
dfs_train = [] #training data simulation 
for sat in sat_t1:
    dfs_train.append(get_satellite_data(train_transformed,sat))
dfs_test = [] #test data simulation
for sat in sat_t1:
    dfs_test.append(get_satellite_data(test_transformed,sat))
dfs_target = [] #training data ground truth
for sat in sat_t1: 
    dfs_target.append(get_satellite_data(data[target_list +['sat_id']],sat))