In [1]:
# Input CSV files read by the loading cell.
PATH_TEST = "../../data/test_LPC_RP.csv"
PATH_TRAIN = "../../data/train_LPC_RP.csv"
# Output CSV file written by the final cells.
PATH_SAVE = "../../data/sub_LPC_RP_veni_vidi_vici.csv"

In [2]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Keep the current seaborn colour palette in `color` and switch plots to the 'darkgrid' style.
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Replace the module-level function so calls to ``warnings.warn`` become silent.
warnings.warn = ignore_warn


from scipy import stats
from scipy.stats import norm, skew


# Display settings: cap the printed table size and show floats with three decimals.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.3f}'.format)

In [3]:
# Load the training and test tables from the configured CSV paths.
train, test = (pd.read_csv(csv_path) for csv_path in (PATH_TRAIN, PATH_TEST))

In [4]:
# For both tables: parse 'epoch' into datetimes, then derive a monthly period
# column ('year_month') for later aggregations.
for frame in (train, test):
    frame['epoch'] = pd.to_datetime(frame['epoch'])
    frame['year_month'] = frame['epoch'].dt.to_period('M')

In [5]:
from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    """Mean of |a - b| / (|a| + |b|) over paired elements of the two arrays.

    Pairs where both values are zero contribute nothing to the sum but are
    still counted in the denominator (the total number of elements).
    The result is not scaled by 2 or by 100.
    """
    n_samples = y_true.shape[0]
    total = 0
    for idx in range(n_samples):
        actual = y_true[idx]
        forecast = y_pred[idx]
        denom = math.fabs(actual) + math.fabs(forecast)
        if denom != 0:
            total += math.fabs(actual - forecast) / denom
    total /= n_samples
    return total


def lgbm_smape(preds, train_data):
    '''
    Custom evaluation function for LGBM.

    Reads the labels from ``train_data.get_label()`` and returns the tuple
    ``('SMAPE', value, False)`` where ``value`` is ``smape_fast(preds, labels)``.
    '''
    return 'SMAPE', smape_fast(preds, train_data.get_label()), False

In [6]:
def fit_sin(tt, yy):
    '''Fit ``A * sin(w*t + p) + c`` to a time sequence.

    Parameters
    ----------
    tt : array-like
        Sample times; assumed to be uniformly spaced (the spacing is taken
        from ``tt[1] - tt[0]``), so at least two samples are required.
    yy : array-like
        Sample values, same length as ``tt``.

    Returns
    -------
    dict
        Keys "amp", "omega", "phase", "offset", "freq", "period", "fitfunc"
        (a callable evaluating the fitted sinusoid), "maxcov" (largest entry of
        the covariance matrix) and "rawres" (``(guess, popt, pcov)``).
    '''
    # Imported here, and `np` used instead of `numpy`, so the function does not
    # depend on `numpy` / `scipy` names that are only bound by a later cell.
    from scipy.optimize import curve_fit

    tt = np.array(tt)
    yy = np.array(yy)

    # Initial frequency guess: strongest FFT bin, skipping the zero-frequency
    # "peak", which is related to the offset rather than the oscillation.
    ff = np.fft.fftfreq(len(tt), (tt[1] - tt[0]))   # assume uniform spacing
    Fyy = abs(np.fft.fft(yy))
    guess_freq = abs(ff[np.argmax(Fyy[1:]) + 1])
    guess_amp = np.std(yy) * 2.**0.5
    guess_offset = np.mean(yy)
    guess = np.array([guess_amp, 2. * np.pi * guess_freq, 0., guess_offset])

    def sinfunc(t, A, w, p, c):
        return A * np.sin(w * t + p) + c

    popt, pcov = curve_fit(sinfunc, tt, yy, p0=guess, maxfev=int(1e6))
    A, w, p, c = popt
    f = w / (2. * np.pi)
    fitfunc = lambda t: A * np.sin(w * t + p) + c
    return {"amp": A, "omega": w, "phase": p, "offset": c, "freq": f, "period": 1. / f,
            "fitfunc": fitfunc, "maxcov": np.max(pcov), "rawres": (guess, popt, pcov)}




In [7]:
# Model inputs: the six '<name>_sim' columns followed by the 'ind' column.
_coordinate_names = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
features = [f'{name}_sim' for name in _coordinate_names] + ['ind']

In [8]:
from sklearn.metrics import mean_squared_error as mse
from scipy.optimize import curve_fit

def func(x, a, b, c, z):
    """Exponential decay ``a * exp(-b * x) + c``; the ``z`` argument is accepted but not used."""
    decay = np.exp(-b * x)
    return a * decay + c


def sine_prediction_single_point(tmp, trn_idx, vl_idx, target, shift=False, single=True,
                                prefit=True, test=None, test_ind={}, 
                                find_ind=True, shift_after_ind_split=True):
    '''
    Predict `target` for hold-out rows by extrapolating sinusoids fitted with `fit_sin`.

    Besides its arguments the function reads the module-level names `features`
    (columns used to build the train/validation frames), `period` (row offset
    used for differencing), `numpy` and `fit_sin`.

    Parameters
    ----------
    tmp : DataFrame holding the `features` columns and the `target` column.
    trn_idx, vl_idx : index labels of the training / validation rows of `tmp`;
        only used when `test` is None.
    target : name of the column to predict.
    shift : if True, the training target is replaced by the difference between
        the value `period` rows earlier and the current value; predictions are
        turned back into levels in `find_actual_pred`.
    single : if True, a separate model is fitted for each value of `ind % 24`
        (24 groups); otherwise one model is fitted on all rows.
    prefit : if True, a sinusoid fitted on the first half of the training
        target is subtracted before modelling and added back at the end.
    test : optional DataFrame; when given, all of `tmp` is used for training
        and predictions are produced for the rows of `test`.
    test_ind : not used by the active code (only by commented-out code).
    find_ind : if True, the `ind` column is recomputed from detected local
        maxima of the `target` column. This reads `target` from the feature
        frame, so `target` has to be one of the `features` columns.
    shift_after_ind_split : if True, each fitted series is differenced against
        its previous element inside `model_pred` and the predictions are
        accumulated back afterwards.

    Returns
    -------
    (vl_pred, best_ind) : the array of predictions for the hold-out rows and a
        dict mapping the group number to the string 'line' for groups fitted
        while `test` is None (or fitted with fewer than 5 training rows).
    '''
    
    
    best_ind = {}
    # Test mode: train on every row of tmp, no ground truth for the hold-out rows.
    if test is not None:
        tr_x, tr_y = tmp[features], tmp[target] 
        vl_x, vl_y = test[features], None
    else:
        tr_x, tr_y = tmp[features].loc[trn_idx], tmp.loc[trn_idx][target] 
        vl_x, vl_y = tmp[features].loc[vl_idx], tmp.loc[vl_idx][target]
    
    preffitted = None
    if prefit:
        # Fit one sinusoid on the first half of the training target, evaluate it
        # over train + hold-out positions and subtract it from the targets.
        tt = numpy.linspace(0, len(tr_x) // 2 - 1, len(tr_x) // 2)
        tt2 = numpy.linspace(0, len(tr_x) + len(vl_x) - 1, len(tr_x) + len(vl_x))

        res = fit_sin(tt, tr_y[:len(tt)])
        preffitted = res["fitfunc"](tt2)
        tr_y -= preffitted[:len(tr_y)]
        if test is None:
            vl_y -= preffitted[-len(vl_y):]
            
    if find_ind:
        # Flag rows whose target equals the maximum over the label window
        # [i - 20, i + 20]; the last 20 index labels are not examined.
        tr_x[f'{target}_max'] = False
        vl_x[f'{target}_max'] = False
        for i in tr_x.index[:-20]:
            if tr_x.loc[i][target] == np.max(tr_x.loc[i - 20: i + 20][target]):
                tr_x.loc[i, f'{target}_max'] = True
                
        # Second-to-last entry of the value counts of (index % 24) over the flagged rows.
        const_repeat = (tr_x[tr_x[f'{target}_max']].index % 24).value_counts().values[-2]
        last_true_index = tr_x[tr_x[f'{target}_max']].index[-1]
        indexes_to_fill = []
        # Number of flagged rows sharing the last flagged row's (index % 24).
        curr_count = (tr_x[tr_x[f'{target}_max']].index % 24).value_counts()[last_true_index % 24]
        curr_ind = last_true_index
        #print((tr_x[tr_x[f'{target}_max']].index % 24).values)
        # Difference in (index % 24) between the flagged row curr_count + 1
        # positions from the end and the last flagged row.
        diff = tr_x[tr_x[f'{target}_max']].index[-curr_count -1] % 24 - curr_ind % 24
        
        # Differences larger than 2 in magnitude are replaced by a unit step of
        # opposite sign; the result is then negated.
        if diff > 2:
            diff = -1
        elif diff < -2:
            diff = 1
        diff *= -1
        
        const_repeat = max(1, const_repeat)
        const_repeated = 0
        # Project further flagged rows every 24 positions, adding `diff` whenever
        # curr_count has reached const_repeat.
        for i in range(max(len(vl_x) // 10, 3)):
            curr_ind += 24
            if curr_count >= const_repeat:
                curr_ind += diff
                curr_count = 0
            
            if curr_ind in tr_x.index:
 
                tr_x.loc[curr_ind, f'{target}_max'] = True
            else:
                vl_x.loc[curr_ind, f'{target}_max'] = True
                
            curr_count += 1
            # NOTE(review): const_repeated is never incremented, so this branch
            # is never taken with the code as written.
            if const_repeated > max(5, const_repeat):
                    const_repeated = 0
                    const_repeat = max(1, const_repeat - 1)
                
        
        tr_x['ind'] = None
        vl_x['ind'] = None
       
        # Each row gets the smallest offset i (mod 24, looking at most 47 rows
        # ahead) such that the row i labels later is flagged; rows with no
        # flagged row in range keep None.
        for i in range(48):
            for ind in tr_x[tr_x[f'{target}_max']].index - i:
                if ind in tr_x.index and tr_x.loc[ind, 'ind'] is None:
                    tr_x.loc[ind, 'ind'] = i % 24
            for ind in vl_x[vl_x[f'{target}_max']].index - i:
                if ind in vl_x.index and vl_x.loc[ind, 'ind'] is None:
                    vl_x.loc[ind, 'ind'] = i % 24

        
    # Last `period` training values (after any prefit subtraction, before any
    # differencing); they seed the reconstruction in find_actual_pred.
    st_point = tr_y[-period:].values.tolist()
    if shift:
        # Difference against the value `period` rows earlier (the +1 terms cancel).
        tr_y = (tr_y.shift(period) + 1) - (tr_y + 1)
        if test is None:
            vl_y = ((tmp[target].shift(period) + 1) - (tmp[target] + 1)).loc[vl_idx]
    
    # The first `period` training rows are dropped.
    tr_x_all, tr_y_all = tr_x[period:].copy(), tr_y[period:].copy()
    vl_x_all = vl_x.copy()
    vl_y_all = None
    if test is None:
        vl_y_all = vl_y.copy()

    def model_pred(tr_x, tr_y, vl_x, vl_y, i=0):
        '''Fit one series and write its predictions into vl_x_all['pred'] for the rows of vl_x.'''
        if len(vl_x) == 0:
            return
        
        tr_y_last = 0
        if shift_after_ind_split:
            # Model the difference between consecutive elements of this series.
            tr_y_last = tr_y.values[-1]
            tr_y = (tr_y.shift() - tr_y)[1:]
            tr_x = tr_x[1:]
            if test is None:
                vl_y = (np.append([tr_y_last], vl_y.values)[:-1] - vl_y)
            
            
        # Plotting is disabled; the `if visualise` branches below do not run.
        visualise = False
        if visualise: 
            tr_y.plot()
            vl_y.plot()
    
        vl_pred = None
        if len(tr_x) < 5:  
            # Too few rows for a sine fit: use the mean, or a straight-line
            # extrapolation when there is more than one training value.
            pred = np.mean(tr_y)
            if len(tr_y) > 1:
                pred = np.poly1d(np.polyfit(tr_x.reset_index().index, tr_y, deg=1))(vl_x.reset_index().index + len(tr_y))
            
            vl_pred = pred
            best_ind[i] = 'line'

        else:
            
            N = len(tmp) # number of data points
            # Fit a sinusoid over positions 0..len(tr_x)-1 and evaluate it on the
            # positions that follow; despite its name, vl_pred_line is this sine
            # extrapolation.
            tt = numpy.linspace(0, len(tr_x) - 1, len(tr_x))
            tt2 = numpy.linspace(0, len(tr_x) + len(vl_x) - 1, len(tr_x) + len(vl_x))
            res = fit_sin(tt, tr_y)
            vl_pred_line  =  res["fitfunc"](tt2)[-len(vl_x):]
            
#             from sklearn import linear_model
#             clf = linear_model.BayesianRidge(alpha_init=0.001, lambda_init=0.00001)
#             clf.fit(tt.reshape(-1, 1), tr_y)
#             vl_pred_line = clf.predict(tt2.reshape(-1, 1))[-len(vl_x):]
            
#             if single == False:
#                 res = fit_sin(tt, tr_y)
#                 vl_pred_line  =  res["fitfunc"](tt2)[-len(vl_x):]
#             else:
                
#                 #popt, pcov = curve_fit(func, tt, tr_y, maxfev=1000000)
#                 #vl_pred_line = func(tt2, *popt)[-len(vl_x):]
#         #         vl_pred_sine  =  res["fitfunc"](tt2)[-len(vl_x):]
#                 vl_pred_line = np.poly1d(np.polyfit(tr_x.reset_index().index, tr_y, deg=1))(vl_x.reset_index().index + len(tr_y))
            if test is None:
    #             if mse(vl_y, vl_pred_sine) < mse(vl_y, vl_pred_line):
    #                 vl_pred = vl_pred_sine
    #                 best_ind[i] = 'sine'
    #             else:
                vl_pred = vl_pred_line
                best_ind[i] = 'line'
            else:
    #             if test_ind[i] == 'sine':
    #                 vl_pred = vl_pred_sine
    #             else:
                vl_pred = vl_pred_line
        if visualise:
            vl_x.loc[vl_x.index, 'pred'] = vl_pred
            vl_x['pred'].plot()
            plt.show()

        if shift_after_ind_split:
            # Undo the consecutive differencing: each level is the previous level
            # minus the predicted difference, starting from the last training value.
            vl_pred[0] = tr_y_last - vl_pred[0]
            for i in range(1, len(vl_pred)):
                vl_pred[i] = vl_pred[i - 1] - vl_pred[i]
         
        vl_x_all.loc[vl_x.index, 'pred'] = vl_pred
            
    if single:
        # One model per value of ind % 24.
        for period_i in range(24):
            tr_x = tr_x_all[features][tr_x_all.ind % 24 == period_i].copy()
            tr_y = tr_y_all[tr_x_all.ind % 24 == period_i].copy()
            vl_x = vl_x_all[features][vl_x_all.ind % 24 == period_i].copy()
            vl_y = None
            if test is None:
                vl_y = vl_y_all[vl_x_all.ind % 24 == period_i].copy()
            model_pred(tr_x, tr_y, vl_x, vl_y, period_i) 
    else:
        model_pred(tr_x_all, tr_y_all, vl_x_all, vl_y_all)
        
        
    def find_actual_pred(preds):
        '''Undo the `shift` differencing and the `prefit` subtraction on the raw predictions.'''
        if shift:
            # old_values starts as the last `period` training values and grows
            # with every reconstructed prediction, so element i is always the
            # value `period` steps before prediction i.
            old_values = list(st_point.copy())
            final_pred = []
            i = 0
            for x in preds:
                final_pred.append(old_values[i] - x)
                old_values.append(final_pred[-1])
                i += 1
        else:
            final_pred = preds


        final_pred = np.array(final_pred)
        if prefit:
            final_pred = final_pred + preffitted[-len(vl_x_all):]
        return final_pred
     
    
   

    vl_pred = find_actual_pred(vl_x_all['pred'].values)
    vl_x_all['pred'] = vl_pred
#     display(vl_x_all[['y', 'pred']])
#     vl_x_all[vl_x_all.ind % 24 == 0][['y', 'pred']].plot()
#     plt.show()
#     vl_pred = scipy.signal.savgol_filter(vl_pred,
#                            7, # window size used for filtering
#                            3)
    return vl_pred, best_ind

In [9]:
#%%time
from sklearn.model_selection import KFold, TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.kernel_ridge import KernelRidge
from autofeat import AutoFeatRegressor as AutoFeatRegression
import numpy
import scipy
import scipy.signal 
import lightgbm as lgb
import matplotlib.pyplot  as plt
# Column predicted in the validation cell below.
target = 'y'

# Time-ordered cross-validation setup and an out-of-fold buffer sized to `train`.
seed = 42
folds = 10
kf = TimeSeriesSplit(n_splits=folds)
oof = np.zeros(len(train))

def create_dataset(dataset, y, look_back=5):
    """Cut `dataset` into sliding windows of `look_back` consecutive rows.

    `dataset` is indexed as a 2-D array (``window[:, -1]``). For every window
    start ``i`` the function collects the window ``dataset[i:i + look_back]``,
    the mean of the window's last column, and the label ``y[i + look_back]``.

    Returns the three collections as numpy arrays: (windows, means, labels).
    """
    n_windows = len(dataset) - look_back
    windows = [dataset[start:start + look_back] for start in range(n_windows)]
    window_means = [np.mean(window[:, -1]) for window in windows]
    labels = [y[start + look_back] for start in range(n_windows)]
    return np.array(windows), np.array(window_means), np.array(labels)

from itertools import product

visualise = False
visualise_final = False
# Row offset used for differencing; sine_prediction_single_point reads this
# module-level name directly.
period = 24
smapes = {}    # sat_id -> best validation SMAPE
params = {}    # sat_id -> (best configuration tuple, best_ind dict)
bad_sats = []

# A row whose time stamp is at most this far after the previous row is treated
# as a duplicated sample and dropped.
max_duplicate_gap = pd.Timedelta("00:00:10.001000")

for sat in test.sat_id.unique():

    tmp = train[train.sat_id == sat].copy()
    tmp2 = tmp.copy()

    # Index labels of the duplicated samples (computed once, reused for the drop).
    indexes = list(tmp[(tmp['epoch'] - tmp['epoch'].shift()) <= max_duplicate_gap].index)

    # From the (i + 1)-th duplicate onwards the '<name>_sim' columns are taken
    # from i + 1 rows earlier, i.e. shifted down by one more row per duplicate.
    for i, ind in enumerate(indexes):
        for target_ in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
            tmp2.loc[ind:, f'{target_}_sim'] = tmp[[f'{target_}_sim']].shift(i + 1).loc[ind:]

    tmp2.drop(indexes, inplace=True)
    tmp = tmp2.copy().reset_index()

    tmp['time_diff'] = (tmp['epoch'] - tmp.iloc[0]['epoch']).dt.total_seconds() / 454 + 1
    tmp['ind'] = tmp.reset_index().index % 24
    tmp['error'] = (tmp[target + '_sim'] - tmp[target])

    # Chronological 85 % / 15 % train / validation split.
    tr_len = int(len(tmp) * 0.85)
    trn_idx, vl_idx = tmp.iloc[:tr_len].index, tmp.iloc[tr_len:].index
    vl_true = tmp.loc[vl_idx][target].values

    best_params = None
    best_smape = np.inf
    best_vl_pred = None
    best_ind = {}

    # Same iteration order as six nested loops (right-most option varies fastest).
    search_space = product(['error', target], [True, False], [True, False],
                           [False], [True, False], [False, True])
    for tgt, shift, single, prefit, find_ind, shift_after_ind_split in search_space:
        # Skip combinations that are not part of the search.
        if find_ind and (shift or not single):
            continue
        if not find_ind and shift_after_ind_split:
            continue
        if tgt == 'error' and find_ind:
            continue

        try:
            # find_ind implies shift is False here (see the first skip rule).
            vl_pred, ind_types = sine_prediction_single_point(
                tmp, trn_idx, vl_idx, tgt, shift=shift,
                single=single, prefit=prefit,
                find_ind=find_ind,
                shift_after_ind_split=shift_after_ind_split)
        except Exception:
            # Best effort: a configuration that fails is simply not a candidate.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            continue

        vl_pred = vl_pred[:len(vl_idx)]

        # When the error was modelled, convert it back to the target itself.
        if tgt == 'error':
            vl_pred = (tmp.loc[vl_idx][target + '_sim'] - vl_pred).values

        smape = smape_fast(vl_pred, vl_true)
        if smape < best_smape:
            best_smape = smape
            best_params = (tgt, shift, single, find_ind, shift_after_ind_split)
            best_vl_pred = vl_pred
            best_ind = ind_types

    if best_vl_pred is None:
        # Every configuration failed or scored NaN; stop with a clear message
        # instead of failing later inside smape_fast.
        bad_sats.append(sat)
        raise RuntimeError(f'no valid configuration found for sat_id {sat}')

    print(sat, best_params, best_smape)
    smapes[sat] = best_smape
    params[sat] = best_params, best_ind

0 ('y', True, True, False, False) 0.012594499339738252
1 ('y', True, True, False, False) 0.00016958008014867273
2 ('error', False, True, False, False) 6.240804039271772e-05
3 ('error', False, True, False, False) 4.787084979659783e-05
4 ('error', True, True, False, False) 3.44747230167951e-05
5 ('error', True, True, False, False) 3.4205895222321595e-05
6 ('error', True, True, False, False) 6.206079801776706e-05
7 ('error', True, True, False, False) 4.131723160372334e-05
8 ('error', True, True, False, False) 4.950258914077672e-05
9 ('error', True, True, False, False) 6.594880310846532e-05
10 ('error', True, True, False, False) 0.00010834602285737544
11 ('error', True, True, False, False) 0.0006594144420908885
12 ('error', True, True, False, False) 0.003002554192961089
13 ('error', True, True, False, False) 0.0012117110551532718
14 ('error', True, True, False, False) 0.0007775796226039255
15 ('y', True, True, False, False) 0.01260106846591885
16 ('y', True, True, False, False) 0.000193869

136 ('y', True, True, False, False) 0.005657070270096809
137 ('error', True, True, False, False) 0.00037846765848047876
138 ('y', True, True, False, False) 0.0008406900421895712
139 ('error', True, True, False, False) 4.278190361107792e-05
140 ('error', True, True, False, False) 4.466619583606204e-05
141 ('error', True, True, False, False) 6.821880825187591e-05
142 ('error', True, True, False, False) 0.0001344030001673872
143 ('error', True, True, False, False) 0.00047759290416846963
144 ('error', False, True, False, False) 0.0006377915250586295
145 ('error', False, True, False, False) 0.00030242034361818644
146 ('error', True, True, False, False) 0.00030398534331791587
147 ('error', False, True, False, False) 0.0003044828193297863
148 ('error', False, True, False, False) 0.0003561706661055084
149 ('error', False, True, False, False) 0.00042966939885704346
150 ('y', True, True, False, False) 0.130165153451691
151 ('y', True, True, False, False) 0.013600990738149845
152 ('error', True, 

In [10]:
# Model inputs for the test-prediction cell: the six '<name>_sim' columns,
# the 'ind' column, and the six plain coordinate columns.
_coordinate_names = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
features = [f'{name}_sim' for name in _coordinate_names] + ['ind'] + list(_coordinate_names)

In [11]:
%%time
from sklearn.model_selection import KFold, TimeSeriesSplit

import lightgbm as lgb
import matplotlib.pyplot  as plt
# Maps the id of each dropped duplicate test row to the id of the row just
# before it; a later cell uses this to copy predictions onto the dropped rows.
bad_ids = {}

# A row whose time stamp is at most this far after the previous row is treated
# as a duplicated sample and dropped.
max_duplicate_gap = pd.Timedelta("00:00:10.001000")

for target in ['y', 'x', 'z', 'Vx', 'Vy', 'Vz']:
    for sat in test.sat_id.unique():

        tmp = train[train.sat_id == sat].copy()
        tmp_test = test[test.sat_id == sat].copy()

        tr_idx = list(tmp.id)
        test_idx = list(tmp_test.id)
        # After reset_index() the 'index' column keeps each row's original label.
        tmp = pd.concat([tmp, tmp_test]).reset_index()

        tmp_comb = tmp.copy()
        # Positions of the duplicated samples (computed once, reused for the drop).
        indexes = list(tmp[(tmp['epoch'] - tmp['epoch'].shift()) <= max_duplicate_gap].index)

        for i, ind in enumerate(indexes):
            dropped_id = tmp.loc[ind]['id']
            if dropped_id in tr_idx:
                tr_idx.remove(dropped_id)
            else:
                test_idx.remove(dropped_id)
                bad_ids[dropped_id] = tmp.loc[ind - 1]['id']
            # From this duplicate onwards the '<name>_sim' columns are taken
            # from i + 1 rows earlier.
            for target_ in ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']:
                tmp_comb.loc[ind:, f'{target_}_sim'] = tmp[[f'{target_}_sim']].shift(i + 1).loc[ind:]

        tmp_comb.drop(indexes, inplace=True)

        # Training part of the cleaned series.
        tmp = tmp_comb[tmp_comb['id'].isin(tr_idx)].copy()

        tmp['time_diff'] = (tmp['epoch'] - tmp.iloc[0]['epoch']).dt.total_seconds() / 454 + 1
        tmp['time_diff'] = (tmp.time_diff % (24 * 80))
        tmp['error'] = (tmp[target + '_sim'] - tmp[target])
        tmp['ind'] = (tmp.reset_index().index % (24))

        # Test part; 'ind' continues counting from the end of the training part.
        tmp2 = tmp_comb.loc[tmp_comb['id'].isin(test_idx)].copy()
        tmp2['time_diff'] = (tmp2['epoch'] - tmp.iloc[0]['epoch']).dt.total_seconds() / 454 + 1
        tmp2['time_diff'] = (tmp2.time_diff % (24 * 80))
        tmp2['ind'] = ((tmp2.reset_index().index + tmp.reset_index().index[-1] + 1) % (24))
        test_x = tmp2.copy()

        # Configuration selected for this satellite by the validation cell.
        tgt, shift, single, find_ind, shift_after_ind_split = params[sat][0]

        # The stored configuration names either 'error' or the validation
        # target; in the latter case predict the current target directly.
        if tgt != 'error':
            tgt = target

        try:
            test_pred, _ = sine_prediction_single_point(tmp, None, None, tgt, shift=shift,
                                                        single=single, prefit=False,
                                                        find_ind=find_ind,
                                                        test=test_x, test_ind=params[sat][1],
                                                        shift_after_ind_split=shift_after_ind_split)
        except Exception:
            # Fall back to the plain variant (no index detection, no consecutive
            # differencing). KeyboardInterrupt / SystemExit are not swallowed.
            test_pred, _ = sine_prediction_single_point(tmp, None, None, tgt, shift=shift,
                                                        single=single, prefit=False,
                                                        find_ind=False, shift_after_ind_split=False,
                                                        test=test_x, test_ind=params[sat][1])

        test_pred = test_pred[:len(test_x)]

        # When the error was modelled, convert it back to the target itself.
        if tgt == 'error':
            test_pred = (test_x[target + '_sim'] - test_pred).values

        # Write the predictions into `test` at the rows' original labels.
        test.loc[tmp2['index'], target] = test_pred

CPU times: user 24min 33s, sys: 23.6 s, total: 24min 56s
Wall time: 20min 8s


In [12]:
# For every (key, vl) pair in bad_ids, copy the six predicted columns from the
# test row with id `vl` onto the test row with id `key`.
predicted_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
for key, vl in bad_ids.items():
    receiving_rows = test.id == key
    source_rows = test.id == vl
    for target in predicted_columns:
        test.loc[receiving_rows, target] = test.loc[source_rows, target].values

In [13]:
# Write the row id and the six predicted columns to PATH_SAVE, without the index.
submission_columns = ['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']
submission = test[submission_columns]
submission.to_csv(PATH_SAVE, index=False)

In [14]:
# NOTE(review): this statement is identical to the one in the previous cell and
# writes the same columns to the same PATH_SAVE file again.
test[['id', 'x', 'y', 'z', 'Vx', 'Vy', 'Vz']].to_csv(PATH_SAVE, index=False)