In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy.interpolate import splrep, splev
import datetime as dt
import matplotlib.pyplot as plt
from fbprophet import Prophet
from scipy.interpolate import InterpolatedUnivariateSpline
%matplotlib inline
plt.rcParams.update({'font.size': 18})

ERROR:fbprophet:Importing plotly failed. Interactive plots will not work.


In [2]:
def smape(satellite_predicted_values, satellite_true_values):
    '''
    Symmetric mean absolute percentage error of two equally shaped arrays.

    Evaluates mean(|pred - true| / (|pred| + |true|)) element by element;
    no extra factor (2 or 100) is applied. A position where both values are
    zero gives 0/0 and therefore makes the result NaN.
    '''
    absolute_error = np.abs(satellite_predicted_values - satellite_true_values)
    magnitude_sum = np.abs(satellite_predicted_values) + np.abs(satellite_true_values)
    return np.mean(absolute_error / magnitude_sum)


def drop_close(t, x, eps=10**9):
    '''
    Thin out samples that are followed too closely by the next sample.

    t: time array (or series), x: data array (or series), eps: minimum gap
    in the units of t (nanoseconds in this notebook).

    An entry is kept only if the gap to the NEXT entry exceeds eps; the last
    entry is always kept. Both inputs are converted to numpy arrays first so
    that pandas Series inputs do not cause index-alignment problems.
    Returns the filtered (t, x) pair as numpy arrays.
    '''
    times = np.array(t)
    values = np.array(x)
    gap_to_next = times[1:] - times[:-1]
    keep = np.append(gap_to_next > eps, True)
    return times[keep], values[keep]


def resample(t, x, step=10 * 10**9, t_new=None, return_t=False):
    '''
    Resample a time series onto a regular grid with a spline fit.

    t: time array (or series);
    x: data array (or series);
    step: grid spacing in the units of t (10 seconds in nanoseconds by default);
    t_new: time grid to evaluate on; if None, np.arange(t[0], t[-1], step)
        is built from the de-duplicated times;
    return_t: if True return (t_new, x_new), otherwise only x_new.

    Samples that are too close to their successor are removed with
    drop_close (default eps) before fitting, then splrep/splev are used with
    their default settings.

    Raises ValueError (chained to the underlying error) if the spline cannot
    be fitted or evaluated, e.g. when too few points remain.
    '''
    t, x = drop_close(t, x)
    if t_new is None:
        t_new = np.arange(t[0], t[-1], step)
    try:
        spl = splrep(t, x)
        x_new = splev(t_new, spl)
    except Exception as err:
        # Only trap real errors (not KeyboardInterrupt/SystemExit) and keep
        # the original exception attached as the cause.
        raise ValueError(
            f'interpolation error, x length = {len(x)}, '
            f't_new length = {len(t_new)}'
        ) from err

    return (t_new, x_new) if return_t else x_new


def get_peaks(array):
    '''
    Indices of strict local maxima of a 1-D array.

    An interior element counts as a peak only when it is strictly greater
    than both neighbours, so plateaus ("smooth peaks" such as
    1 2 3 9 9 3 2 1) produce no index. The first and last elements are never
    reported.
    '''
    interior = array[1:-1]
    above_right = interior > array[2:]
    above_left = interior > array[:-2]
    # +1 converts positions within the interior slice back to positions in array
    return np.flatnonzero(above_right & above_left) + 1


def get_satellite_data(data, sat_id):
    '''
    Select the rows of data whose 'sat_id' column equals sat_id.

    The original index labels of the selected rows are preserved.
    '''
    belongs_to_satellite = data['sat_id'] == sat_id
    return data[belongs_to_satellite]


def remove_time_jumps(satellite_data, features_list=('x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'),
                      suffix='_jumps_removed',time_threshold = 0.00003):
    #time_threshold 0.00003 sufficient for test and train
    #time_threshold 0.00002 will throw errors
    # NOTE(review): presumably the errors come from the while-loops below
    # stepping past the available rows -- not verified here.
    '''
    Remove time jumps in the simulated features of a single satellite.

    For every row whose actual time 't' deviates from the nominal simulation
    time 't_sim' by more than time_threshold, the feature value is replaced
    by a linear interpolation between two neighbouring simulated samples,
    evaluated at the actual time.

    satellite_data: DataFrame with an 'epoch' column (parsed with
        pd.to_datetime) and the columns named in features_list. Rows are
        accessed by label 0..n-1 and 't_sim' is derived from the index
        values, so this presumably expects a freshly reset integer index
        -- TODO confirm against callers.
    features_list: names of the columns to correct.
    suffix: appended to each feature name to form the output column names.
    time_threshold: allowed deviation between 't' and 't_sim', in days
        (the unit of 't').

    Side effect: the columns 't', 'dt' and 't_sim' are written onto the
    passed-in satellite_data frame.

    Returns a new DataFrame with one column per feature, named
    feature + suffix, and one row per input row.
    '''
    # Elapsed time since the first row, expressed in days.
    satellite_data['t'] = ((pd.to_datetime(satellite_data['epoch']) - pd.to_datetime(satellite_data['epoch'])[0]) /
                           np.timedelta64(1, 'D')).astype(float)
    satellite_data['dt'] = satellite_data['t'].diff()
    # the most frequent time interval
    t_standard = satellite_data['dt'].value_counts().index[0]

    # time steps used for simulations: index value times the most frequent interval
    satellite_data['t_sim'] = satellite_data.index.values * t_standard

    n = satellite_data.shape[0]
    corrected_features = []
    for feature_name in features_list:
        corrected_feature = [0] * n
        # The first sample is taken over unchanged.
        corrected_feature[0] = satellite_data[feature_name][0]

        feature = satellite_data[feature_name]

        # Interior rows only; the last row is handled separately after the loop.
        for j in range(1, n - 1):
            if satellite_data.t[j] < satellite_data.t_sim[j] - time_threshold:
                # approximate by the left side

                # look for the interval: walk backwards until t[j] is no
                # longer earlier than t_sim[j - step] (within the threshold)
                step = 0
                while satellite_data.t[j] < satellite_data.t_sim[j - step] - time_threshold:
                    step += 1
                #             print(step)
                # Linear interpolation on the segment
                # (t_sim[j-step-1], t_sim[j-step]) evaluated at t[j].
                corrected_feature[j] = feature[j - step] - (satellite_data.t_sim[j - step] - satellite_data.t[j]) / (
                            satellite_data.t_sim[j - step] - satellite_data.t_sim[j - step - 1]) * (
                                              feature[j - step] - feature[j - step - 1])
            elif satellite_data.t[j] > satellite_data.t_sim[j] + time_threshold:
                # approximate by the right side

                # look for the interval: walk forwards until t[j] is no
                # longer later than t_sim[j + step] (within the threshold)
                step = 0
                while satellite_data.t[j] > satellite_data.t_sim[j + step] + time_threshold:
                    step += 1

                # Linear interpolation on the segment
                # (t_sim[j+step], t_sim[j+step+1]) evaluated at t[j].
                corrected_feature[j] = feature[j + step + 1] - (satellite_data.t_sim[j + step + 1] - satellite_data.t[j]) / (
                            satellite_data.t_sim[j + step + 1] - satellite_data.t_sim[j + step]) * (
                                              feature[j + step + 1] - feature[j + step])
            else:
                # Within tolerance: keep the simulated value as is.
                corrected_feature[j] = feature[j]

        # Last row: raw value shifted by the correction applied to the row before it.
        corrected_feature[n - 1] = feature[n - 1] + corrected_feature[n - 2] - feature[n - 2]
        corrected_features.append(corrected_feature)
    # Rows of the intermediate frame are features; transpose so they become columns.
    return pd.DataFrame(corrected_features, index=[f + suffix for f in features_list]).T

In [3]:
def insert_previous_and_shift(df,col_name,ind):
    '''
    Duplicate the previous value of a column at a position and shift the tail.

    df: DataFrame to modify (the column is replaced on this object);
    col_name: name of the column to correct;
    ind: integer POSITION (not label) at which the previous value is inserted.

    After the call, position ind holds the old value from ind-1 and every
    later position k holds the old value from k-1; the old last value is
    dropped, so the column length is unchanged. With ind == 0 the "previous"
    value wraps around to the last element (ind-1 == -1).

    The column is rebuilt as an array and assigned back in one step instead
    of using chained `df[col].iloc[...] = ...` assignment, which raises
    SettingWithCopyWarning and has no effect under pandas Copy-on-Write.

    Returns df. This is used for remove_time_jumps_fast.
    '''
    original = df[col_name].values
    corrected = original.copy()
    corrected[ind] = original[ind - 1]
    corrected[ind + 1:] = original[ind:-1]
    df[col_name] = corrected
    return df

In [4]:
def remove_time_jumps_fast(data, features_list=
                           ('x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim'),
                           threshold = 0.000010):
    '''
    Remove time jumps in the simulation for a single satellite (fast variant).

    data: DataFrame for one satellite with a numeric 'epoch' column and the
        columns in features_list. Row labels are passed on as positions, so
        the frame should carry a 0..n-1 index (the caller in this notebook
        passes reset_index(drop=True)).
    features_list: columns that are corrected in place of the originals.
    threshold: rows whose scaled epoch step 'dt' is below this value are
        treated as jumps. Note that this threshold is not the same quantity
        as time_threshold in remove_time_jumps.

    Side effect: a 'dt' column (epoch difference to the previous row times
    1e-12) is written onto data. For every jump row and every feature the
    previous value is inserted at that row and the rest of the column is
    shifted down by one (see insert_previous_and_shift).

    Returns the corrected frame.
    '''
    epoch_step = data['epoch'].diff(1)
    data['dt'] = epoch_step*(10**-12)

    # The first row has dt = NaN and is therefore never selected.
    jump_rows = data.index[(data['dt'] < threshold).values]
    for feature in features_list:
        for row in jump_rows:
            data = insert_previous_and_shift(data, feature, row)
    return data

## Transforming all the data

In [14]:
import utils
from LinearAlignment import LinearAlignment

In [6]:
# Target columns; the matching simulated columns carry the suffix '_sim'.
features_list = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']

In [7]:
# Training set, indexed by the 'id' column.
data = pd.read_csv('data/train.csv', index_col='id')
# Replace the textual timestamps by their numeric value
# (datetime64 values cast to float).
train_epoch = pd.to_datetime(data['epoch']).values
data['epoch'] = train_epoch.astype(float)


In [9]:
# Test set, indexed by the 'id' column, with the same numeric epoch
# conversion as the training set.
test_data = pd.read_csv('data/test.csv', index_col='id')
test_epoch = pd.to_datetime(test_data['epoch']).values
test_data['epoch'] = test_epoch.astype(float)
# Only satellites that occur in the test set need predictions.
satellites_list = test_data['sat_id'].unique()


In [10]:
# Keep only the training rows of satellites that also occur in the test set,
# grouped in the order of satellites_list. The per-satellite frames are
# collected first and concatenated once: DataFrame.append inside a loop
# copies the growing frame on every iteration and no longer exists in
# pandas >= 2.0.
train_frames = [data[data.sat_id == sat_id] for sat_id in satellites_list]
useful_data = pd.concat(train_frames, sort=False) if train_frames else pd.DataFrame([])
    

In [11]:
# Stack the selected training rows on top of the test rows; columns missing
# in the test set are filled with NaN there.
full_data = pd.concat((useful_data, test_data), axis=0, sort=False)

In [12]:
# Total memory footprint of full_data in MiB (object columns measured deeply).
full_data.memory_usage(deep=True).sum() / 1024 ** 2

68.57917785644531

In [13]:

def _fit_and_predict(model, train, full, feature_name, negate):
    '''
    Fit model on the training rows of one feature and predict over all rows.

    With negate=True both the simulated and the ground-truth training values
    are negated before fitting. The prediction is always made on the
    un-negated simulated values of the full frame.
    '''
    sim_name = f'{feature_name}_sim'
    sim_train = train[sim_name].values
    truth_train = train[feature_name].values
    if negate:
        sim_train, truth_train = -sim_train, -truth_train
    model.fit(t=train['epoch'].values, x=sim_train, gt=truth_train)
    return model.predict(t=full['epoch'].values, x=full[sim_name].values)


alignment_model = LinearAlignment()

satellites_list = test_data['sat_id'].unique()

# Per-satellite prediction frames; concatenated once after the loop because
# DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
predictions = []
for sat_id in tqdm(satellites_list):
    try:
        sat_data = get_satellite_data(full_data, sat_id)
        # Rows with a known 'x' are the labelled (training) rows; they come
        # first because full_data stacks the training rows before the test rows.
        n_first = int(sat_data['x'].notna().sum())

        # Predictions are stored under the original ids of the unlabelled rows.
        pred = pd.DataFrame(index=sat_data.index[n_first:])

        sat_data = remove_time_jumps_fast(sat_data.reset_index(drop=True))
    except KeyError as e:
        print(f'jump removal failed for satellite {sat_id}:\t{type(e).__name__} {e}')
        continue

    train_sat_data = sat_data[:n_first]

    try:
        for feature_name in features_list:
            # Two candidate alignments: fitted on negated values and fitted
            # on the values as they are (order of fit/predict calls matters
            # because the same model object is reused).
            option1 = _fit_and_predict(alignment_model, train_sat_data, sat_data,
                                       feature_name, negate=True)
            option2 = _fit_and_predict(alignment_model, train_sat_data, sat_data,
                                       feature_name, negate=False)

            # Keep whichever candidate scores better on the labelled rows;
            # ties (and NaN scores) fall back to option2.
            truth = sat_data[feature_name].values[:n_first]
            score1 = utils.smape(option1[:n_first], truth)
            score2 = utils.smape(option2[:n_first], truth)
            best = option1 if score1 < score2 else option2
            pred[feature_name] = best[n_first:]
    except Exception as e:
        # Best effort: report the satellite and carry on with the others.
        print(f'linear alignment failed for satellite {sat_id}:\t{type(e).__name__} {e}')
        continue

    predictions.append(pred)

transf_df = pd.concat(predictions, sort=False) if predictions else pd.DataFrame([])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [02:25<00:00,  1.74it/s]


In [None]:
# Sanity check: (number of predicted rows, number of predicted columns).
transf_df.shape

In [None]:
# Load the reference submission file; its 'id' column is used below to pick
# and order the predicted rows.
sub = pd.read_csv('data/submission.csv')

In [None]:
# Row ids of the reference submission, as a plain array.
sub_index = sub.id.values
sub.head()

In [None]:
# Select the predictions by submission id, in the order of the submission file.
# NOTE(review): on recent pandas, .loc with a list of labels raises KeyError
# if any id is missing (e.g. a satellite skipped in the loop above) -- confirm
# that every submission id has a prediction.
q = transf_df.loc[sub_index,:]
# Name the six prediction columns explicitly.
q.columns = ['x','y','z','Vx','Vy','Vz']
# q = q.reset_index()
q.head()

In [None]:
# Visual check: compare a window of rows of the reference submission with the
# aligned prediction. Labels and a legend make the two curves
# distinguishable when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(15, 5))

window = slice(2500, 2900)
ax.plot(sub.x.iloc[window].values, label='sub.x (data/submission.csv)')
ax.plot(q.x.iloc[window].values, label='q.x (linear alignment)')

ax.set_xlabel('row offset within rows 2500:2900')
ax.set_ylabel('x')
ax.legend();

In [None]:
import datetime

# Write the predictions to a time-stamped CSV so earlier submissions are
# never overwritten.
timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
submission_filename = 'submission_{}.csv'.format(timestamp)
q.to_csv(submission_filename)
print('Submission saved to {}'.format(submission_filename))

q.head(10)