В каждой строке через пробел перечислены следующие данные:

Время, чч:мм:сс

Идентификатор трека (id)

Широта, градусы с десятичной дробной частью

Долгота, градусы с десятичной дробной частью

Высота, м

Код ответчика

Позывной


In [1]:
import pandas as pd
import os
from tqdm import tqdm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
bads =  ['BadTracksHackaton1801.txt',
         'BadTracksHackaton2101.txt',
         'BadTracksHackaton2301.txt'
         ]

goods = ['GoodTracksHackaton1801.txt',
         'GoodTracksHackaton2101.txt',
         'GoodTracksHackaton2301.txt']

In [3]:
bad = pd.concat((pd.read_csv(f, sep=' ', names=['time',
                                              'track_id',
                                              'latitude',
                                              'longitude',
                                              'height',
                                              'code',
                                              'name']) for f in bads), ignore_index=True)
bad['seconds'] = pd.to_timedelta(bad['time']).dt.total_seconds().astype(int)


good = pd.concat((pd.read_csv(f, sep=' ', names=['time',
                                              'track_id',
                                              'latitude',
                                              'longitude',
                                              'height',
                                              'code',
                                              'name']) for f in goods), ignore_index=True)
good['seconds'] = pd.to_timedelta(good['time']).dt.total_seconds().astype(int)


In [4]:
bad.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1753454 entries, 0 to 1753453
Data columns (total 8 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   time       1753454 non-null  object 
 1   track_id   1753454 non-null  int64  
 2   latitude   1753454 non-null  float64
 3   longitude  1753454 non-null  float64
 4   height     1753454 non-null  int64  
 5   code       1691187 non-null  float64
 6   name       1536240 non-null  object 
 7   seconds    1753454 non-null  int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 107.0+ MB


In [5]:
good.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850653 entries, 0 to 1850652
Data columns (total 8 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   time       1850653 non-null  object 
 1   track_id   1850653 non-null  int64  
 2   latitude   1850653 non-null  float64
 3   longitude  1850653 non-null  float64
 4   height     1850653 non-null  int64  
 5   code       1850202 non-null  float64
 6   name       1127100 non-null  object 
 7   seconds    1850653 non-null  int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 113.0+ MB


In [107]:
# СБОРКА МИНИМАЛЬНОГО НАБОРА ФИЧЕЙ
def make_dataset(df):
    
    df = df[~df.name.isna()]
    
    df['date_time'] = pd.to_datetime(df['time'])
    
    # добавляем период дня
    df['period'] = (df['date_time'].dt.hour % 24 + 4) // 4
    df['period'].replace({1: 'Late Night',
                          2: 'Early Morning',
                          3: 'Morning',
                          4: 'Noon',
                          5: 'Evening',
                          6: 'Night'}, inplace=True)
    
    # добавляем порядковый номер
    df = df.sort_values(by=['track_id', 'code', 'name', 'seconds'])
    df['group_num'] = df.groupby(['track_id','code', 'name']).cumcount()
    
    # добавляем время от начала трека полета
    to_merge = df[(df.group_num == 0)
                    &(df.seconds>1)][['track_id',  'code', 'name', 'seconds']]

    df = df.merge(to_merge, on=['track_id',  'code', 'name'], how='left')
    df.fillna({'seconds_y':0}, inplace=True)
    df['time_from_start'] = df.seconds_x-df.seconds_y
    
    for col in ['time_from_start', 'height', 'longitude', 'latitude']:
        for i in [1, 10, 100, 200]:
            df[f'diff_{i}_{col}'] = abs(np.where(df.track_id == df.track_id.shift(1), df[col].diff(periods=i), 0))
    
    for col in [ 'height', 'longitude', 'latitude']:
        for i in [1, 10, 100, 200]:
            df[f'diff_{i}_{col}/sec'] = abs(df[f'diff_{i}_{col}']/df[f'diff_{i}_time_from_start'])
            
    df.replace([np.inf, -np.inf, np.nan, -np.nan], -1, inplace=True) 
    
    df.drop(columns=['time', 'seconds_x', 'date_time',  'group_num', 'seconds_y'], inplace=True)
    
    df['full_id'] =  df["name"] + '_'+ df["track_id"].astype(str) 
    
    return df


In [94]:
df = make_dataset(good)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_time'] = pd.to_datetime(df['time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['period'] = (df['date_time'].dt.hour % 24 + 4) // 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [108]:
df_bad = make_dataset(bad)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_time'] = pd.to_datetime(df['time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['period'] = (df['date_time'].dt.hour % 24 + 4) // 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


## Сборка расширенного набора признаков

In [2]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import MinimalFCParameters, EfficientFCParameters ,ComprehensiveFCParameters

from tqdm import tqdm_notebook as tqdm

In [125]:
params = {'variance_larger_than_standard_deviation': None,
 'sum_values': None,
 'abs_energy': None,
 'mean_abs_change': None,
 'mean_change': None,
 'mean_second_derivative_central': None,
 'median': None,
 'mean': None,
 'length': None,
 'standard_deviation': None,
 'variance': None,
 'skewness': None,
 'kurtosis': None,
 'absolute_sum_of_changes': None,
 'longest_strike_below_mean': None,
 'longest_strike_above_mean': None,
 'count_above_mean': None,
 'count_below_mean': None,
 'last_location_of_maximum': None,
 'first_location_of_maximum': None,
 'last_location_of_minimum': None,
 'first_location_of_minimum': None,
 'ratio_value_number_to_time_series_length': None,
 'maximum': None,
 'minimum': None,
 'c3': [{'lag': 1}, {'lag': 2}, {'lag': 3}],
 'cid_ce': [{'normalize': True}, {'normalize': False}],
 'quantile': [
  {'q': 0.8},
  {'q': 0.9},
  {'q': 0.99}],
 'autocorrelation': [
  {'lag': 200},
  {'lag': 100},
  {'lag': 10}],
 'partial_autocorrelation': [
  {'lag': 200},
  {'lag': 100},
  {'lag': 10}],
 'number_cwt_peaks': [{'n': 1}, {'n': 5}],
 'number_peaks': [{'n': 5}, {'n': 10}, {'n': 50}],
 'ar_coefficient': [
  {'coeff': 2, 'k': 10},
  {'coeff': 3, 'k': 10},
  {'coeff': 4, 'k': 10}],
 'linear_trend': [{'attr': 'pvalue'},
                  {'attr': 'stderr'}],
 'augmented_dickey_fuller': [{'attr': 'teststat'},
  {'attr': 'pvalue'},
  {'attr': 'usedlag'}],
 'ratio_beyond_r_sigma': [{'r': 0.5},
  {'r': 1.5},
  {'r': 2},
  {'r': 2.5}]}

In [126]:
def make_features(df):
    
    ts = df.drop(['code', 'period', 'name'], axis=1)
    
    columns_to_use = ['full_id', 'time_from_start',
                  'latitude', 'longitude', 'height',
                  'diff_1_time_from_start', 'diff_3_time_from_start','diff_10_time_from_start', 
                  'diff_1_height/sec', 'diff_3_height/sec',
               'diff_10_height/sec', 'diff_1_longitude/sec', 'diff_3_longitude/sec',
               'diff_10_longitude/sec', 'diff_1_latitude/sec', 'diff_3_latitude/sec',
               'diff_10_latitude/sec']
    
    tst =  extract_features(ts[columns_to_use],
                            column_id = "full_id", 
                            column_sort = "time_from_start", 
                            impute_function=impute, 
                            default_fc_parameters=params,
                            n_jobs=8,
                            show_warnings=False)
    
    return tst 


In [127]:
df_good_feat = make_features(df)

Feature Extraction: 100%|██████████| 40/40 [09:33<00:00, 14.33s/it] 


In [128]:
df_bad_feat = make_features(df_bad)

Feature Extraction: 100%|██████████| 40/40 [12:42<00:00, 19.06s/it]  


In [134]:
df = df.merge(df_good_feat.reset_index(), left_on='full_id', right_on='index', how='left')
df.to_parquet('good_ts_features.parquet')

In [135]:
df_bad = df_bad.merge(df_bad_feat.reset_index(), left_on='full_id', right_on='index', how='left')
df_bad.to_parquet('bad_ts_features.parquet')

In [147]:
df['target'] = 0
df_bad['target'] = 1

In [148]:
full_id_good_train, full_id_good_test = train_test_split(df.full_id.unique(), test_size=0.3, random_state=42)
len(full_id_good_train), len(full_id_good_test)

(1589, 682)

In [149]:
full_id_bad_train, full_id_bad_test = train_test_split(df_bad.full_id.unique(), test_size=0.3, random_state=42)
len(full_id_bad_train), len(full_id_bad_test)

(2028, 870)

In [151]:
train = pd.concat([df[df.full_id.isin(full_id_good_train)],
                   df_bad[df_bad.full_id.isin(full_id_bad_train)]], ignore_index=True)

test = pd.concat([df[df.full_id.isin(full_id_good_test)],
                   df_bad[df_bad.full_id.isin(full_id_bad_test)]], ignore_index=True)

In [152]:
train.to_parquet('train_feat_ts.parquet')
test.to_parquet('test_feat_ts.parquet')

In [4]:
train = pd.read_parquet('train_feat_ts.parquet', engine='pyarrow')

In [5]:
test = pd.read_parquet('test_feat_ts.parquet', engine='pyarrow')

In [6]:
train.head()

Unnamed: 0,track_id,latitude,longitude,height,code,name,period,time_from_start,diff_1_time_from_start,diff_3_time_from_start,...,"diff_10_height/sec__linear_trend__attr_""pvalue""","diff_10_height/sec__linear_trend__attr_""stderr""","diff_10_height/sec__augmented_dickey_fuller__attr_""teststat""__autolag_""AIC""","diff_10_height/sec__augmented_dickey_fuller__attr_""pvalue""__autolag_""AIC""","diff_10_height/sec__augmented_dickey_fuller__attr_""usedlag""__autolag_""AIC""",diff_10_height/sec__ratio_beyond_r_sigma__r_0.5,diff_10_height/sec__ratio_beyond_r_sigma__r_1.5,diff_10_height/sec__ratio_beyond_r_sigma__r_2,diff_10_height/sec__ratio_beyond_r_sigma__r_2.5,target
0,1,55.93462,37.2721,1219,1551.0,AFL1522,Late Night,0.0,0.0,0.0,...,4.192134e-68,0.001297,-1.704858,0.428611,17.0,0.926966,0.126404,0.044944,0.022472,0
1,1,55.933381,37.267583,1219,1551.0,AFL1522,Late Night,3.0,3.0,-1.0,...,4.192134e-68,0.001297,-1.704858,0.428611,17.0,0.926966,0.126404,0.044944,0.022472,0
2,1,55.933338,37.266435,1204,1551.0,AFL1522,Late Night,4.0,1.0,-1.0,...,4.192134e-68,0.001297,-1.704858,0.428611,17.0,0.926966,0.126404,0.044944,0.022472,0
3,1,55.931407,37.252316,1257,1551.0,AFL1522,Late Night,14.0,10.0,14.0,...,4.192134e-68,0.001297,-1.704858,0.428611,17.0,0.926966,0.126404,0.044944,0.022472,0
4,1,55.933145,37.236722,1303,1551.0,AFL1522,Late Night,24.0,10.0,21.0,...,4.192134e-68,0.001297,-1.704858,0.428611,17.0,0.926966,0.126404,0.044944,0.022472,0
