### Imports and Initial Inspections

In [1]:
import os
import pandas as pd
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
evtf_1 = pd.read_csv("context--2008-08-22_2010-07-10--evtf.csv")
evtf_2 = pd.read_csv("context--2010-07-10_2012-05-27--evtf.csv")
evtf_3 = pd.read_csv("context--2012-05-27_2014-04-14--evtf.csv")
evtf = pd.concat([evtf_1, evtf_2, evtf_3])

In [2]:
print(evtf.shape)
print(evtf.describe())
print(evtf.info())
print(evtf['description'].nunique())

(240159, 2)
              ut_ms
count  2.401590e+05
mean   1.311298e+12
std    5.176256e+10
min    1.219364e+12
25%    1.266345e+12
50%    1.312177e+12
75%    1.356946e+12
max    1.397434e+12
<class 'pandas.core.frame.DataFrame'>
Index: 240159 entries, 0 to 88174
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   ut_ms        240159 non-null  int64 
 1   description  240159 non-null  object
dtypes: int64(1), object(1)
memory usage: 5.5+ MB
None
89355


In [3]:
# Figure out the time granularity
evtf['ut_ms_difference'] = evtf['ut_ms'].diff()
# unique values
evtf['ut_ms_difference'].value_counts()
evtf.drop('ut_ms_difference', axis=1, inplace=True)

### Some timestamps have duplicated values.
### Concatenate the descriptions of those rows to fix the issue.

In [4]:
# Convert 'ut_ms' to datetime
evtf['timestamp'] = pd.to_datetime(evtf['ut_ms'], unit='ms')

# Group by 'ut_ms' and concatenate the 'description' values of duplicated timestamps
evtf_grouped = evtf.groupby('timestamp')['description'].apply(', '.join).reset_index()
evtf = pd.merge(evtf_grouped, evtf, on='timestamp', how='left', suffixes=('', '_y'))
evtf.drop('description_y', axis=1, inplace=True)
evtf = evtf.drop_duplicates(subset='timestamp', keep='first')

# For troubleshooting
evtf.to_csv('evtf_updated_desc.csv', index=False)

In [5]:
'''
#Only keep relevant events
evtf = evtf[evtf['description'].str.contains(
    "__KM__ASCEND|_KM_DESCEND|__PENUMBRA_|_UMBRA_|PERICENTRE_", regex=True)]
    
'''

'\n#Only keep relevant events\nevtf = evtf[evtf[\'description\'].str.contains(\n    "__KM__ASCEND|_KM_DESCEND|__PENUMBRA_|_UMBRA_|PERICENTRE_", regex=True)]\n    \n'

### Create  Features

In [6]:
# Extract altitude and direction from description
evtf['altitude'] = evtf['description'].apply(lambda x: int(x.split('_KM_')[0]) if '_KM_ASCEND' in x or '_KM_DESCEND' in x else None)
evtf['direction'] = evtf['description'].apply(lambda x: 1 if '_KM_ASCEND' in x else (-1 if '_KM_DESCEND' in x else None))
# Forward fill the NaN values to persist altitude until next ascend or descend
evtf[['altitude', 'direction']] = evtf[['altitude', 'direction']].fillna(method='ffill')
# Set reference (starting) altitude to zero
evtf['altitude'].fillna(0, inplace=True)
# Set initial direction to the opposite of the first direction event
first_direction_event = evtf['direction'].dropna().iloc[0]
initial_direction = -1 if first_direction_event == 1 else 1
evtf['direction'].fillna(initial_direction, inplace=True)


# Create binary indicators for umbra and penumbra events
evtf['umbra'] = evtf['description'].apply(lambda x: 1 if '_UMBRA_START' in x else (0 if '_UMBRA_END' in x else None))
evtf['penumbra'] = evtf['description'].apply(lambda x: 1 if '_PENUMBRA_START' in x else (0 if 'PENUMBRA_END' in x else None))
evtf[['umbra', 'penumbra']] = evtf[['umbra', 'penumbra']].fillna(method='ffill')
# Set initial value to the opposite of the first start/end event
first_umbra_event = evtf['umbra'].dropna().iloc[0]
initial_umbra = 0 if first_umbra_event == 1 else 1
evtf['umbra'].fillna(initial_umbra, inplace=True)
first_penumbra_event = evtf['penumbra'].dropna().iloc[0]
initial_penumbra = 0 if first_penumbra_event == 1 else 1
evtf['penumbra'].fillna(initial_penumbra, inplace=True)


# Create 'time since last pericentre' feature
# Find the max pericentre passage timestamp that is less/earlier than the current timestamp
pericentre_passages = evtf[evtf['description'].str.contains('PERICENTRE')]
evtf['time_since_last_pericentre_ms'] = evtf['ut_ms'] - evtf['ut_ms'].apply(lambda x: pericentre_passages[pericentre_passages['ut_ms'] <= x]['ut_ms'].max())

# Impute the first few missing values
# Estimate the last (unknown) pericentre timestamp first
median_pericentre_diff = pericentre_passages['ut_ms'].diff().median()
def impute_missing_pericentre_values(row):
    if pd.isna(row['time_since_last_pericentre_ms']):
        last_known_pericentre = pericentre_passages['ut_ms'].min()
        estimated_last_pericentre = last_known_pericentre - median_pericentre_diff
        return row['ut_ms'] - estimated_last_pericentre
    else:
        return row['time_since_last_pericentre_ms']

evtf['time_since_last_pericentre_ms'] = evtf.apply(impute_missing_pericentre_values, axis=1)

  evtf[['altitude', 'direction']] = evtf[['altitude', 'direction']].fillna(method='ffill')
  evtf[['umbra', 'penumbra']] = evtf[['umbra', 'penumbra']].fillna(method='ffill')


In [7]:
# resample to an hourly time granularity
evtf.set_index('timestamp', inplace=True)

evtf.to_csv('evtf_train_without_resampling.csv', index= True)
evtf_resampled = evtf.resample('H').ffill()
evtf_resampled.drop(['description', 'ut_ms'], axis=1, inplace=True)
evtf_resampled.fillna(method='bfill', inplace=True)

evtf_resampled.to_csv('evtf_train.csv', index= True)

  evtf_resampled.fillna(method='bfill', inplace=True)


In [8]:
evtf_test = pd.read_csv("context--2014-04-14_2016-03-01--evtf")
evtf_test['timestamp'] = pd.to_datetime(evtf_test['ut_ms'], unit='ms')
evtf_test.set_index('timestamp', inplace=True)

# Extract altitude and direction from description
evtf_test['altitude'] = evtf_test['description'].apply(lambda x: int(x.split('_KM_')[0]) if '_KM_ASCEND' in x or '_KM_DESCEND' in x else None)
evtf_test['direction'] = evtf_test['description'].apply(lambda x: 1 if '_KM_ASCEND' in x else (-1 if '_KM_DESCEND' in x else None))
# Forward fill the NaN values to persist altitude until next ascend or descend
evtf_test[['altitude', 'direction']] = evtf_test[['altitude', 'direction']].ffill()
# Set reference (starting) altitude to zero
evtf_test['altitude'].fillna(0, inplace=True)
# Set initial direction to the opposite of the first direction event
first_direction_event = evtf_test['direction'].dropna().iloc[0]
initial_direction = -1 if first_direction_event == 1 else 1
evtf_test['direction'].fillna(initial_direction, inplace=True)

# Create binary indicators for umbra and penumbra events
evtf_test['umbra'] = evtf_test['description'].apply(lambda x: 1 if '_UMBRA_START' in x else (0 if '_UMBRA_END' in x else None))
evtf_test['penumbra'] = evtf_test['description'].apply(lambda x: 1 if '_PENUMBRA_START' in x else (0 if '_PENUMBRA_END' in x else None))
evtf_test[['umbra', 'penumbra']] = evtf_test[['umbra', 'penumbra']].ffill()
# Set initial value to the opposite of the first start/end event
first_umbra_event = evtf_test['umbra'].dropna().iloc[0]
initial_umbra = 0 if first_umbra_event == 1 else 1
evtf_test['umbra'].fillna(initial_umbra, inplace=True)
first_penumbra_event = evtf_test['penumbra'].dropna().iloc[0]
initial_penumbra = 0 if first_penumbra_event == 1 else 1
evtf_test['penumbra'].fillna(initial_penumbra, inplace=True)

# Create 'time since last pericentre' feature
pericentre_passages = evtf_test[evtf_test['description'].str.contains('PERICENTRE')]
evtf_test['time_since_last_pericentre_ms'] = evtf_test['ut_ms'] - evtf_test['ut_ms'].apply(lambda x: pericentre_passages[pericentre_passages['ut_ms'] <= x]['ut_ms'].max())

# Impute the first few missing values
median_pericentre_diff = pericentre_passages['ut_ms'].diff().median()
evtf_test['time_since_last_pericentre_ms'] = evtf_test.apply(lambda row: row['ut_ms'] - (pericentre_passages['ut_ms'].min() - median_pericentre_diff) if pd.isna(row['time_since_last_pericentre_ms']) else row['time_since_last_pericentre_ms'], axis=1)

FileNotFoundError: [Errno 2] No such file or directory: 'context--2014-04-14_2016-03-01--evtf'

In [None]:
# resample to an hourly time granularity
evtf_test_resampled = evtf_test.resample('H').ffill()
evtf_test_resampled.drop(['description', 'ut_ms'], axis=1, inplace=True)

evtf_test_resampled.fillna(method='bfill', inplace=True)

evtf_test_resampled.to_csv('evtf_test.csv', index=True)