## Import standard libraries

In [1]:
# Import standard-library modules and add the parent folder to the module search path.
from importlib import reload
import os
import sys
sys.path.append("..")

# Import nn module from torch to replicate kessler tool
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Import json library and create function to format dictionaries.
import json
format_json = lambda x: json.dumps(x, indent=4)

# Import utils library containing miscellaneous functions/classes
from scalib import utils

# Import SCALIB modules for NN development
import scalib.xnn as xnn            # NN models
import scalib.utils as utils
import scalib.eda as eda
import scalib.cfg as cfg

# Set overall seed for reproducibility
utils.seed(1)

# Import matplotlib library and setup environment for plots
%matplotlib inline
%config InlineBackend.figure_format='retina'
from matplotlib import rc

# Set rendering parameters to use TeX font if not working on Juno app.
if not '/private/var/' in utils.cwd:
    rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 11})
    rc('text', usetex=True)
    
# Get current working directory path for the tool parent folder and print it.
print('Parent working directory: %s' % utils.cwd)


Parent working directory: /Users/jjrr/Documents/SCA-Project/scalib


## Import training dataset

In [111]:
# Import library to import Kelvins challenge data
from scalib.eda import kelvins_challenge_events

# Load the Kelvins test and train datasets as DataFrames in CCSDS format.
# Each split is read from its cached CSV under data/ccsds when that file
# exists; otherwise it is converted from the original ESA challenge file and
# the converted DataFrame is cached for the next run.
for filename in ['test_data.csv', 'train_data.csv']:

    filepath_dest = os.path.join(utils.cwd,'data','ccsds', filename)
    filepath_orig = os.path.join(utils.cwd,'data','esa-challenge', filename)

    if os.path.exists(filepath_dest):
        df = pd.read_csv(filepath_dest, index_col=0)
    else:
        # Get ConjunctionEventsDataset object
        events = kelvins_challenge_events(filepath_orig,
                    drop_features = ['c_rcs_estimate', 't_rcs_estimate'])

        # Convert Conjunction Events Dataset to pandas DataFrame and save it.
        # The destination folder is created first so that to_csv cannot fail
        # on a fresh checkout where data/ccsds does not exist yet.
        df = events.to_dataframe(event_id=True)
        os.makedirs(os.path.dirname(filepath_dest), exist_ok=True)
        df.to_csv(filepath_dest)

    # Apply the project dtype conversion to the split that was just loaded.
    # Converting inside the loop guarantees that every split is built from its
    # own data, whichever branch above produced it (previously the train split
    # was derived from the loop-local `df`, which is undefined when both cache
    # files exist and holds the wrong split when only one of them exists).
    df = df.astype(cfg.df_dtype_conversion)

    if filename.startswith('test'):
        df_test = df
    else:
        df_train = df

# Preview the training data.
display(df_train.head(5))

Unnamed: 0,CCSDS_CDM_VERS,CREATION_DATE,ORIGINATOR,MESSAGE_FOR,MESSAGE_ID,TCA,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,...,OBJECT2_CTHR_NDOT,OBJECT2_CTHR_DRG,OBJECT2_CTHR_SRP,OBJECT2_CTHR_THR,__MAX_RISK_ESTIMATE,__MAX_RISK_SCALING,__CREATION_DATE,__TCA,__DAYS_TO_TCA,__EVENT_ID
0,1.0,2023-09-20 07:30:55.604684,,,,2023-09-21 21:07:06.971684,14923.0,13792.0,453.8,5976.6,...,,,,,-7.834756,8.602101,0.0,1.566798,1.566798,0
1,1.0,2023-09-20 16:08:19.532684,,,,2023-09-21 21:07:06.971684,14544.0,13792.0,474.3,5821.2,...,,,,,-7.848937,8.956374,0.359305,1.566798,1.207494,0
2,1.0,2023-09-20 22:15:57.518684,,,,2023-09-21 21:07:06.971684,14475.0,13792.0,474.6,5796.2,...,,,,,-7.847406,8.932195,0.614605,1.566798,0.952193,0
3,1.0,2023-09-21 07:12:23.538684,,,,2023-09-21 21:07:06.971684,14579.0,13792.0,472.7,5838.9,...,,,,,-7.84588,8.913444,0.987129,1.566798,0.579669,0
4,1.0,2023-09-21 14:55:52.529684,,,,2023-09-21 21:07:06.971684,14510.0,13792.0,478.7,5811.1,...,,,,,-7.852942,9.036838,1.308992,1.566798,0.257806,0


In [169]:
from sklearn.utils.class_weight import compute_class_weight
def get_classes(df_input:pd.DataFrame, feature:str='COLLISION_PROBABILITY', 
                threshold:float=-4):
    """Label every row with a binary collision-risk class.

    A row belongs to the positive class when ``df_input[feature]`` is greater
    than or equal to ``threshold``. This is the same comparison the regression
    evaluation cell of this notebook applies with its ``risk_threshold`` of
    -4.0. Missing values compare as False and therefore fall in the negative
    class.

    Args:
        df_input (pd.DataFrame): Data containing the column ``feature``. It is
            not modified; a copy is labelled and returned.
        feature (str, optional): Column compared against the threshold.
            Defaults to 'COLLISION_PROBABILITY'.
        threshold (float, optional): Minimum value of ``feature`` for a row to
            be flagged as a collision. It is expressed in the same units as
            ``feature`` (presumably log10 of the probability - TODO confirm).
            Defaults to -4.

    Returns:
        tuple: ``(df, new_features, class_weights)`` where ``df`` is the copy
        of the input with the columns 'COLLISION_FLAG' (bool),
        'COLLISION_FALSE' and 'COLLISION_TRUE' (0/1 integers) added,
        ``new_features`` is ``['COLLISION_FALSE', 'COLLISION_TRUE']`` and
        ``class_weights`` is a float tensor with the 'balanced' weight of every
        class present in the data, sorted as False, True. Note that it holds a
        single element when only one class occurs.
    """

    # Work on a copy so that the caller's DataFrame is left untouched.
    df = df_input.copy()

    # Flag the rows whose risk reaches the threshold. The threshold is used as
    # given: negating it would turn the default -4 into +4.
    flag = df[feature] >= threshold
    df['COLLISION_FLAG'] = flag

    # One-hot encode the flag explicitly instead of using get_dummies, so that
    # both columns always exist (even if one class is absent), their dtype
    # does not depend on the pandas version, and calling the function again on
    # an already labelled DataFrame overwrites the columns instead of failing.
    new_features = ['COLLISION_FALSE', 'COLLISION_TRUE']
    df['COLLISION_FALSE'] = (~flag).astype(int)
    df['COLLISION_TRUE'] = flag.astype(int)

    # Weights inversely proportional to the class frequencies, used to
    # compensate the class imbalance of the dataset.
    outputs = flag.to_numpy()
    class_weights = torch.tensor(compute_class_weight(
                                    class_weight='balanced',
                                    classes = np.unique(outputs),
                                    y = outputs),
                                    dtype=torch.float)
    
    return df, new_features, class_weights

In [112]:
# Reload the configuration module to pick up any edit made to it.
reload(cfg)

# Names (with object prefix) of the obligatory float/category features.
input_features = cfg.get_features(only_names=True,
                                  include_object_preffix=True,
                                  obligatory=True,
                                  dtype=['float', 'category'])

# Add the object type of both objects to the inputs and substitute 'TCA' by
# '__DAYS_TO_TCA' in the feature names.
# NOTE(review): the substitution applies to any name containing 'TCA', not
# only to a feature called exactly 'TCA' - confirm this is intended.
input_features += ['OBJECT1_OBJECT_TYPE', 'OBJECT2_OBJECT_TYPE']
input_features = [name.replace('TCA', '__DAYS_TO_TCA')
                  for name in input_features]

# Target of the regression problem.
output_features = ['COLLISION_PROBABILITY']

# Add the class columns to both datasets. The class column names and the
# class weights are taken from the training data only.
df_train, output_features_class, class_weights = get_classes(df_input=df_train)
df_test, _, _ = get_classes(df_input=df_test)

# Wrap both DataFrames in SCALIB dataset objects for the regression target.
TrainDataObject = xnn.TensorDatasetFromDataFrame(
    df_train,
    input_features=input_features,
    output_features=output_features)

TestDataObject = xnn.TensorDatasetFromDataFrame(
    df_test,
    input_features=input_features,
    output_features=output_features)

# Sizes of the input and output layers, as reported by the training dataset.
input_size = TrainDataObject.input_size
output_size = TrainDataObject.output_size

# Data held by each dataset object.
data_train = TrainDataObject.data
data_test = TestDataObject.data

Output features: ['COLLISION_FALSE', 'COLLISION_TRUE'] Class weights: tensor([  0.5023, 108.7646])


Unnamed: 0,CCSDS_CDM_VERS,CREATION_DATE,ORIGINATOR,MESSAGE_FOR,MESSAGE_ID,TCA,MISS_DISTANCE,RELATIVE_SPEED,RELATIVE_POSITION_R,RELATIVE_POSITION_T,...,OBJECT2_CTHR_THR,__MAX_RISK_ESTIMATE,__MAX_RISK_SCALING,__CREATION_DATE,__TCA,__DAYS_TO_TCA,__EVENT_ID,FLAG,COLLISION_FALSE,COLLISION_TRUE
0,1.0,2023-09-20 07:30:55.604684,,,,2023-09-21 21:07:06.971684,14923.0,13792.0,453.8,5976.6,...,,-7.834756,8.602101,0.0,1.566798,1.566798,0,False,1,0
1,1.0,2023-09-20 16:08:19.532684,,,,2023-09-21 21:07:06.971684,14544.0,13792.0,474.3,5821.2,...,,-7.848937,8.956374,0.359305,1.566798,1.207494,0,False,1,0
2,1.0,2023-09-20 22:15:57.518684,,,,2023-09-21 21:07:06.971684,14475.0,13792.0,474.6,5796.2,...,,-7.847406,8.932195,0.614605,1.566798,0.952193,0,False,1,0
3,1.0,2023-09-21 07:12:23.538684,,,,2023-09-21 21:07:06.971684,14579.0,13792.0,472.7,5838.9,...,,-7.84588,8.913444,0.987129,1.566798,0.579669,0,False,1,0
4,1.0,2023-09-21 14:55:52.529684,,,,2023-09-21 21:07:06.971684,14510.0,13792.0,478.7,5811.1,...,,-7.852942,9.036838,1.308992,1.566798,0.257806,0,False,1,0


## Artificial Neural Network model for Collision Risk Probability Estimation (CRPE)

#### Embedding categorical input features

An embedding is a vector representation of a categorical variable. This representation is learned with NN models/techniques that take into account the potential relationships between categories when building the vector for each category.

In practice, an embedding matrix is a lookup table for a vector. Each row of an embedding matrix is a vector for a unique category.

The main advantage of using embeddings instead of One Hot/Dummy Encoding techniques (one column of 0s and 1s per unique value of the categorical feature) is that they can preserve the natural order and common relationships between the categories. For example, we could represent each day of the week with 4 floating-point numbers, and two consecutive days would look more similar than two days that are further apart in the week.


The rule of thumb for determining the embedding size (number of elements per array) is to divide the number of unique entries in each column by 2, without exceeding 50.

### Classification model

In [None]:
# Reload the NN module to pick up any edit made to it.
reload(xnn)

# Dataset objects whose targets are the one-hot class columns.
C_TrainDataObject = xnn.TensorDatasetFromDataFrame(
    df_train,
    input_features=input_features,
    output_features=output_features_class)

C_TestDataObject = xnn.TensorDatasetFromDataFrame(
    df_test,
    input_features=input_features,
    output_features=output_features_class)

# Data held by each dataset object.
c_data_train = C_TrainDataObject.data
c_data_test = C_TestDataObject.data

# Hidden layers: two layers four times as wide as the input, followed by one
# layer as wide as the input.
layers = [input_size*4]*2 + [input_size]

# Classification variant of the CRPE model. The class weights computed from
# the training data are passed to the model.
c_model = xnn.CollisionRiskProbabilityEstimator(
    input_size=c_data_train.input_size,
    output_size=c_data_train.output_size,
    layers=layers,
    act_functions=nn.ReLU(),
    dropout_probs=0.2,
    classification=True,
    class_weights=class_weights)

# File of the model parameters, named after the sizes of the hidden layers.
filename = 'crpe_l' + '_l'.join(str(size) for size in layers)
filepath = os.path.join(utils.cwd, 'models',
                        f'{filename}_classification_parameters.pt')

# Show where the model is going to be saved and its structure.
print(filepath)
print(c_model)

In [166]:
# Train the classification model on the training data.
# NOTE(review): `filepath` is a notebook-level variable that the regression
# setup cell reassigns; run this cell before that one so the classification
# parameters file is used.
c_model.learn(
    data=c_data_train,
    epochs=1,
    lr=1e-6,
    batch_size=1000,
    device='cpu',
    epoch_step_checkpoint=2,
    filepath=filepath)

# Plot the loss recorded by the model.
c_model.plot_loss()

Number of learnable parameters of the model: 54,381

Model parameters loaded from /Users/jjrr/Documents/SCA-Project/scalib/models/crpe_l188_l188_l47_class_parameters.pt
 - Total epochs       = 14
 - Total iterations   = 1512
 - Validation loss    = 2.5038e-01
 - Last learning rate = 1.0000e-05


TRAINING COLLISION RISK PROBABILITY ESTIMATOR MODEL:
| Progress                   |    Time     | Iters/sec | Comments
| 100% |██████████| (108/108)| 00h:04m:15s |   0.42    | E(1/1) B(108/108) | Loss > T(2.8731e-01) V(2.5928e-01)

Saving model parameters ... Done.


### Regression model

In [None]:
# Reload the NN module to pick up any edit made to it.
reload(xnn)

# Dataset objects whose target is the collision probability itself.
R_TrainDataObject = xnn.TensorDatasetFromDataFrame(
    df_train,
    input_features=input_features,
    output_features=output_features)

R_TestDataObject = xnn.TensorDatasetFromDataFrame(
    df_test,
    input_features=input_features,
    output_features=output_features)

# Data held by each dataset object.
r_data_train = R_TrainDataObject.data
r_data_test = R_TestDataObject.data

# Hidden layers: two layers four times as wide as the input, followed by one
# layer as wide as the input.
layers = [input_size*4]*2 + [input_size]

# Regression variant of the CRPE model (no classification arguments given).
r_model = xnn.CollisionRiskProbabilityEstimator(
    input_size=r_data_train.input_size,
    output_size=r_data_train.output_size,
    layers=layers,
    act_functions=nn.ReLU(),
    dropout_probs=0.2)

# File of the model parameters, named after the sizes of the hidden layers.
filename = 'crpe_l' + '_l'.join(str(size) for size in layers)
filepath = os.path.join(utils.cwd, 'models',
                        f'{filename}_regression_parameters.pt')

# Show where the model is going to be saved and its structure.
print(filepath)
print(r_model)

In [None]:
# Train the regression model on the training data. `filepath` is the
# regression parameters file defined in the regression setup cell.
r_model.learn(
    data=r_data_train,
    epochs=1,
    lr=1e-6,
    batch_size=1000,
    device='cpu',
    epoch_step_checkpoint=2,
    filepath=filepath)

# Plot the loss recorded by the model.
r_model.plot_loss()

## Model evaluation

### Classification model

In [168]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score as accuracy, precision_score as precision
from sklearn.metrics import f1_score as f1, recall_score as recall

# Get the class scores from the classification model on the test data.
# The model is switched to evaluation mode so that dropout is disabled during
# inference, and gradient tracking is turned off because no backward pass is
# needed (assumes c_model is a torch.nn.Module - TODO confirm).
c_model.eval()
with torch.no_grad():
    y_pred = c_model.forward(c_data_test.inputs).detach().numpy()
y_true = c_data_test.outputs.detach().numpy()

# Convert scores and one-hot targets to class indices: 0 = COLLISION_FALSE,
# 1 = COLLISION_TRUE (assumes arrays of shape (n_samples, n_classes)).
y_pred = y_pred.argmax(axis=1)
y_true = y_true.argmax(axis=1)

# Get the values from the confusion matrix. Both labels are given explicitly
# so that the matrix is always 2x2, even when only one class is present in the
# targets and predictions, and the unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

print(f'Model accuracy  = {accuracy(y_true, y_pred)*100:5.2f}%\n')
print(f'TP: {tp:^6} FP: {fp:^6}\n'
      f'FN: {fn:^6} TN: {tn:^6}')

Model accuracy  = 87.14%

TP:  112   FP:  2423 
FN:   88   TN: 16908 


### Regression model

In [None]:
# Get the collision risk probabilities from the model and the test dataset.
# The model is switched to evaluation mode so that dropout is disabled during
# inference, and gradient tracking is turned off because no backward pass is
# needed (assumes r_model is a torch.nn.Module - TODO confirm).
r_model.eval()
with torch.no_grad():
    y_pred = r_model.forward(r_data_test.inputs).detach().numpy().flatten()
y_true = r_data_test.outputs.detach().numpy().flatten()

# Define risk threshold for collision avoidance manoeuvre.
risk_threshold = -4.0

# Gather predicted and true risks, their absolute difference and the flags
# telling whether each risk reaches the threshold.
df_val = pd.DataFrame({'pred_risk': y_pred, 'true_risk': y_true})
df_val['diff_risk'] = (df_val['pred_risk'] - df_val['true_risk']).abs()
df_val['pred_flag'] = df_val['pred_risk'] >= risk_threshold
df_val['true_flag'] = df_val['true_risk'] >= risk_threshold

# Evaluate accuracy using the confusion matrix. From here on y_true and
# y_pred hold the boolean flags instead of the risk values.
y_true = df_val['true_flag']
y_pred = df_val['pred_flag']

# Get the values from the confusion matrix. Both labels are given explicitly
# so that the matrix is always 2x2, even when no sample (or every sample)
# reaches the threshold, and the unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred,
                                  labels=[False, True]).ravel()

print(f'Model accuracy  = {accuracy(y_true, y_pred)*100:5.2f}%\n')
print(f'TP: {tp:^6} FP: {fp:^6}\n'
      f'FN: {fn:^6} TN: {tn:^6}')