In [1]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [2]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [3]:
# Root anomaly labels (source: ASRS coding forms), ordered by descending frequency.
# These strings are used as prefixes when encoding the 'Anomaly' column.
anomaly_labels = [
    'Deviation / Discrepancy - Procedural',
    'Aircraft Equipment',
    'Conflict',
    'Inflight Event / Encounter',
    'ATC Issue',
    'Deviation - Altitude',
    'Deviation - Track / Heading',
    'Ground Event / Encounter',
    'Flight Deck / Cabin / Aircraft Event',
    'Ground Incursion',
    'Airspace Violation',
    'Deviation - Speed',
    'Ground Excursion',
    'No Specific Anomaly Occurred',
]

In [4]:
# Function to check prefixes and include 'Other' category
def check_prefixes(anomaly, prefixes):
    """Multi-hot encode one ``Anomaly`` value against a set of label prefixes.

    The value is split on ';' and each piece is stripped of surrounding
    whitespace. A label is set when at least one piece starts with that
    label. An extra trailing ``'Other'`` entry is set only when no label
    matched.

    Parameters
    ----------
    anomaly : str or NaN/None
        Raw ';'-separated anomaly string. A missing value (``pd.isna``)
        yields all zeros, including ``'Other'``.
    prefixes : iterable of str
        Label prefixes to test, in output order. Any iterable is accepted.

    Returns
    -------
    pd.Series
        Integer 0/1 flags indexed by each prefix followed by ``'Other'``.
        The flags are always integers, so the encoding has the same dtype
        whether or not the data contains missing values.
    """
    labels = list(prefixes)

    if pd.isna(anomaly):
        # Missing anomaly: no label is set, and 'Other' is not set either.
        return pd.Series({label: 0 for label in labels + ['Other']})

    items = [item.strip() for item in anomaly.split(';')]
    encoding = {
        label: int(any(item.startswith(label) for item in items))
        for label in labels
    }
    # If no prefix matches, this is 'Other'.
    encoding['Other'] = int(not any(encoding.values()))
    return pd.Series(encoding)

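**TODO:** decide whether rows with a NaN value in `Anomaly` should be dropped. Currently `check_prefixes` encodes them as all zeros (including `Other`).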

In [5]:
# Load the pickled training split; the frame is the first element of the unpickled object.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
loaded_data = pd.read_pickle("./data/train_data_final.pkl")
train_df = loaded_data[0]

print("\nA Dataframe with", len(train_df), "entries has been loaded")

# Encode every 'Anomaly' value against the label prefixes (one output column per label, plus 'Other'),
# then store each row's encoding as a plain list in a single column.
train_anomaly_encoding = train_df['Anomaly'].apply(check_prefixes, args=(anomaly_labels,))
train_df['anomaly_encoding'] = train_anomaly_encoding.to_numpy().tolist()
train_df.head()



A Dataframe with 97417 entries has been loaded


Unnamed: 0_level_0,Date,Local Time Of Day,Locale Reference,State Reference,Relative Position.Angle.Radial,Relative Position.Distance.Nautical Miles,Altitude.AGL.Single Value,Altitude.MSL.Single Value,Flight Conditions,Weather Elements / Visibility,...,Result,Contributing Factors / Situations,Primary Problem,Narrative,Callback,Narrative.1,Callback.1,Synopsis,Year,anomaly_encoding
ACN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1163382,201404,1201-1800,ZZZ.Airport,US,,,0.0,,VMC,,...,Flight Crew Returned To Gate; Flight Crew Reje...,Aircraft; Environment - Non Weather Related; P...,Environment - Non Weather Related,I was the pilot flying performing the takeoff....,,At approximately 75 KTS I glanced at my airspe...,,A B767 Captain; the pilot not flying; rejected...,2014,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
893734,201006,1801-2400,SFO.Airport,CA,,,0.0,,,,...,General None Reported / Taken,Human Factors,Human Factors,We had 6 shipments of dry ice for the flight; ...,,,,A B767-300 Pilot reported his Dangerous Goods ...,2010,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
991883,201201,0601-1200,EGLL.Airport,FO,,,,4000.0,,,...,General None Reported / Taken,Procedure; Company Policy; Human Factors; Manuals,Company Policy,I have seen a lot of mistakes on every flight ...,,,,The Captain of an international flight crew re...,2012,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
1590076,201810,0001-0600,EUG.Airport,OR,,,,3900.0,VMC,,...,Flight Crew Took Evasive Action,Human Factors; Procedure,Ambiguous,It was my first time flying into KEUG and I wa...,,Night VMC visual approach left base leg into E...,,Air carrier flight crew reported receiving a T...,2018,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1715282,202001,1801-2400,MDW.Airport,IL,,5.0,,2000.0,,,...,General None Reported / Taken,Procedure,Procedure,I am writing this report to bring attention to...,,,,Air Carrier First Officer reported that the us...,2020,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [6]:
# Load the pickled test split; the frame is the first element of the unpickled object.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
loaded_data = pd.read_pickle("./data/test_data_final.pkl")
test_df = loaded_data[0]

print("\nA Dataframe with", len(test_df), "entries has been loaded")

# Encode every 'Anomaly' value against the label prefixes (one output column per label, plus 'Other'),
# then store each row's encoding as a plain list in a single column.
test_anomaly_encoding = test_df['Anomaly'].apply(check_prefixes, args=(anomaly_labels,))
test_df['anomaly_encoding'] = test_anomaly_encoding.to_numpy().tolist()
test_df.head()



A Dataframe with 10824 entries has been loaded


Unnamed: 0_level_0,Date,Local Time Of Day,Locale Reference,State Reference,Relative Position.Angle.Radial,Relative Position.Distance.Nautical Miles,Altitude.AGL.Single Value,Altitude.MSL.Single Value,Flight Conditions,Weather Elements / Visibility,...,Result,Contributing Factors / Situations,Primary Problem,Narrative,Callback,Narrative.1,Callback.1,Synopsis,Year,anomaly_encoding
ACN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1014798,201206,0601-1200,SLC.Airport,UT,,,,11300.0,VMC,,...,General None Reported / Taken,Aircraft; Human Factors,Aircraft,Flying into SLC on the DELTA THREE RNAV arriva...,The Reporter stated that his aircraft is equip...,,,A CE750 Captain noted that his aircraft's FMS ...,2012,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1806744,202105,1201-1800,ORD.Airport,IL,,,,3900.0,,,...,Flight Crew FLC complied w / Automation / Advi...,Human Factors; Procedure; Airspace Structure; ...,Airspace Structure,ORD was on a very busy east flow arrival push....,,,,C90TRACON Controller reported they did not not...,2021,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1044902,201210,0001-0600,S46.TRACON,WA,,,,,,,...,General None Reported / Taken,ATC Equipment / Nav Facility / Buildings,ATC Equipment / Nav Facility / Buildings,B737-800 was vectored to an ILS Runway 16L app...,,,,S46 Controller expressed concern regarding the...,2012,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1764093,202009,0601-1200,ZZZ.Tower,US,,,400.0,,VMC,,...,Flight Crew Executed Go Around / Missed Approach,Human Factors; Procedure,Human Factors,We were on a 6 mile final when tower cleared a...,,While on about a six mile final tower cleared ...,,CRJ-200 flight crew reported failing to retrac...,2020,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
1786435,202102,1201-1800,ZZZ.ARTCC,US,,,,17000.0,,,...,Air Traffic Control Issued New Clearance; Flig...,Environment - Non Weather Related; Human Facto...,Environment - Non Weather Related,During Climb we Leveled at 17;000 departure sw...,,after copilot (pf) leved at 17000'; dfw depart...,,Air carrier First Officer reported an altitude...,2021,"[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [7]:
# Configuration and tokenizer
# Key variables used by the later dataset, loader, training and evaluation cells.
MAX_LEN = 384            # tokenizer max_length: narratives are padded / truncated to this length
TRAIN_BATCH_SIZE = 8     # batch size of the training DataLoader
VALID_BATCH_SIZE = 4     # batch size of the test DataLoader
EPOCHS = 1               # number of iterations of the training loop
LEARNING_RATE = 1e-5     # learning rate given to the Adam optimizer

# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint used by the model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
class CustomDataset(Dataset):
    """Dataset yielding tokenized narratives together with their label vectors.

    Reads the ``Narrative`` and ``anomaly_encoding`` columns of ``dataframe``.
    Each item is a dict of tensors with the keys ``ids``, ``mask``,
    ``token_type_ids`` (all ``torch.long``) and ``targets`` (``torch.float``).
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.narrative = dataframe.Narrative
        self.targets = self.data.anomaly_encoding

    def __len__(self):
        """Number of narratives in the dataset."""
        return len(self.narrative)

    def __getitem__(self, index):
        """Tokenize the narrative at position ``index`` and pair it with its targets."""
        # Coerce to str and collapse every run of whitespace into a single space.
        text = " ".join(str(self.narrative.iloc[index]).split())

        # Single-sequence encoding, padded / truncated to exactly ``max_len``.
        encoded = self.tokenizer(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )

        key_map = (
            ('ids', 'input_ids'),
            ('mask', 'attention_mask'),
            ('token_type_ids', 'token_type_ids'),
        )
        sample = {
            out_key: torch.tensor(encoded[enc_key], dtype=torch.long)
            for out_key, enc_key in key_map
        }
        sample['targets'] = torch.tensor(self.targets.iloc[index], dtype=torch.float)
        return sample


In [9]:
# Wrap both splits in CustomDataset objects; the DataLoaders are built from these.
print("TRAIN Dataset: {}".format(train_df.shape))
print("TEST Dataset: {}".format(test_df.shape))

training_set = CustomDataset(dataframe=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = CustomDataset(dataframe=test_df, tokenizer=tokenizer, max_len=MAX_LEN)

TRAIN Dataset: (97417, 97)
TEST Dataset: (10824, 97)


In [10]:
# DataLoader settings. Training batches are reshuffled every epoch.
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 8,
}

# Evaluation does not benefit from shuffling. Keeping the original order makes
# runs repeatable and keeps predictions aligned with the rows of the test frame.
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 8,
}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [11]:
# Creating the customized model: a dropout and a dense layer on top of the pretrained
# BERT encoder ('bert-base-uncased') produce the final output of the model.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Parameters
    ----------
    num_labels : int, default 15
        Size of the output layer. The default matches the 14 anomaly label
        prefixes plus the extra 'Other' category.
    dropout : float, default 0.3
        Dropout probability applied to the pooled encoder output.
    pretrained_name : str, default 'bert-base-uncased'
        Checkpoint name passed to ``BertModel.from_pretrained``.

    ``forward`` returns raw logits (no sigmoid) of shape
    ``(batch, num_labels)``. Pair it with a logits-based loss such as
    ``BCEWithLogitsLoss``.
    """

    def __init__(self, num_labels=15, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the encoder config rather than hard-coding 768,
        # so the head stays consistent if another BERT checkpoint is used.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the logits for a batch of token ids, attention masks and segment ids."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) feeds the classification head.
        _, pooled = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.l3(self.l2(pooled))

# Instantiate the classifier and move it to the selected device.
# The trailing bare name displays the module structure as the cell output.
model = BERTClass()
model = model.to(device)
model

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [12]:
# Multi-label objective computed directly on the model's raw logits.
loss_fn = torch.nn.BCEWithLogitsLoss()

# Adam over every model parameter, so the encoder is fine-tuned together with the head.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [13]:
def train(epoch):
    """Run one training pass over ``training_loader``.

    Uses the notebook-level ``model``, ``training_loader``, ``loss_fn``,
    ``optimizer`` and ``device``. The loss of every 1000th batch is printed
    together with an approximate count of samples processed so far.

    Parameters
    ----------
    epoch : int
        Epoch index; used only in the progress message.
    """
    model.train()
    size = len(training_loader.dataset)
    for batch, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        if batch % 1000 == 0:
            # Approximate: assumes every earlier batch had the size of this one.
            current = (batch + 1) * len(targets)
            print(f"Epoch: {epoch}, loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

        # Clear stale gradients once per step, right before backpropagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [14]:
# Run the training pass once per configured epoch.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, loss: 0.667684  [    8/97417]
Epoch: 0, loss: 0.312902  [ 8008/97417]
Epoch: 0, loss: 0.179383  [16008/97417]
Epoch: 0, loss: 0.192420  [24008/97417]
Epoch: 0, loss: 0.232439  [32008/97417]
Epoch: 0, loss: 0.166606  [40008/97417]
Epoch: 0, loss: 0.137630  [48008/97417]
Epoch: 0, loss: 0.127732  [56008/97417]
Epoch: 0, loss: 0.153474  [64008/97417]
Epoch: 0, loss: 0.184303  [72008/97417]
Epoch: 0, loss: 0.117610  [80008/97417]
Epoch: 0, loss: 0.089992  [88008/97417]
Epoch: 0, loss: 0.151542  [96008/97417]


In [15]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the current model.

    Puts the notebook-level ``model`` in eval mode and runs it without
    gradient tracking.

    Parameters
    ----------
    epoch : int
        Accepted for symmetry with ``train``; not used.

    Returns
    -------
    (fin_outputs, fin_targets) : tuple of list
        ``fin_outputs`` holds the sigmoid of the model output for each sample
        and ``fin_targets`` the matching target vectors, both as nested lists.
    """
    model.eval()
    fin_targets, fin_outputs = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [16]:
# Evaluate on the test split: threshold the sigmoid outputs at 0.5, then score them.
# NOTE(review): on multi-label targets, sklearn's accuracy_score is exact-match (subset) accuracy,
# which is why it is much lower than the F1 scores — confirm against the sklearn docs.
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.greater_equal(np.array(outputs), 0.5)

    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro, f1_score_macro = (
        metrics.f1_score(targets, outputs, average=averaging)
        for averaging in ('micro', 'macro')
    )

    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.4300628233555063
F1 Score (Micro) = 0.7901201058560087
F1 Score (Macro) = 0.6190128111792261
