## Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the preprocessed flight dataset from disk and preview the first rows.
DATA_PATH = 'data/5guys_flight_data_preprocessed.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,TAIL_NUM,ORIGIN,DEST,DEP_DEL15,CRS_ARR_TIME,DISTANCE,CARRIER_NAME,...,PREVIOUS_AIRPORT,AWND,PRCP,TAVG,WDF2,WSF2,WT03,WT08,DEL_ORIGIN_COUNT,WT_FOG
0,1,28,1,182,14,12,1.0,1306,1652.0,15,...,53,12.75,0.53,18.0,300.0,25.9,0.0,1.0,74377.0,1
1,1,28,1,2365,14,3,1.0,1200,888.0,15,...,53,12.75,0.53,18.0,300.0,25.9,0.0,1.0,74377.0,1
2,1,28,1,1481,14,6,1.0,853,925.0,15,...,53,12.75,0.53,18.0,300.0,25.9,0.0,1.0,74377.0,1
3,1,28,1,2401,14,3,0.0,1040,888.0,15,...,53,12.75,0.53,18.0,300.0,25.9,0.0,1.0,74377.0,1
4,1,28,1,2941,14,12,1.0,1435,2072.0,15,...,53,12.75,0.53,18.0,300.0,25.9,0.0,1.0,74377.0,1


## Baseline Performance

Simple Models (Logistic Regression and Decision Tree)

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2, so this
# import only works on older releases. It is kept because later cells may use it;
# migrate to ConfusionMatrixDisplay when upgrading scikit-learn.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix, roc_curve, auc
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler

# Binary target column (the preview above shows values 0.0 / 1.0).
target = 'DEP_DEL15'  # You can choose another relevant column for binary classification

# Rows without a label cannot be used for supervised training or evaluation.
df = df.dropna(subset=[target])

# Use every column as a feature except the target itself and any 'Unnamed: N'
# column. 'Unnamed: 0' appears to be the row index written by an earlier
# DataFrame.to_csv call (it counts 0, 1, 2, ... in the preview); it carries no
# information about a flight and would let a tree memorise row order.
features = [
    col for col in df.columns
    if col != target and not str(col).startswith('Unnamed')
]

# Separate the feature matrix from the label vector
X = df[features]
y = df[target]

# Hold out 10% of the rows for testing; stratify so that the test set keeps the
# original class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Perform undersampling on the training data only, so the test set still
# reflects the real class imbalance.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Standardize the data. The scaler is fitted on the (resampled) training data
# only and then applied to the test data, so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression as a simple model
simple_model_lr = LogisticRegression(random_state=42)
simple_model_lr.fit(X_train_scaled, y_train_resampled)
y_pred_lr = simple_model_lr.predict(X_test_scaled)

# Decision Tree as a simple model
simple_model_dt = DecisionTreeClassifier(random_state=42)
simple_model_dt.fit(X_train_scaled, y_train_resampled)
y_pred_dt = simple_model_dt.predict(X_test_scaled)

# Evaluate Logistic Regression
accuracy_lr = accuracy_score(y_test, y_pred_lr)
confusion_lr = confusion_matrix(y_test, y_pred_lr)
report_lr = classification_report(y_test, y_pred_lr)

# Evaluate Decision Tree
accuracy_dt = accuracy_score(y_test, y_pred_dt)
confusion_dt = confusion_matrix(y_test, y_pred_dt)
report_dt = classification_report(y_test, y_pred_dt)

print('Logistic Regression Metrics:')
print(f'Accuracy: {accuracy_lr}')
print('Confusion Matrix:')
print(confusion_lr)
print('Classification Report:')
print(report_lr)

print('\nDecision Tree Metrics:')
print(f'Accuracy: {accuracy_dt}')
print('Confusion Matrix:')
print(confusion_dt)
print('Classification Report:')
print(report_dt)


Logistic Regression Metrics:
Accuracy: 0.590354212896038
Confusion Matrix:
[[255204 185207]
 [ 37245  65379]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.87      0.58      0.70    440411
         1.0       0.26      0.64      0.37    102624

    accuracy                           0.59    543035
   macro avg       0.57      0.61      0.53    543035
weighted avg       0.76      0.59      0.63    543035


Decision Tree Metrics:
Accuracy: 0.5953759886563481
Confusion Matrix:
[[261990 178421]
 [ 41304  61320]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.86      0.59      0.70    440411
         1.0       0.26      0.60      0.36    102624

    accuracy                           0.60    543035
   macro avg       0.56      0.60      0.53    543035
weighted avg       0.75      0.60      0.64    543035



Random Model (Evaluating Random Baseline)

In [4]:
import random

# Generate random predictions (a fair coin flip per test row).
# A dedicated, seeded generator keeps the reported metrics reproducible across
# "Restart & Run All" without touching the global `random` state.
random_generator = random.Random(42)
y_pred_random = random_generator.choices([0, 1], k=len(y_test))

# Evaluate random baseline
accuracy_random = accuracy_score(y_test, y_pred_random)
confusion_random = confusion_matrix(y_test, y_pred_random)
report_random = classification_report(y_test, y_pred_random)

print('Random Baseline Metrics:')
print(f'Accuracy: {accuracy_random}')
print('Confusion Matrix:')
print(confusion_random)
print('Classification Report:')
print(report_random)


Random Baseline Metrics:
Accuracy: 0.4993121990295285
Confusion Matrix:
[[220088 220323]
 [ 51568  51056]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      0.50      0.62    440411
         1.0       0.19      0.50      0.27    102624

    accuracy                           0.50    543035
   macro avg       0.50      0.50      0.45    543035
weighted avg       0.69      0.50      0.55    543035



Majority Class Baseline

In [5]:
# Majority class baseline: always predict the most frequent training label.
# The frequency must come from the original training labels (y_train).
# y_train_resampled was balanced by RandomUnderSampler, so both classes have the
# same count there and idxmax() would only return an arbitrary tie-break.
majority_class = y_train.value_counts().idxmax()
y_pred_majority = [majority_class] * len(y_test)

# Evaluate majority class baseline
accuracy_majority = accuracy_score(y_test, y_pred_majority)
confusion_majority = confusion_matrix(y_test, y_pred_majority)
# The minority class is never predicted, so its precision is undefined.
# zero_division=0 reports it as 0.0 without emitting UndefinedMetricWarning.
report_majority = classification_report(y_test, y_pred_majority, zero_division=0)

print('Majority Class Baseline Metrics:')
print(f'Accuracy: {accuracy_majority}')
print('Confusion Matrix:')
print(confusion_majority)
print('Classification Report:')
print(report_majority)


  _warn_prf(average, modifier, msg_start, len(result))


Majority Class Baseline Metrics:
Accuracy: 0.8110177060410471
Confusion Matrix:
[[440411      0]
 [102624      0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.81      1.00      0.90    440411
         1.0       0.00      0.00      0.00    102624

    accuracy                           0.81    543035
   macro avg       0.41      0.50      0.45    543035
weighted avg       0.66      0.81      0.73    543035



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
