<a href="https://colab.research.google.com/github/NedKost/MS-AAI-501-Team-Project/blob/main/Neural_NetFlight.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

import os

# Location of the flight CSV. The default is the original hard-coded local path;
# set the FLIGHT_DATA_CSV environment variable to load the file from somewhere
# else (another machine, Colab, CI) without editing this cell.
file_path = os.environ.get('FLIGHT_DATA_CSV', '/Users/anovayoungers/Downloads/flight_data.csv')

# Read the whole CSV into the `data` frame used by the cells below.
data = pd.read_csv(file_path)


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17412876 entries, 0 to 17412875
Data columns (total 37 columns):
 #   Column                           Dtype  
---  ------                           -----  
 0   Year                             int64  
 1   Quarter                          int64  
 2   Month                            int64  
 3   DayofMonth                       int64  
 4   DayOfWeek                        int64  
 5   FlightDate                       object 
 6   Reporting_Airline                object 
 7   Tail_Number                      object 
 8   Flight_Number_Reporting_Airline  int64  
 9   Origin                           object 
 10  Dest                             object 
 11  DepTime                          float64
 12  DepDelay                         float64
 13  TaxiOut                          float64
 14  WheelsOff                        float64
 15  WheelsOn                         float64
 16  TaxiIn                           float64
 17  CRSArr

In [None]:
# Count the distinct airport codes on each side of a flight
# (missing values are not counted, which is nunique's default).
unique_origins = data['Origin'].nunique(dropna=True)
unique_destinations = data['Dest'].nunique(dropna=True)

# Report both counts: origins first, then destinations.
print("Number of unique origin airports:", unique_origins)
print("Number of unique destination airports:", unique_destinations)


Number of unique origin airports: 378
Number of unique destination airports: 379


In [None]:
# Top ten origin and destination airports, ranked by number of rows (flights)
top_ten_origins = data['Origin'].value_counts().head(10).index
top_ten_destinations = data['Dest'].value_counts().head(10).index

# Categorize all other airports as 'Other' in new columns.
# isin()/where() work on the whole column at once; the result is the same as
# testing each value with a Python lambda, without one Python call per row.
data['Top_Origin'] = data['Origin'].where(data['Origin'].isin(top_ten_origins), 'Other')
data['Top_Dest'] = data['Dest'].where(data['Dest'].isin(top_ten_destinations), 'Other')

# Checking the distribution of the new columns
print("Top Origin Airports Distribution:\n", data['Top_Origin'].value_counts())
print("\nTop Destination Airports Distribution:\n", data['Top_Dest'].value_counts())


Top Origin Airports Distribution:
 Other    11570431
ATL        886549
DFW        782947
DEN        731279
ORD        691163
CLT        564569
LAX        489642
PHX        448145
SEA        443246
LAS        429452
MCO        375453
Name: Top_Origin, dtype: int64

Top Destination Airports Distribution:
 Other    11570832
ATL        886484
DFW        782869
DEN        731189
ORD        691113
CLT        564520
LAX        489595
PHX        448084
SEA        443211
LAS        429519
MCO        375460
Name: Top_Dest, dtype: int64


In [None]:
# Keep only flights whose origin AND destination are both top-ten airports,
# i.e. drop every row where either side was bucketed as 'Other'.
# NOTE(review): later cells in this notebook select from `data`, not from
# `filtered_data` - confirm whether this filter was meant to feed the model.
touches_other_airport = (data['Top_Origin'] == 'Other') | (data['Top_Dest'] == 'Other')
filtered_data = data[~touches_other_airport]

# Per-airport row counts after filtering
origin_distribution = filtered_data['Top_Origin'].value_counts()
destination_distribution = filtered_data['Top_Dest'].value_counts()
print("Filtered Top Origin Airports Distribution:\n", origin_distribution)
print("\nFiltered Top Destination Airports Distribution:\n", destination_distribution)


Filtered Top Origin Airports Distribution:
 LAX    145590
DEN    145335
ORD    127780
LAS    127311
ATL    127034
DFW    121895
PHX    117274
SEA    108540
MCO     86840
CLT     78146
Name: Top_Origin, dtype: int64

Filtered Top Destination Airports Distribution:
 LAX    145672
DEN    145207
ORD    127583
LAS    127282
ATL    127016
DFW    121917
PHX    117337
SEA    108586
MCO     86923
CLT     78222
Name: Top_Dest, dtype: int64


In [None]:
# Peek at the last five values of the target column (tail() defaults to 5 rows).
arr_del15 = data['ArrDel15']
print(arr_del15.tail(5))


17412871    0.0
17412872    0.0
17412873    0.0
17412874    0.0
17412875    NaN
Name: ArrDel15, dtype: float64


In [None]:
# Model inputs, with the target variable ('ArrDel15') listed last
features = [
    'Top_Origin', 'Top_Dest',
    'DepTime', 'DepDelay', 'Distance', 'ArrTime',
    'CarrierDelay', 'WeatherDelay',
    'Full-time', 'Part-time',
    'SecurityDelay',
    'ArrDel15',
]

# Restrict the frame to the selected features and the target
selected_data = data.loc[:, features]

# Discard every row that is missing a value in any selected column.
# NOTE(review): in the recorded outputs this takes the data from 17,412,876 rows
# to 2,813,734 - confirm which columns cause the loss and that both ArrDel15
# classes are still represented afterwards.
selected_data_cleaned = selected_data.dropna(axis=0, how='any')

# Report (rows, columns) of the cleaned dataset
print("Cleaned Dataset Shape:", selected_data_cleaned.shape)


Cleaned Dataset Shape: (2813734, 12)


In [None]:
# One-hot encode 'Top_Origin' and 'Top_Dest'.
# dtype is pinned to uint8 (the dtype shown in this notebook's recorded output)
# because newer pandas releases default get_dummies to bool; presumably a frame
# mixing bool and float columns is harder to hand to Keras - verify if upgrading.
one_hot_origin = pd.get_dummies(selected_data_cleaned['Top_Origin'], prefix='Origin', dtype='uint8')
one_hot_dest = pd.get_dummies(selected_data_cleaned['Top_Dest'], prefix='Dest', dtype='uint8')

# Replace the two categorical columns with their indicator columns.
# drop() returns a new frame, so re-running this cell always starts from
# `selected_data_cleaned` instead of mutating an earlier result in place.
selected_data_encoded = pd.concat(
    [
        selected_data_cleaned.drop(columns=['Top_Origin', 'Top_Dest']),
        one_hot_origin,
        one_hot_dest,
    ],
    axis=1,
)

# Check the first few rows of the dataset
print(selected_data_encoded.head())


    DepTime  DepDelay  Distance  ArrTime  CarrierDelay  WeatherDelay  \
14   2019.0      39.0    1133.0   2251.0           2.0           0.0   
23    854.0      -6.0     728.0   1201.0           0.0           0.0   
44   1311.0      56.0     728.0   1517.0          54.0           0.0   
79   1448.0      -7.0    1121.0   1917.0           0.0           0.0   
85   1542.0      57.0    1121.0   1841.0          31.0           0.0   

    Full-time  Part-time  SecurityDelay  ArrDel15  ...  Dest_CLT  Dest_DEN  \
14    16738.0     4860.0            0.0       1.0  ...         0         0   
23    16738.0     4860.0            0.0       1.0  ...         1         0   
44    16738.0     4860.0            0.0       1.0  ...         0         0   
79    16738.0     4860.0            0.0       1.0  ...         0         0   
85    16738.0     4860.0            0.0       1.0  ...         0         0   

    Dest_DFW  Dest_LAS  Dest_LAX  Dest_MCO  Dest_ORD  Dest_Other  Dest_PHX  \
14         0        

In [None]:
# Report (rows, columns) of the one-hot encoded frame.
encoded_shape = selected_data_encoded.shape
print("Encoded Dataset Shape:", encoded_shape)

Encoded Dataset Shape: (2813734, 32)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the model inputs
y = selected_data_encoded['ArrDel15']
X = selected_data_encoded.drop(columns='ArrDel15')

# Splitting the dataset into train, validation, and test sets (70%, 15%, 15%):
# first hold out 30% of the rows, then cut that hold-out in half.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.3
)
holdout_parts = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)
X_validation, X_test, y_validation, y_test = holdout_parts


In [None]:
# Column dtypes of the training inputs
print("Data Types:\n", X_train.dtypes)

# Distinct-value count for every column that is not stored as 'object'
print("\nUnique Values Count for Numerical Columns:")
non_object_columns = [name for name in X_train.columns if X_train[name].dtype != 'object']
for name in non_object_columns:
    distinct = X_train[name].nunique()
    print(f"{name}: {distinct} unique values")

# Summary statistics as produced by DataFrame.describe()
print("\nDescriptive Statistics for Numerical Columns:")
training_summary = X_train.describe()
print(training_summary)


Data Types:
 DepTime          float64
DepDelay         float64
Distance         float64
ArrTime          float64
CarrierDelay     float64
WeatherDelay     float64
Full-time        float64
Part-time        float64
SecurityDelay    float64
Origin_ATL         uint8
Origin_CLT         uint8
Origin_DEN         uint8
Origin_DFW         uint8
Origin_LAS         uint8
Origin_LAX         uint8
Origin_MCO         uint8
Origin_ORD         uint8
Origin_Other       uint8
Origin_PHX         uint8
Origin_SEA         uint8
Dest_ATL           uint8
Dest_CLT           uint8
Dest_DEN           uint8
Dest_DFW           uint8
Dest_LAS           uint8
Dest_LAX           uint8
Dest_MCO           uint8
Dest_ORD           uint8
Dest_Other         uint8
Dest_PHX           uint8
Dest_SEA           uint8
dtype: object

Unique Values Count for Numerical Columns:
DepTime: 1440 unique values
DepDelay: 1802 unique values
Distance: 1680 unique values
ArrTime: 1440 unique values
CarrierDelay: 1667 unique values
Weather

In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize; the one-hot indicator columns are left as-is
columns_to_scale = [
    'DepTime', 'DepDelay', 'Distance', 'ArrTime',
    'CarrierDelay', 'WeatherDelay',
    'Full-time', 'Part-time', 'SecurityDelay',
]

# Fit the scaler on the training set only, so the validation and test sets
# are transformed with the training statistics
scaler = StandardScaler()
scaler.fit(X_train[columns_to_scale])

# Overwrite the selected columns of each split with their standardized values
for split in (X_train, X_validation, X_test):
    split[columns_to_scale] = scaler.transform(split[columns_to_scale])


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# One input unit per column of the training frame
_, n_features = X_train.shape

# Baseline network: one 10-unit ReLU hidden layer feeding a single sigmoid output
model = Sequential([
    Dense(10, activation='relu', input_shape=(n_features,)),  # Hidden layer
    Dense(1, activation='sigmoid'),  # Output layer
])

# Binary classification setup: cross-entropy loss, accuracy as the tracked metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                320       
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 331 (1.29 KB)
Trainable params: 331 (1.29 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 10 epochs in mini-batches of 32, scoring the validation split
# after every epoch; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_validation, y_validation),
    batch_size=32,
    epochs=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# One input unit per column of the training frame
_, n_features = X_train.shape

# Two 10-unit ReLU hidden layers, each with an L2 kernel penalty (0.001) and
# followed by 50% dropout, feeding a single sigmoid output unit.
model = Sequential([
    # First hidden layer with L2 regularization
    Dense(10, activation='relu', input_shape=(n_features,), kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # Second hidden layer
    Dense(10, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # Output layer
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 10)                320       
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_3 (Dense)             (None, 10)                110       
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 11        
                                                                 
Total params: 441 (1.72 KB)
Trainable params: 441 (1.72 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Short 2-epoch training run in mini-batches of 32, scoring the validation
# split after each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_validation, y_validation),
    batch_size=32,
    epochs=2,
)

Epoch 1/2
Epoch 2/2


In [None]:
import os

# Location of the weather-augmented flight CSV. The default is the original
# hard-coded local path; set the FLIGHT_WEATHER_DATA_CSV environment variable to
# load the file from somewhere else without editing this cell.
file_path = os.environ.get(
    'FLIGHT_WEATHER_DATA_CSV',
    '/Users/anovayoungers/Downloads/flight_data_weather.csv',
)

weather_data = pd.read_csv(file_path)

# Preview the first rows
print(weather_data.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
weather_data.info()


     Year  Quarter  Month  DayofMonth  DayOfWeek  FlightDate  \
0  2020.0      1.0    1.0         1.0        3.0  2020-01-01   
1  2020.0      1.0    1.0         1.0        3.0  2020-01-01   
2  2020.0      1.0    1.0         1.0        3.0  2020-01-01   
3  2020.0      1.0    1.0         1.0        3.0  2020-01-01   
4  2020.0      1.0    1.0         1.0        3.0  2020-01-01   

  Reporting_Airline Tail_Number  Flight_Number_Reporting_Airline Origin  ...  \
0                AA      N407AN                            664.0    KOA  ...   
1                9E      N297PQ                           5270.0    LFT  ...   
2                WN      N963WN                           1395.0    CMH  ...   
3                AA      N891NN                            137.0    TUS  ...   
4                AA      N807AW                           2189.0    SAT  ...   

  Full-time  Part-time  Grand Total  Origin_Windspeed  Origin_Precip  \
0   95612.0    11840.0     107452.0               3.0         

In [None]:
# Selecting the relevant columns (model inputs plus the 'ArrDel15' target)
columns_to_use = ['Origin', 'Dest', 'DepTime', 'DepDelay', 'TaxiOut', 'WheelsOff',
                'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDel15',
                'CRSElapsedTime', 'ActualElapsedTime', 'AirTime', 'Distance',
                'Carrier', 'Full-time', 'Part-time', 'Origin_Windspeed',
                'Origin_Precip', 'Dest_Windspeed', 'Dest_Precip']

# Copy the dataset to avoid SettingWithCopyWarning
selected_data = weather_data[columns_to_use].copy()

# Top ten origin and destination airports, ranked by number of rows (flights)
top_ten_origins = selected_data['Origin'].value_counts().head(10).index
top_ten_destinations = selected_data['Dest'].value_counts().head(10).index

# Categorize all other airports as 'Other'.
# isin()/where() work on the whole column at once; the result is the same as
# testing each value with a Python lambda, without one Python call per row.
selected_data['Top_Origin'] = selected_data['Origin'].where(
    selected_data['Origin'].isin(top_ten_origins), 'Other')
selected_data['Top_Dest'] = selected_data['Dest'].where(
    selected_data['Dest'].isin(top_ten_destinations), 'Other')

# One-hot encode the top origin and destination airports with unique labels.
# dtype is pinned to uint8 so the indicator columns do not silently become bool
# on newer pandas releases, where the get_dummies default changed.
one_hot_origin = pd.get_dummies(selected_data['Top_Origin'], prefix='Weather_Origin', dtype='uint8')
one_hot_dest = pd.get_dummies(selected_data['Top_Dest'], prefix='Weather_Dest', dtype='uint8')

# Attach the indicator columns, then drop the raw airport columns and the two
# 'Other' indicators. drop() returns a new frame rather than mutating in place,
# so re-running this cell always rebuilds the result from `selected_data`.
selected_data_encoded = pd.concat(
    [selected_data, one_hot_origin, one_hot_dest], axis=1
).drop(
    columns=['Origin', 'Dest', 'Top_Origin', 'Top_Dest',
             'Weather_Origin_Other', 'Weather_Dest_Other']
)

# Separate features and target variable
X = selected_data_encoded.drop(columns='ArrDel15')
y = selected_data_encoded['ArrDel15']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the remaining 70% is the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Cut the hold-out in half: validation and test sets (each 15% of the total data)
holdout_parts = train_test_split(X_temp, y_temp, random_state=42, test_size=0.5)
X_validation, X_test, y_validation, y_test = holdout_parts


In [None]:
# List every non-'object' column that has more than 10 distinct values
print("Unique Values in Numerical Columns:")
for name in X_train.columns:
    column = X_train[name]
    if column.dtype == 'object':
        continue
    distinct = column.nunique()
    if distinct > 10:
        print(f"{name}: {distinct} unique values")

# Summary statistics as produced by DataFrame.describe()
print("\nDescriptive Statistics for Numerical Columns:")
training_summary = X_train.describe()
print(training_summary)



Unique Values in Numerical Columns:
DepTime: 1363 unique values
DepDelay: 1076 unique values
TaxiOut: 165 unique values
WheelsOff: 1363 unique values
WheelsOn: 1437 unique values
TaxiIn: 152 unique values
CRSArrTime: 1320 unique values
ArrTime: 1432 unique values
CRSElapsedTime: 535 unique values
ActualElapsedTime: 574 unique values
AirTime: 554 unique values
Distance: 1547 unique values
Full-time: 575 unique values
Part-time: 478 unique values
Origin_Windspeed: 42 unique values
Origin_Precip: 122 unique values
Dest_Windspeed: 46 unique values
Dest_Precip: 127 unique values

Descriptive Statistics for Numerical Columns:
             DepTime       DepDelay        TaxiOut      WheelsOff  \
count  240184.000000  240184.000000  240184.000000  240184.000000   
mean     1386.751732      28.845302      17.971459    1410.157858   
std       492.749936      74.977514      11.945688     495.322629   
min         1.000000     -60.000000       1.000000       1.000000   
25%      1003.000000      -

In [None]:
# Remove the 'Carrier' column from all three splits so they keep identical
# columns: training, then validation, then test.
X_train = X_train.drop(columns=['Carrier'])
X_validation = X_validation.drop(columns=['Carrier'])
X_test = X_test.drop(columns=['Carrier'])


In [None]:
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardize; the one-hot indicator columns are left as-is
columns_to_scale = [
    'DepTime', 'DepDelay', 'TaxiOut', 'WheelsOff', 'WheelsOn',
    'TaxiIn', 'CRSArrTime', 'ArrTime',
    'CRSElapsedTime', 'ActualElapsedTime',
    'AirTime', 'Distance', 'Full-time', 'Part-time',
    'Origin_Windspeed', 'Origin_Precip', 'Dest_Windspeed',
    'Dest_Precip',
]

# Fit the scaler on the training set only, so the validation and test sets
# are transformed with the training statistics
scaler = StandardScaler()
scaler.fit(X_train[columns_to_scale])

# Overwrite the selected columns of each split with their standardized values
for split in (X_train, X_validation, X_test):
    split[columns_to_scale] = scaler.transform(split[columns_to_scale])


In [None]:
# All three splits should report the same number of columns
print("Number of columns in Training set:", len(X_train.columns))
print("Number of columns in Validation set:", len(X_validation.columns))
print("Number of columns in Test set:", len(X_test.columns))

# Preview of the training rows
training_preview = X_train.head()
print("\nFirst few rows of the Training set:")
print(training_preview)

# describe() output for the training set, to eyeball the effect of scaling
training_summary = X_train.describe()
print("\nSummary Statistics of the Training set (for scaled columns):")
print(training_summary)

# Column dtypes
print("\nData types of the Training set columns:")
print(X_train.dtypes)

# First values of the training target
target_preview = y_train.head()
print("\nFirst few values of the target variable in Training set:")
print(target_preview)


Number of columns in Training set: 38
Number of columns in Validation set: 38
Number of columns in Test set: 38

First few rows of the Training set:
         DepTime  DepDelay   TaxiOut  WheelsOff  WheelsOn    TaxiIn  \
104854  1.352105 -0.424732 -0.499885   1.402810  1.300012 -0.519013   
55290   0.142564 -0.024611 -0.499885   0.199551  0.410600  1.831695   
147232 -0.082703 -0.438070 -0.165036  -0.016470  1.112865 -0.519013   
231940 -1.184684  0.455533  0.420951  -1.179351 -0.914253  1.048126   
73572  -0.715885 -0.211334  0.672088  -0.626175 -0.541812 -0.649608   

        CRSArrTime   ArrTime  CRSElapsedTime  ActualElapsedTime  ...  \
104854    1.382295  1.287247        0.085568          -0.028519  ...   
55290     0.367208  0.441516        0.378593           0.458618  ...   
147232    1.229832  1.102357        2.283250           1.776754  ...   
231940   -1.245699 -0.878338       -0.119549           0.071774  ...   
73572    -0.669947 -0.534187       -0.617690          -0.544311 

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# One input unit per column of the training frame
_, n_features = X_train.shape

# 128-unit and 64-unit ReLU hidden layers feeding a single sigmoid output unit
model = Sequential([
    Dense(128, activation='relu', input_shape=(n_features,)),  # First hidden layer
    Dense(64, activation='relu'),  # Second hidden layer
    Dense(1, activation='sigmoid'),  # Output layer
])

# Binary classification setup: cross-entropy loss, accuracy as the tracked metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer summary
model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 128)               4992      
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 13313 (52.00 KB)
Trainable params: 13313 (52.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping watches the validation loss with a patience of 5 epochs and
# restores the best weights it saw.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 20 epochs in mini-batches of 32 with the callback attached
fit_options = dict(
    epochs=20,
    batch_size=32,
    validation_data=(X_validation, y_validation),
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


The model above overfits heavily, so the next cell rebuilds it with dropout and L1/L2 regularization.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2

# Same 128/64 architecture as before, rebuilt with regularization:
# each hidden layer gets an L1+L2 kernel penalty (0.01 each) and 30% dropout.
# `n_features` is reused from the earlier model-definition cell.
model = Sequential([
    # First hidden layer with L1_L2 regularization and dropout
    Dense(128, activation='relu', input_shape=(n_features,), kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    Dropout(0.3),  # Adjusted dropout
    # Second hidden layer with regularization and dropout
    Dense(64, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
    Dropout(0.3),
    # Output layer remains the same
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 128)               4992      
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 64)                8256      
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 13313 (52.00 KB)
Trainable params: 13313 (52.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Early stopping watches the validation loss with a patience of 5 epochs and
# restores the best weights it saw.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for at most 20 epochs in mini-batches of 32 with the callback attached
fit_options = dict(
    epochs=20,
    batch_size=32,
    validation_data=(X_validation, y_validation),
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Score the trained model on the held-out test split; with the compile settings
# used in this notebook, evaluate() yields the loss followed by the accuracy.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation

# Accuracy is reported as a percentage
accuracy_percent = test_accuracy * 100
print("Test Loss: {:.4f}, Test Accuracy: {:.2f}%".format(test_loss, accuracy_percent))


Test Loss: 0.1575, Test Accuracy: 98.35%


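The model performs well on the held-out test set (test loss 0.1575, test accuracy 98.35%). Caveat: the inputs include `ArrTime` and `CRSArrTime`, which may reveal the arrival delay that `ArrDel15` encodes, so this score should be checked for target leakage before it is read as genuine predictive skill.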