## 3. Data Processing

### Necessary Imports and Setup

In [16]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Prints the frequency and percentage of values in a column
def printClassBalance(col):   
    class_counts = col.value_counts()
    class_percentages = col.value_counts(normalize=True) * 100
    print(class_counts)
    print(class_percentages)

In [4]:
# Read data
df = pd.read_csv('cleaned_data.csv')

features = ['Operating_Airline ', 'Dest', 'DayOfWeek', 'DayOfMonth']
labels = ['DepDel15']
df_features = df[features]
df_labels = df[labels]

In [5]:
printClassBalance(df_labels)

DepDel15
0           941911
1           106664
Name: count, dtype: int64
DepDel15
0           89.827719
1           10.172281
Name: proportion, dtype: float64


<div class="alert alert-block alert-warning">
<b>Data set is imbalanced: We observe that 89% of the flights are non-delayed and only 11% of flights are delayed.</b>
</div>


### Balancing data set via DownSampling (prune majority classes) 


In [6]:
delayed = df[df['DepDel15'] == 1]
not_delayed = df[df['DepDel15'] == 0]

# Downsample the majority class ('not_delayed') to match the minority class ('delayed')
not_delayed_downsampled = not_delayed.sample(n=len(delayed), random_state=42)

# Concatenate the downsampled majority class with the minority class
df_downsampled = pd.concat([not_delayed_downsampled, delayed])

# Shuffle the resulting DataFrame to randomize the order of rows
df_downsampled = df_downsampled.sample(frac=1, random_state=42).reset_index(drop=True)

printClassBalance(df_downsampled[labels])


DepDel15
0           106664
1           106664
Name: count, dtype: int64
DepDel15
0           50.0
1           50.0
Name: proportion, dtype: float64


<div class="alert alert-block alert-warning">
<b>The dataset is now balanced to 50% with 106k entries in delayed and non-delayed by pruning the non-delayed flights.</b>
</div>


### Balancing data set via RandomOversampling (duplicates minority classes)

In [7]:
ros = RandomOverSampler(random_state=1)
features_set, labels_set = ros.fit_resample(df_features, df_labels)

DepDel15
0           941911
1           941911
Name: count, dtype: int64
DepDel15
0           50.0
1           50.0
Name: proportion, dtype: float64


<div class="alert alert-block alert-warning">
<b>The dataset is now balanced to 50% with 941k entries in delayed and non-delayed by duplcating the delayed flights.</b>
</div>


### One-Hot Encoding Categorical Variables

In [49]:
# Define fake data
data = {
    'Destination Airport': ['JFK', 'LAX', 'ORD', 'DFW', 'SFO'],
    'Distance': [2000, 2500, 1800, 1500, 2800],
    'Delayed Flight (Y/N)': np.random.choice(['Y', 'N'], size=5)  # Generating random Y/N values
}

# Create DataFrame
df = pd.DataFrame(data)


f = ['Destination Airport', 'Distance']
c = ['Destination Airport']
l = ['Delayed Flight (Y/N)']

df_f = df[f]
df_categorical = df[c]
df_l = df[l]


combined = pd.concat([df_f, df_l], axis=1)
ohe = pd.get_dummies(df_categorical)
df_c = pd.concat([combined, ohe], axis=1).drop(columns=c)
print(df_c)

   Distance Delayed Flight (Y/N)  Destination Airport_DFW  \
0      2000                    Y                    False   
1      2500                    N                    False   
2      1800                    N                    False   
3      1500                    N                     True   
4      2800                    Y                    False   

   Destination Airport_JFK  Destination Airport_LAX  Destination Airport_ORD  \
0                     True                    False                    False   
1                    False                     True                    False   
2                    False                    False                     True   
3                    False                    False                    False   
4                    False                    False                    False   

   Destination Airport_SFO  
0                    False  
1                    False  
2                    False  
3                    False  
4      

In [13]:







print(df_features.shape)
print(df_labels.shape)
# One-hot encode 'category' column
one_hot_encoded = pd.get_dummies(features_set)

# Concatenate one-hot encoded features with original DataFrame
df_combined = pd.concat([df[features + labels], one_hot_encoded], axis=1).drop(columns=features)

print(df_combined.shape)

(1048575, 4)
(1048575, 1)
(1883822, 392)


## Exporting CSV

In [9]:
df_combined.to_csv('processed_data_upsampled.csv', index=False)