In [1]:
import pandas as pd
import os

### Preprocessing

In [2]:
# target class -> convert the DEP_DELAY to binary: 0-not delayed, 1-delayed
def convert_to_binary(delay):
    if delay <= 0:
        return 0
    else:
        return 1

In [3]:
# data cleaning/reduction/transformation
def clean_data(file_path):
    df = pd.read_csv(file_path)

    df = df[df['CANCELLED'] == 0]
    df.reset_index(drop=True, inplace=True)

    df.drop(['ORIGIN_AIRPORT_SEQ_ID', 'DEST_AIRPORT_SEQ_ID', 'DEP_DEL15', 'WHEELS_OFF', 'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'FLIGHTS', 'DISTANCE_GROUP',
             'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY'], axis=1, inplace=True)

    df['DEP_DELAY'] = df['DEP_DELAY'].apply(convert_to_binary)

    # Get the year from the file name
    year = os.path.basename(file_path).split(".")[0]

    # Save the cleaned DataFrame to a new CSV file
    output_path = os.path.join("data", "preprocessing", f"data_cleaning{year}.csv")
    df.to_csv(output_path, index=False)
    print(f"Data cleaning for {year} is completed.")

In [4]:
directory = "Data"
for file_name in os.listdir(directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(directory, file_name)
        clean_data(file_path)

Data cleaning for 2020 is completed.
Data cleaning for 2021 is completed.


  df = pd.read_csv(file_path)


Data cleaning for 2022 is completed.
Data cleaning for 2019 is completed.
Data cleaning for 2018 is completed.
Data cleaning for 2017 is completed.


In [5]:
# check for missing values
def check_missing_values(file_path):
    df = pd.read_csv(file_path)
    # Check for missing values in each column
    missing_values = df.isna().sum()
    # Print columns with missing values, if any
    columns_with_missing_values = missing_values[missing_values > 0].index
    if not columns_with_missing_values.empty:
        print(f"File: {file_path}")
        print("Columns with missing values:")
        for column in columns_with_missing_values:
            missing_values_count = missing_values[column]
            print(f"{column}: {missing_values_count} missing value(s)")
        # Remove rows with missing values for any column
        df.dropna(inplace=True)
        df.reset_index(drop=True, inplace=True)
        print(f"Number of rows after removing missing values: {len(df)}")
        print()

        df.to_csv(file_path, index=False)  # Overwrite the original file
    else:
        print(f"File: {file_path} has no missing values in any column.")

# List all CSV files in the directory
directory = "data/preprocessing"
for file_name in os.listdir(directory):
    if file_name.endswith(".csv"):
        file_path = os.path.join(directory, file_name)
        check_missing_values(file_path)


File: data/preprocessing/data_cleaning2017.csv
Columns with missing values:
AIR_TIME: 989 missing value(s)
Number of rows after removing missing values: 457892

File: data/preprocessing/data_cleaning2019.csv
Columns with missing values:
AIR_TIME: 1358 missing value(s)
Number of rows after removing missing values: 618612

File: data/preprocessing/data_cleaning2018.csv
Columns with missing values:
AIR_TIME: 1353 missing value(s)
Number of rows after removing missing values: 585737

File: data/preprocessing/data_cleaning2022.csv
Columns with missing values:
AIR_TIME: 1298 missing value(s)
Number of rows after removing missing values: 525215

File: data/preprocessing/data_cleaning2020.csv
Columns with missing values:
AIR_TIME: 567 missing value(s)
Number of rows after removing missing values: 366940

File: data/preprocessing/data_cleaning2021.csv
Columns with missing values:
AIR_TIME: 1373 missing value(s)
Number of rows after removing missing values: 537183



### KNN

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [9]:
years = range(2017, 2022)
dfs = []
for year in years:
    df = pd.read_csv(f"data/preprocessing/data_cleaning{year}.csv")
    dfs.append(df)
combined_df = pd.concat(dfs, ignore_index=True)
X = df.drop(columns=['DEP_DELAY'])
y = df['DEP_DELAY']  # Target variable

# 80% train, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of KNN classifier:", accuracy)

Accuracy of KNN classifier: 0.7782514403790128
