In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.model_selection import train_test_split

# Prepare data

In [None]:
# Load the flight records from the Excel workbook stored on the mounted Drive.
flights = pd.read_excel("/content/drive/MyDrive/Kho dữ liệu và OLAP/US_flights_2023_500k.xlsx")
# Optional: uncomment to work on the first 50,000 rows only for faster iteration.
# flights = flights.head(50000)

In [None]:
# Airport reference table (IATA code, airport name, city, state, country,
# latitude, longitude). It is joined twice: once on the departure airport and
# once on the arrival airport.
dep_airports = pd.read_excel("/content/drive/MyDrive/Kho dữ liệu và OLAP/airports_geolocation.xlsx")
# The arrival lookup is the very same table, so copy the frame that is already
# in memory instead of parsing the workbook a second time.
arr_airports = dep_airports.copy()

In [None]:
# Load the per-airport weather observations (one row per `time` / `airport_id`
# pair is assumed by the joins below — TODO confirm uniqueness).
weather = pd.read_excel("/content/drive/MyDrive/Kho dữ liệu và OLAP/weather_meteo_by_airport.xlsx")

In [None]:
# Inspect the columns available in the flight table.
flights.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Flight_Duration', 'Distance_type', 'Delay_Carrier',
       'Delay_Weather', 'Delay_NAS', 'Delay_Security', 'Delay_LastAircraft',
       'Manufacturer', 'Model', 'Aicraft_age'],
      dtype='object')

In [None]:
# Inspect the columns available in the airport reference table.
dep_airports.columns

Index(['IATA_CODE', 'AIRPORT', 'CITY', 'STATE', 'COUNTRY', 'LATITUDE',
       'LONGITUDE'],
      dtype='object')

In [None]:
# Order flights by route (departure airport, then arrival airport) and show
# the route together with its distance category.
route_columns = ['Dep_Airport', 'Arr_Airport']
sorted_df = flights.sort_values(by=route_columns)
sorted_df[route_columns + ['Distance_type']]

Unnamed: 0,Dep_Airport,Arr_Airport,Distance_type
11539,ABE,ATL,Short Haul >1500Mi
28688,ABE,ATL,Short Haul >1500Mi
30661,ABE,ATL,Short Haul >1500Mi
32758,ABE,ATL,Short Haul >1500Mi
33549,ABE,ATL,Short Haul >1500Mi
...,...,...,...
465551,YUM,PHX,Short Haul >1500Mi
467363,YUM,PHX,Short Haul >1500Mi
473294,YUM,PHX,Short Haul >1500Mi
476324,YUM,PHX,Short Haul >1500Mi


## Merge airport info

In [None]:
# Attach the airport reference columns to each flight by matching the flight's
# departure airport code against IATA_CODE. Inner join: flights whose departure
# code is absent from the reference table are dropped.
merged_dep_airport = flights.merge(
    dep_airports,
    how='inner',
    left_on='Dep_Airport',
    right_on='IATA_CODE',
)
merged_dep_airport.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Flight_Duration', 'Distance_type', 'Delay_Carrier',
       'Delay_Weather', 'Delay_NAS', 'Delay_Security', 'Delay_LastAircraft',
       'Manufacturer', 'Model', 'Aicraft_age', 'IATA_CODE', 'AIRPORT', 'CITY',
       'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [None]:
# Prefix the airport attributes brought in by the join with `dep_` so they do
# not collide with the arrival-side attributes added later.
col_mapping = {
    airport_col: f'dep_{airport_col.lower()}'
    for airport_col in ['CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE']
}

# Remove the flight duration, the per-cause delay columns and the join key /
# airport name columns, then apply the `dep_` renaming.
merged_dep_airport = (
    merged_dep_airport
    .drop(columns=[
        'Flight_Duration',
        'Delay_Carrier',
        'Delay_Weather',
        'Delay_NAS',
        'Delay_Security',
        'Delay_LastAircraft',
        'IATA_CODE',
        'AIRPORT',
    ])
    .rename(columns=col_mapping)
)

In [None]:
# Check the column set after dropping and renaming the departure-airport columns.
merged_dep_airport.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Distance_type', 'Manufacturer', 'Model',
       'Aicraft_age', 'dep_city', 'dep_state', 'dep_country', 'dep_latitude',
       'dep_longitude'],
      dtype='object')

In [None]:
# Attach the airport reference columns a second time, now keyed on the
# arrival airport code. Inner join: unmatched arrival codes drop the flight.
merged_arr_airport = merged_dep_airport.merge(
    arr_airports,
    how='inner',
    left_on='Arr_Airport',
    right_on='IATA_CODE',
)
merged_arr_airport.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Distance_type', 'Manufacturer', 'Model',
       'Aicraft_age', 'dep_city', 'dep_state', 'dep_country', 'dep_latitude',
       'dep_longitude', 'IATA_CODE', 'AIRPORT', 'CITY', 'STATE', 'COUNTRY',
       'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [None]:
# Prefix the second batch of airport attributes with `arr_`.
col_mapping = {
    airport_col: f'arr_{airport_col.lower()}'
    for airport_col in ['CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE']
}

# Drop the join key and airport name, then apply the `arr_` renaming.
merged_arr_airport = (
    merged_arr_airport
    .drop(columns=['IATA_CODE', 'AIRPORT'])
    .rename(columns=col_mapping)
)
merged_arr_airport.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Distance_type', 'Manufacturer', 'Model',
       'Aicraft_age', 'dep_city', 'dep_state', 'dep_country', 'dep_latitude',
       'dep_longitude', 'arr_city', 'arr_state', 'arr_country', 'arr_latitude',
       'arr_longitude'],
      dtype='object')

## Merge weather info

In [None]:
# Inspect the columns available in the weather table.
weather.columns

Index(['time', 'tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres',
       'airport_id'],
      dtype='object')

In [None]:
# Join weather observations for the departure airport on the flight date.
# Inner join: flights without a matching (date, airport) weather row are dropped.
merged_dep_weather = merged_arr_airport.merge(
    weather,
    how='inner',
    left_on=['FlightDate', 'Dep_Airport'],
    right_on=['time', 'airport_id'],
)

In [None]:
# Prefix every weather measurement with `dep_` to mark it as departure-side.
col_mapping = {
    measure: f'dep_{measure}'
    for measure in ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
}

# The weather join keys duplicate FlightDate / Dep_Airport, so remove them
# before renaming.
merged_dep_weather = (
    merged_dep_weather
    .drop(columns=['time', 'airport_id'])
    .rename(columns=col_mapping)
)
merged_dep_weather.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Distance_type', 'Manufacturer', 'Model',
       'Aicraft_age', 'dep_city', 'dep_state', 'dep_country', 'dep_latitude',
       'dep_longitude', 'arr_city', 'arr_state', 'arr_country', 'arr_latitude',
       'arr_longitude', 'dep_tavg', 'dep_tmin', 'dep_tmax', 'dep_prcp',
       'dep_snow', 'dep_wdir', 'dep_wspd', 'dep_pres'],
      dtype='object')

In [None]:
# Join weather observations for the arrival airport on the flight date.
# Inner join: flights without a matching (date, airport) weather row are dropped.
merged_arr_weather = merged_dep_weather.merge(
    weather,
    how='inner',
    left_on=['FlightDate', 'Arr_Airport'],
    right_on=['time', 'airport_id'],
)

In [None]:
# Prefix every weather measurement with `arr_` to mark it as arrival-side.
col_mapping = {
    measure: f'arr_{measure}'
    for measure in ['tavg', 'tmin', 'tmax', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
}

# The weather join keys duplicate FlightDate / Arr_Airport, so remove them
# before renaming.
merged_arr_weather = (
    merged_arr_weather
    .drop(columns=['time', 'airport_id'])
    .rename(columns=col_mapping)
)
merged_arr_weather.columns

Index(['FlightDate', 'Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport',
       'Dep_CityName', 'DepTime_label', 'Dep_Delay', 'Dep_Delay_Tag',
       'Dep_Delay_Type', 'Arr_Airport', 'Arr_CityName', 'Arr_Delay',
       'Arr_Delay_Type', 'Distance_type', 'Manufacturer', 'Model',
       'Aicraft_age', 'dep_city', 'dep_state', 'dep_country', 'dep_latitude',
       'dep_longitude', 'arr_city', 'arr_state', 'arr_country', 'arr_latitude',
       'arr_longitude', 'dep_tavg', 'dep_tmin', 'dep_tmax', 'dep_prcp',
       'dep_snow', 'dep_wdir', 'dep_wspd', 'dep_pres', 'arr_tavg', 'arr_tmin',
       'arr_tmax', 'arr_prcp', 'arr_snow', 'arr_wdir', 'arr_wspd', 'arr_pres'],
      dtype='object')

## Extract day, month, year

In [None]:
# Break the '-'-separated FlightDate string into its pieces once, then store
# the first three pieces as integer Year / Month / Day columns.
date_parts = merged_arr_weather['FlightDate'].str.split('-')
for position, part_name in enumerate(['Year', 'Month', 'Day']):
    merged_arr_weather[part_name] = date_parts.str[position].astype(int)

# The original date string is no longer needed once it has been decomposed.
merged_arr_weather.drop(columns=['FlightDate'], inplace=True)

In [None]:
# Remove the city-name columns that came with the flight table; the joined
# airport table already contributed dep_city / arr_city.
merged_arr_weather.drop(columns=['Dep_CityName', 'Arr_CityName'], inplace=True)

In [None]:
# Final column set of the merged flight / airport / weather table.
merged_arr_weather.columns

Index(['Day_Of_Week', 'Airline', 'Tail_Number', 'Dep_Airport', 'DepTime_label',
       'Dep_Delay', 'Dep_Delay_Tag', 'Dep_Delay_Type', 'Arr_Airport',
       'Arr_Delay', 'Arr_Delay_Type', 'Distance_type', 'Manufacturer', 'Model',
       'Aicraft_age', 'dep_city', 'dep_state', 'dep_country', 'dep_latitude',
       'dep_longitude', 'arr_city', 'arr_state', 'arr_country', 'arr_latitude',
       'arr_longitude', 'dep_tavg', 'dep_tmin', 'dep_tmax', 'dep_prcp',
       'dep_snow', 'dep_wdir', 'dep_wspd', 'dep_pres', 'arr_tavg', 'arr_tmin',
       'arr_tmax', 'arr_prcp', 'arr_snow', 'arr_wdir', 'arr_wspd', 'arr_pres',
       'Year', 'Month', 'Day'],
      dtype='object')

# Classification

In [None]:
!pip install catboost -qq
from catboost import CatBoostClassifier

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Work on a copy so the merged table stays untouched by the modelling steps.
data = merged_arr_weather.copy()
# Summarize dtypes and non-null counts per column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 44 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Day_Of_Week     500000 non-null  int64  
 1   Airline         500000 non-null  object 
 2   Tail_Number     500000 non-null  object 
 3   Dep_Airport     500000 non-null  object 
 4   DepTime_label   500000 non-null  object 
 5   Dep_Delay       500000 non-null  int64  
 6   Dep_Delay_Tag   500000 non-null  int64  
 7   Dep_Delay_Type  500000 non-null  object 
 8   Arr_Airport     500000 non-null  object 
 9   Arr_Delay       500000 non-null  int64  
 10  Arr_Delay_Type  500000 non-null  object 
 11  Distance_type   500000 non-null  object 
 12  Manufacturer    500000 non-null  object 
 13  Model           500000 non-null  object 
 14  Aicraft_age     500000 non-null  int64  
 15  dep_city        500000 non-null  object 
 16  dep_state       500000 non-null  object 
 17  dep_countr

In [None]:
# Collect the distinct values of every column in `data`, keyed by column name,
# and print them as a Python-dict-like listing.
# NOTE(review): despite the name, this covers ALL columns (numeric ones too),
# and the printed listing is very large for high-cardinality columns.
column_names = data.columns
categorical_variables = {}

for col in column_names:
    column = data[col]
    # `pd.api.types.is_categorical_dtype` is deprecated; checking the dtype
    # against CategoricalDtype is the supported equivalent.
    if isinstance(column.dtype, pd.CategoricalDtype):
        # Categorical column: report the declared categories.
        categorical_variables[col] = column.cat.categories.tolist()
    else:
        # Any other dtype: report the values in order of first appearance.
        categorical_variables[col] = column.unique().tolist()

# Print the formatted output
print("categorical_variables = {")
for col, values in categorical_variables.items():
    print(f'    "{col}": {values},')
print("}")

categorical_variables = {
    "Day_Of_Week": [4, 5, 6, 1, 2, 7, 3],
    "Airline": ['Delta Air Lines Inc', 'Alaska Airlines Inc.', 'Southwest Airlines Co.', 'Endeavor Air', 'Spirit Air Lines', 'United Air Lines Inc.', 'Skywest Airlines Inc.', 'American Eagle Airlines Inc.', 'American Airlines Inc.', 'JetBlue Airways', 'Hawaiian Airlines Inc.', 'Allegiant Air', 'Frontier Airlines Inc.', 'PSA Airlines', 'Republic Airways'],
    "Tail_Number": ['N329NW', 'N263AK', 'N224WN', 'N7746C', 'N971AT', 'N944WN', 'N304PQ', 'N924AT', 'N328DN', 'N681NK', 'N593NW', 'N112DU', 'N37409', 'N309SY', 'N358DN', 'N388DN', 'N830DN', 'N585NW', 'N273NN', 'N333NB', 'N365NW', 'N368NB', 'N315NB', 'N907XJ', 'N712EV', 'N181GJ', 'N374NW', 'N247SY', 'N341NW', 'N136EV', 'N546AS', 'N505SY', 'N38454', 'N8553W', 'N8631A', 'N653NK', 'N67812', 'N68452', 'N339DN', 'N135NN', 'N526DE', 'N111ZM', 'N296SY', 'N755US', 'N936AN', 'N360DN', 'N3733Z', 'N510DE', 'N556JB', 'N188US', 'N8725L', 'N118NN', 'N3773D', 'N3751B', 'N481WN', 'N78

In [None]:
# Separate working copy for the classification experiment.
df1 = data.copy()

In [None]:
# Replace the Dep_Delay_Type labels with integer codes.
# NOTE(review): this column is removed from the feature matrix when X is built
# from df1, so the encoded values do not feed the models — confirm whether the
# encoding is still needed.
label_encoder = LabelEncoder()
df1['Dep_Delay_Type'] = label_encoder.fit_transform(df1['Dep_Delay_Type'])

In [None]:
# The target is the departure-delay flag; the remaining delay columns are
# excluded from the features along with it.
target_column = 'Dep_Delay_Tag'
non_feature_columns = [
    'Dep_Delay',
    target_column,
    'Dep_Delay_Type',
    'Arr_Delay',
    'Arr_Delay_Type',
]

y = df1[target_column]
X = df1.drop(columns=non_feature_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Everything that is not object-typed counts as numerical.
numerical_columns = X.select_dtypes(exclude='object').columns
# Object-typed columns are the categorical ones; also record their positional
# indices within the training frame.
categorical_columns = X_train.select_dtypes(include='object').columns
categorical_column_indices = list(map(X_train.columns.get_loc, categorical_columns))

In [None]:
# Make every categorical column purely string-valued, with missing entries
# represented by the literal 'NaN'.
# The fill must happen BEFORE the cast: `astype(str)` turns a missing value
# into the string 'nan', after which `fillna` has nothing left to replace.
for col_index in categorical_column_indices:
    for split_frame in (X_train, X_test):
        split_frame.iloc[:, col_index] = (
            split_frame.iloc[:, col_index].fillna('NaN').astype(str)
        )

In [None]:
# Initialize an empty dataframe to store results
results_df = pd.DataFrame(columns=['Model', 'Classification_Report'])

# Models trained on the label-encoded feature matrix.
# The randomized estimators get the same seed (42) already used for the
# train/test split, so repeated runs produce the same reports.
models_with_categorical = [
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(max_depth=10, random_state=42),
    GradientBoostingClassifier(max_depth=10, random_state=42),
    XGBClassifier(max_depth=10, random_state=42),
    # SVC(),
    CatBoostClassifier(cat_features=categorical_column_indices)
]

# Models intended for the numerical columns only.
# NOTE(review): Lasso is a regressor, not a classifier; its continuous output
# has to be thresholded before it can be scored as a class prediction.
models_without_categorical = [
    LogisticRegression(),
    RidgeClassifier(),
    Lasso(),
    KNeighborsClassifier()
]

In [None]:
# for model in models_without_categorical:
#     # Fit the model
#     model.fit(X_train[numerical_columns].fillna(X[numerical_columns].mean()), y_train)

#     # Predict on the test set
#     if type(model).__name__=='Lasso':
#         y_pred = model.predict(X_test[numerical_columns].fillna(X[numerical_columns].mean()))
#         y_pred = (y_pred >= 0.5).astype(int)  # Round probabilities to 0 or 1
#     else:
#         y_pred = model.predict(X_test[numerical_columns].fillna(X[numerical_columns].mean()))

#     # Generate classification report
#     clf_report = classification_report(y_test, y_pred)
#     print('Model:', type(model).__name__)
#     print(clf_report)

In [None]:
# Encode input: replace each categorical (object-typed) feature with integer
# codes, using a fresh LabelEncoder per column.
# The encoders are fitted on the full feature matrix X (before splitting), so
# every label seen in the test rows has a code.
X_encoded = X.copy()
for feature in categorical_columns:
    label_encoder = LabelEncoder()
    X_encoded[feature] = label_encoder.fit_transform(X_encoded[feature])

# Re-split with the same test_size and random_state as the earlier split of X;
# y_train / y_test are overwritten with the labels from this split.
X_train_encoded, X_test_encoded, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN

# Two oversamplers are constructed with the same seed; only `adasyn` is
# actually applied in this cell.
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

# Apply ADASYN (not SMOTE) to the training data only; the test split is left
# as is. The resampled data overwrites X_train_encoded / y_train, so re-running
# this cell resamples the already-resampled data.
# NOTE(review): the categorical features are integer label codes at this point
# and are resampled like ordinary numeric features — confirm this is intended.
X_train_encoded, y_train = adasyn.fit_resample(X_train_encoded, y_train)

In [None]:
# Class balance of the training labels after oversampling.
y_train.value_counts()

Dep_Delay_Tag
1    253774
0    247961
Name: count, dtype: int64

In [None]:
# Train every model on the oversampled, label-encoded training data and print
# a classification report for both the held-out test set and the training set.
for model in models_with_categorical:
    # Fit the model
    model.fit(X_train_encoded, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test_encoded)

    # Generate classification report
    classification_rep = classification_report(y_test, y_pred)

    # Print the test-set report (nothing is stored in results_df here)
    print('Model:', type(model).__name__)
    print(classification_rep)

    # Make predictions on the (oversampled) training set to expose overfitting
    y_train_pred = model.predict(X_train_encoded)

    # Print the classification report
    print("Classification Report on Train Set:")
    print(classification_report(y_train, y_train_pred))

Model: DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.68      0.64      0.66     62245
           1       0.46      0.51      0.48     37755

    accuracy                           0.59    100000
   macro avg       0.57      0.58      0.57    100000
weighted avg       0.60      0.59      0.60    100000

Classification Report on Train Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    247961
           1       1.00      1.00      1.00    253774

    accuracy                           1.00    501735
   macro avg       1.00      1.00      1.00    501735
weighted avg       1.00      1.00      1.00    501735

Model: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.73      0.71      0.72     62245
           1       0.54      0.56      0.55     37755

    accuracy                           0.65    100000
   macro avg       0.63      0.6

KeyboardInterrupt: 