In [5]:
# Data manipulation and visualization
import pandas as pd
import numpy as np

# Model selection and evaluation
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

# Regression models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor


# Pipeline
from sklearn.pipeline import Pipeline

# Model persistence
import joblib

In [6]:
pd.options.mode.copy_on_write = True
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [7]:
df = pd.read_csv("../sources/data/flights_cleaned.csv").sample(1000000, random_state = 42)
airports_df = pd.read_csv("../sources/data/airports.csv")
airlines_df = pd.read_csv("../sources/data/airlines.csv")

In [8]:
df.head()

Unnamed: 0,Year,Month,Day,Day_Of_Week,Airline,Flight_Number,Origin_Airport,Destination_Airport,Scheduled_Departure,Departure_Time,Departure_Delay,Taxi_Out,Wheels_Off,Scheduled_Time,Elapsed_Time,Air_Time,Distance,Wheels_On,Taxi_In,Scheduled_Arrival,Arrival_Time,Arrival_Delay,Diverted,Cancelled,Cancellation_Reason,Air_System_Delay,Security_Delay,Airline_Delay,Late_Aircraft_Delay,Weather_Delay,Date
1742449,2015,4,26,7,OO,4753,ONT,SLC,10:05,09:55,-10.0,8.0,10:03,105.0,101.0,88.0,558,12:31,5.0,12:50,12:36,-14.0,0,0,,,,,,,2015-04-26
4266575,2015,10,3,6,WN,3100,ATL,BWI,14:05,14:08,3.0,10.0,14:18,105.0,116.0,81.0,577,15:39,25.0,15:50,16:04,14.0,0,0,,,,,,,2015-10-03
4209714,2015,9,29,2,DL,1700,BOS,ATL,19:15,19:13,-2.0,27.0,19:40,176.0,172.0,136.0,946,21:56,9.0,22:11,22:05,-6.0,0,0,,,,,,,2015-09-29
110643,2015,1,8,4,MQ,2939,FWA,DFW,12:33,12:33,0.0,15.0,12:48,167.0,179.0,138.0,859,14:06,26.0,14:20,14:32,12.0,0,0,,,,,,,2015-01-08
5153289,2015,12,1,2,WN,2004,MCO,CAK,10:40,10:42,2.0,9.0,10:51,140.0,123.0,111.0,861,12:42,3.0,13:00,12:45,-15.0,0,0,,,,,,,2015-12-01


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 1742449 to 4042745
Data columns (total 31 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   Year                 1000000 non-null  int64  
 1   Month                1000000 non-null  int64  
 2   Day                  1000000 non-null  int64  
 3   Day_Of_Week          1000000 non-null  int64  
 4   Airline              1000000 non-null  object 
 5   Flight_Number        1000000 non-null  int64  
 6   Origin_Airport       1000000 non-null  object 
 7   Destination_Airport  1000000 non-null  object 
 8   Scheduled_Departure  1000000 non-null  object 
 9   Departure_Time       984945 non-null   object 
 10  Departure_Delay      984945 non-null   float64
 11  Taxi_Out             984498 non-null   float64
 12  Wheels_Off           984498 non-null   object 
 13  Scheduled_Time       1000000 non-null  float64
 14  Elapsed_Time         981891 non-null   float64
 1

In [10]:
columns_to_keep = ['Airline', 'Origin_Airport', 'Destination_Airport',
                    'Distance', 'Day', 'Month', 'Departure_Delay', 'Arrival_Delay',
                    'Scheduled_Time', 'Elapsed_Time', 'Air_Time',
                    'Taxi_In', 'Taxi_Out']

df = df[columns_to_keep]

In [11]:
df['Origin_Airport'] = df.join(airports_df.set_index('IATA_CODE'), on='Origin_Airport')['AIRPORT']
df['Destination_Airport'] = df.join(airports_df.set_index('IATA_CODE'), on='Destination_Airport')['AIRPORT']
df['Airline'] = df.join(airlines_df.set_index('IATA_CODE'), on='Airline')['AIRLINE']

In [12]:
df.isna().sum() / len(df) * 100

Airline                0.0000
Origin_Airport         0.0000
Destination_Airport    0.0000
Distance               0.0000
Day                    0.0000
Month                  0.0000
Departure_Delay        1.5055
Arrival_Delay          1.8109
Scheduled_Time         0.0000
Elapsed_Time           1.8109
Air_Time               1.8109
Taxi_In                1.6001
Taxi_Out               1.5502
dtype: float64

In [13]:
df.dropna(inplace = True)
df.reset_index(drop=True, inplace=True)

In [14]:
column_transformer = ColumnTransformer(
    [
    ("ohe", OneHotEncoder(drop='first', sparse_output = False), ["Airline", "Origin_Airport", "Destination_Airport"]),
    ],
    remainder = "passthrough"
)

In [15]:
Airline_list=df['Airline'].unique().tolist()
Origin_airport_list=df['Origin_Airport'].unique().tolist()
Destination_airport_list=df['Destination_Airport'].unique().tolist()
joblib.dump(Airline_list,"../models/Airline_list.h5")
joblib.dump(Origin_airport_list,"../models/Origin_Airport_list.h5")
joblib.dump(Destination_airport_list,"../models/Destination_Airport_list.h5")
joblib.dump(df.drop('Arrival_Delay', axis = 1).columns,"../models/input.h5")

['../models/input.h5']

In [16]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "KNN": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state = 42),
}

In [17]:
X = df.drop("Arrival_Delay", axis = 1)
y = df["Arrival_Delay"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

scoring = {
    'r2': 'r2',
    'neg_root_mean_squared_error': make_scorer(rmse, greater_is_better=False)
}

In [15]:
for model_name, model in models.items():
    pipeline = Pipeline([
        ("column_transformer", column_transformer),
        ('PCA', PCA(n_components = 5)),
        ('scaler', MinMaxScaler()),
        ("model", model),
    ])
    scores = cross_validate(pipeline, X_train, y_train, cv=5, scoring=scoring, return_train_score=True)
    print(f"Model: {model_name}")
    print(f"Mean Train R² Score: {scores['train_r2'].mean()}")
    print(f"Mean Train RMSE: {-scores['train_neg_root_mean_squared_error'].mean()}")
    print(f"Mean R² Score: {scores['test_r2'].mean()}")
    print(f"Mean RMSE: {-scores['test_neg_root_mean_squared_error'].mean()}")

Model: Linear Regression
Mean Train R² Score: 0.9482906440553369
Mean Train RMSE: 5.724598808462051
Mean R² Score: 0.9482846170507486
Mean RMSE: 5.724661508972055
Model: Ridge
Mean Train R² Score: 0.9482906191912228
Mean Train RMSE: 5.7246001847910675
Mean R² Score: 0.9482845899273531
Mean RMSE: 5.724662974138427
Model: KNN
Mean Train R² Score: 0.9666752535431155
Mean Train RMSE: 4.595603404570494
Mean R² Score: 0.9494648225306953
Mean RMSE: 5.659036356902055
Model: Decision Tree
Mean Train R² Score: 1.0
Mean Train RMSE: -0.0
Mean R² Score: 0.9276008208457283
Mean RMSE: 6.773480260157133


In [32]:
param_grid = {
    'model__fit_intercept': [True, False],
    'model__positive': [True, False],
    'model__copy_X': [True, False],
}

In [34]:
pipeline = Pipeline([
    ("column_transformer", column_transformer),
    ('PCA', PCA(n_components = 5)),
    ('scaler', MinMaxScaler()),
    ("model", LinearRegression()),
])
grid_search = GridSearchCV(
    estimator=pipeline,            
    cv=5, 
    scoring=scoring,
    param_grid=param_grid,
    return_train_score=True,
    refit='neg_root_mean_squared_error', 
)
grid_search.fit(X_train, y_train)
# Access and print the results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Train Scores (RMSE): {grid_search.cv_results_['mean_train_neg_root_mean_squared_error']}")
print(f"Best Score (Negative RMSE): {grid_search.best_score_}")
# Access R² score for the best parameter set
best_index = grid_search.best_index_
best_r2 = grid_search.cv_results_['mean_test_r2'][best_index]
print(f"Best R² Score: {best_r2}")
print(f"Best Estimator: {grid_search.best_estimator_}")

Best Parameters: {'model__copy_X': True, 'model__fit_intercept': True, 'model__positive': False}
Train Scores (RMSE): [ -5.84969387  -5.72459881 -22.99845343 -14.45734829  -5.84969387
  -5.72459881 -22.99845343 -14.45734829]
Best Score (Negative RMSE): -5.724661508972034
Best R² Score: 0.9482846170507491
Best Estimator: Pipeline(steps=[('column_transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe',
                                                  OneHotEncoder(drop='first',
                                                                sparse_output=False),
                                                  ['Airline', 'Origin_Airport',
                                                   'Destination_Airport'])])),
                ('PCA', PCA(n_components=5)), ('scaler', MinMaxScaler()),
                ('model', LinearRegression())])


In [35]:
pipeline = Pipeline([
    ("column_transformer", column_transformer),
    ('PCA', PCA(n_components = 5)),
    ('scaler', MinMaxScaler()),
    ("model", LinearRegression(copy_X=True, fit_intercept=True, positive=False)),
])

In [36]:
pipeline.fit(X, y)

In [37]:
joblib.dump(pipeline,"../models/pipeline.h5")

['../models/pipeline.h5']