In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw flight records from the CSV file (relative path).
flights_path = '../data/flights.csv'
df = pd.read_csv(flights_path)

# Quick sanity check: (rows, columns), followed by a preview of the first records.
print(df.shape)
df.head()

(71175, 11)


Unnamed: 0,flight_id,airline,aircraft_type,schengen,origin,arrival_time,departure_time,day,year,is_holiday,delay
0,1,MF,Boeing 737,non-schengen,KEC,17.671481,20.671481,0,2010,False,-3.989698
1,2,EW,Airbus A330,schengen,KXZ,10.588196,13.588196,0,2010,False,3.826984
2,3,MF,Airbus A330,schengen,UVR,8.968215,11.968215,0,2010,False,-20.60288
3,4,KW,Embraer E175,schengen,YOM,6.173571,10.173571,0,2010,False,-0.682961
4,5,KW,Boeing 787,schengen,PZK,14.942155,17.942155,0,2010,False,29.889728


In [3]:
# Nominal features (no natural order): one indicator column per category.
categorical_vars = ['airline', 'aircraft_type', 'origin']
df_encoded = pd.get_dummies(df, columns=categorical_vars)

# Binary features: encode as 0/1, using one explicit mapping per column so a
# value belonging to one column can never be accepted for the other.
holiday_mapping = {True: 1, False: 0}
schengen_mapping = {'schengen': 1, 'non-schengen': 0}
# Combined mapping kept under its original name for any later cell that uses it.
label_mapping = {**holiday_mapping, **schengen_mapping}

df_encoded['is_holiday'] = df['is_holiday'].map(holiday_mapping)
df_encoded['schengen'] = df['schengen'].map(schengen_mapping)

# Series.map() turns every value that is missing from the mapping into NaN
# without raising. Fail here, next to the cause, instead of much later inside
# the model fit.
binary_cols = ['is_holiday', 'schengen']
unmapped_counts = df_encoded[binary_cols].isna().sum()
assert not unmapped_counts.any(), (
    f"Unmapped or missing values in binary columns:\n{unmapped_counts[unmapped_counts > 0]}"
)

# Show the shape and a preview of the encoded frame.
print(df_encoded.shape)
df_encoded.head()

(71175, 23)


Unnamed: 0,flight_id,schengen,arrival_time,departure_time,day,year,is_holiday,delay,airline_EW,airline_KW,...,aircraft_type_Boeing 737,aircraft_type_Boeing 787,aircraft_type_Embraer E175,origin_IPP,origin_JUV,origin_KEC,origin_KXZ,origin_PZK,origin_UVR,origin_YOM
0,1,0,17.671481,20.671481,0,2010,0,-3.989698,False,False,...,True,False,False,False,False,True,False,False,False,False
1,2,1,10.588196,13.588196,0,2010,0,3.826984,True,False,...,False,False,False,False,False,False,True,False,False,False
2,3,1,8.968215,11.968215,0,2010,0,-20.60288,False,False,...,False,False,False,False,False,False,False,False,True,False
3,4,1,6.173571,10.173571,0,2010,0,-0.682961,False,True,...,False,False,True,False,False,False,False,False,False,True
4,5,1,14.942155,17.942155,0,2010,0,29.889728,False,True,...,False,True,False,False,False,False,False,True,False,False


In [4]:
# Columns left out of the modelling frame (identifier and time-related fields).
dropped_columns = ['flight_id', 'arrival_time', 'departure_time', 'day', 'year']
df_cleaned = df_encoded.drop(columns=dropped_columns)

# The target is the 'delay' column; every remaining column is a feature.
y = df_cleaned['delay']
X = df_cleaned.drop(columns='delay')

# Report the dimensions of both parts.
print("X shape:", X.shape)
print("y shape:", y.shape)

# Preview the feature matrix.
X.head()

X shape: (71175, 17)
y shape: (71175,)


Unnamed: 0,schengen,is_holiday,airline_EW,airline_KW,airline_MF,aircraft_type_Airbus A320,aircraft_type_Airbus A330,aircraft_type_Boeing 737,aircraft_type_Boeing 787,aircraft_type_Embraer E175,origin_IPP,origin_JUV,origin_KEC,origin_KXZ,origin_PZK,origin_UVR,origin_YOM
0,0,0,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False
1,1,0,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False
2,1,0,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False
3,1,0,False,True,False,False,False,False,False,True,False,False,False,False,False,False,True
4,1,0,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False


In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Define the Gaussian Process regression model.
#
# Kernel = RBF (signal) + WhiteKernel (observation noise, learned during fit).
# The feature matrix holds only 0/1 indicator columns, so many rows are
# identical inputs with different delays. An RBF-only kernel with the default
# alpha=1e-10 treats the targets as (almost) noise-free, which makes the kernel
# matrix near-singular on duplicated inputs and cannot represent that spread.
kernel = RBF() + WhiteKernel()

# normalize_y=True standardises the targets for fitting, so the GP's zero-mean
# prior is applied to centred delays rather than to the raw values.
gpr = GaussianProcessRegressor(kernel=kernel, normalize_y=True)

In [7]:
# Train the model on a bounded random subsample of the training set.
#
# Exact GP fitting factorises a dense n x n kernel matrix: O(n^2) memory and
# O(n^3) time. With the full training split (~57k rows) that matrix alone is
# roughly 26 GB in float64, so fitting on every row is not practical.
GP_MAX_TRAIN_SAMPLES = 2000  # raise if memory and time allow

n_fit = min(len(X_train), GP_MAX_TRAIN_SAMPLES)
# Seeded, positional sampling: reproducible and independent of the index labels.
fit_positions = np.random.default_rng(42).choice(len(X_train), size=n_fit, replace=False)
X_fit = X_train.iloc[fit_positions]
y_fit = y_train.iloc[fit_positions]  # same rows as X_fit

gpr.fit(X_fit, y_fit)

In [None]:
# Predict the delay for every held-out row using the fitted model.
y_pred = gpr.predict(X=X_test)

In [None]:
# Score the held-out predictions with three standard regression metrics.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Print one line per metric.
for label, value in (
    ("Mean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R-squared Score:", r2),
):
    print(label, value)

In [None]:
# Prediction error per held-out flight (positive = predicted delay too large).
delay_diff = y_pred - y_test

# Distribution of the prediction errors as a 30-bin histogram.
fig, ax = plt.subplots()
ax.hist(delay_diff, bins=30)
ax.set_xlabel("Delay Difference (Predicted - Real)")
ax.set_ylabel("Frequency")
ax.set_title("Histogram of Delay Difference")
plt.show()