In [39]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

In [13]:
df = pd.read_csv('Merged_Numerical.csv')
df.head()

Unnamed: 0,DAY,MONTH,YEAR,DAY OF WEEK,FROM,TO,AIRCRAFT,MODEL,AGE,FLIGHT TIME,STD,ATD,STA,ATA,DIFFERENCE
0,5,5,2023,6,56,18,2652,50,19,31,885,933,970,964,-6.0
1,5,5,2023,6,386,56,2652,50,19,121,750,803,804,864,60.0
2,3,5,2023,4,376,386,2652,50,19,104,450,504,617,668,51.0
3,3,5,2023,4,18,376,2652,50,19,22,360,393,384,415,31.0
4,30,4,2023,1,316,18,2652,50,19,288,885,885,1354,1354,0.0


In [14]:
df_x = df[df.columns[df.columns!='DIFFERENCE']]
ser_y = df['DIFFERENCE']

In [15]:
df_x.head()

Unnamed: 0,DAY,MONTH,YEAR,DAY OF WEEK,FROM,TO,AIRCRAFT,MODEL,AGE,FLIGHT TIME,STD,ATD,STA,ATA
0,5,5,2023,6,56,18,2652,50,19,31,885,933,970,964
1,5,5,2023,6,386,56,2652,50,19,121,750,803,804,864
2,3,5,2023,4,376,386,2652,50,19,104,450,504,617,668
3,3,5,2023,4,18,376,2652,50,19,22,360,393,384,415
4,30,4,2023,1,316,18,2652,50,19,288,885,885,1354,1354


In [16]:
ser_y.head()

0    -6.0
1    60.0
2    51.0
3    31.0
4     0.0
Name: DIFFERENCE, dtype: float64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df_x, ser_y, test_size=0.3, random_state=42)

In [18]:
linreg = LinearRegression().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)
elasticnet=ElasticNet().fit(X_train, y_train)
ridge = Ridge().fit(X_train, y_train)

In [19]:
print("Linear Regression:\n")
y_pred1 = linreg.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred1))
print("MSE Score: ", mean_squared_error(y_test, y_pred1))
print("\nLasso:\n")
y_pred2 = lasso.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred2))
print("MSE Score: ", mean_squared_error(y_test, y_pred2))
print("\nElastic Net:\n")
y_pred3 = elasticnet.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred3))
print("MSE Score: ", mean_squared_error(y_test, y_pred3))
print("\nRidge:\n")
y_pred4 = ridge.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred4))
print("MSE Score: ", mean_squared_error(y_test, y_pred4))

Linear Regression:

R2 Score:  0.03507978618008112
MSE Score:  32198.93039898529

Lasso:

R2 Score:  0.034913100362700344
MSE Score:  32204.492625742976

Elastic Net:

R2 Score:  0.03492602976189485
MSE Score:  32204.061177817228

Ridge:

R2 Score:  0.035079796678146424
MSE Score:  32198.930048669823


We will normalize our data to see if it has any effect on our data:



In [21]:
# Assuming your dataframe is named 'df'

# Convert 'DAY', 'MONTH', and 'YEAR' columns to datetime format
df['DATE'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']])

# Extract additional date-based features
df['SEASON'] = df['DATE'].dt.month.apply(lambda x: 'Winter' if x in [12, 1, 2] else 'Spring' if x in [3, 4, 5] else 'Summer' if x in [6, 7, 8] else 'Autumn')
df['QUARTER'] = df['DATE'].dt.quarter
df['IS_WEEKEND'] = df['DATE'].dt.dayofweek.isin([5, 6])
df['IS_HOLIDAY'] = df['DATE'].dt.date.isin(['2023-01-01', '2023-07-04', '2023-12-25'])  # Example holidays: New Year's Day, Independence Day, Christmas Day

# Drop the original date columns if no longer needed
df = df.drop(['DATE'], axis=1)

# Print the updated dataframe
print(df.head())

   DAY  MONTH  YEAR  DAY OF WEEK  FROM   TO  AIRCRAFT  MODEL  AGE  \
0    5      5  2023            6    56   18      2652     50   19   
1    5      5  2023            6   386   56      2652     50   19   
2    3      5  2023            4   376  386      2652     50   19   
3    3      5  2023            4    18  376      2652     50   19   
4   30      4  2023            1   316   18      2652     50   19   

   FLIGHT TIME  STD  ATD   STA   ATA  DIFFERENCE  SEASON  QUARTER  IS_WEEKEND  \
0           31  885  933   970   964        -6.0  Spring        2       False   
1          121  750  803   804   864        60.0  Spring        2       False   
2          104  450  504   617   668        51.0  Spring        2       False   
3           22  360  393   384   415        31.0  Spring        2       False   
4          288  885  885  1354  1354         0.0  Spring        2        True   

   IS_HOLIDAY  
0       False  
1       False  
2       False  
3       False  
4       False  


In [22]:
# Create interaction features
df['FLIGHT_TIME_AGE_RATIO'] = df['FLIGHT TIME'] / df['AGE']
df['AGE_FLIGHT_TIME_PRODUCT'] = df['AGE'] * df['FLIGHT TIME']

# Print the updated dataframe
print(df.head())

   DAY  MONTH  YEAR  DAY OF WEEK  FROM   TO  AIRCRAFT  MODEL  AGE  \
0    5      5  2023            6    56   18      2652     50   19   
1    5      5  2023            6   386   56      2652     50   19   
2    3      5  2023            4   376  386      2652     50   19   
3    3      5  2023            4    18  376      2652     50   19   
4   30      4  2023            1   316   18      2652     50   19   

   FLIGHT TIME  ...  ATD   STA   ATA  DIFFERENCE  SEASON QUARTER  IS_WEEKEND  \
0           31  ...  933   970   964        -6.0  Spring       2       False   
1          121  ...  803   804   864        60.0  Spring       2       False   
2          104  ...  504   617   668        51.0  Spring       2       False   
3           22  ...  393   384   415        31.0  Spring       2       False   
4          288  ...  885  1354  1354         0.0  Spring       2        True   

   IS_HOLIDAY  FLIGHT_TIME_AGE_RATIO  AGE_FLIGHT_TIME_PRODUCT  
0       False               1.631579    

In [23]:
#SEASON - Every SEASON will be replaced with it's appropriate number depending on its position within the week:
season_rep_map = {"Winter" : 1, "Spring" : 2, "Summer" : 3, "Autumn" : 4}
df["SEASON"].replace(season_rep_map, inplace=True)
df.head()

Unnamed: 0,DAY,MONTH,YEAR,DAY OF WEEK,FROM,TO,AIRCRAFT,MODEL,AGE,FLIGHT TIME,...,ATD,STA,ATA,DIFFERENCE,SEASON,QUARTER,IS_WEEKEND,IS_HOLIDAY,FLIGHT_TIME_AGE_RATIO,AGE_FLIGHT_TIME_PRODUCT
0,5,5,2023,6,56,18,2652,50,19,31,...,933,970,964,-6.0,2,2,False,False,1.631579,589
1,5,5,2023,6,386,56,2652,50,19,121,...,803,804,864,60.0,2,2,False,False,6.368421,2299
2,3,5,2023,4,376,386,2652,50,19,104,...,504,617,668,51.0,2,2,False,False,5.473684,1976
3,3,5,2023,4,18,376,2652,50,19,22,...,393,384,415,31.0,2,2,False,False,1.157895,418
4,30,4,2023,1,316,18,2652,50,19,288,...,885,1354,1354,0.0,2,2,True,False,15.157895,5472


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178991 entries, 0 to 178990
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DAY                      178991 non-null  int64  
 1   MONTH                    178991 non-null  int64  
 2   YEAR                     178991 non-null  int64  
 3   DAY OF WEEK              178991 non-null  int64  
 4   FROM                     178991 non-null  int64  
 5   TO                       178991 non-null  int64  
 6   AIRCRAFT                 178991 non-null  int64  
 7   MODEL                    178991 non-null  int64  
 8   AGE                      178991 non-null  int64  
 9   FLIGHT TIME              178991 non-null  int64  
 10  STD                      178991 non-null  int64  
 11  ATD                      178991 non-null  int64  
 12  STA                      178991 non-null  int64  
 13  ATA                      178991 non-null  int64  
 14  DIFF

In [26]:
#is_weekend - 
df['IS_WEEKEND'] = df['IS_WEEKEND'].astype(int)
df['IS_HOLIDAY'] = df['IS_HOLIDAY'].astype(int)
df.head()

Unnamed: 0,DAY,MONTH,YEAR,DAY OF WEEK,FROM,TO,AIRCRAFT,MODEL,AGE,FLIGHT TIME,...,ATD,STA,ATA,DIFFERENCE,SEASON,QUARTER,IS_WEEKEND,IS_HOLIDAY,FLIGHT_TIME_AGE_RATIO,AGE_FLIGHT_TIME_PRODUCT
0,5,5,2023,6,56,18,2652,50,19,31,...,933,970,964,-6.0,2,2,0,0,1.631579,589
1,5,5,2023,6,386,56,2652,50,19,121,...,803,804,864,60.0,2,2,0,0,6.368421,2299
2,3,5,2023,4,376,386,2652,50,19,104,...,504,617,668,51.0,2,2,0,0,5.473684,1976
3,3,5,2023,4,18,376,2652,50,19,22,...,393,384,415,31.0,2,2,0,0,1.157895,418
4,30,4,2023,1,316,18,2652,50,19,288,...,885,1354,1354,0.0,2,2,1,0,15.157895,5472


In [28]:
df.to_csv('Merged_Numerical_testing.csv', index = False)

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178991 entries, 0 to 178990
Data columns (total 21 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DAY                      178991 non-null  int64  
 1   MONTH                    178991 non-null  int64  
 2   YEAR                     178991 non-null  int64  
 3   DAY OF WEEK              178991 non-null  int64  
 4   FROM                     178991 non-null  int64  
 5   TO                       178991 non-null  int64  
 6   AIRCRAFT                 178991 non-null  int64  
 7   MODEL                    178991 non-null  int64  
 8   AGE                      178991 non-null  int64  
 9   FLIGHT TIME              178991 non-null  int64  
 10  STD                      178991 non-null  int64  
 11  ATD                      178991 non-null  int64  
 12  STA                      178991 non-null  int64  
 13  ATA                      178991 non-null  int64  
 14  DIFF

In [34]:
df_x = df[df.columns[df.columns != 'DIFFERENCE']]
ser_y = df['DIFFERENCE']

X_train, X_test, y_train, y_test = train_test_split(df_x, ser_y, test_size=0.2, random_state=42)

linreg = LinearRegression().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)
elasticnet = ElasticNet().fit(X_train, y_train)
ridge = Ridge().fit(X_train,y_train)

print("Linear Regression:\n")
y_pred1 = linreg.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred1))
print("MSE Score: ", mean_squared_error(y_test, y_pred1))
print("\nLasso:\n")
y_pred2 = lasso.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred2))
print("MSE Score: ", mean_squared_error(y_test, y_pred2))
print("\nElastic Net:\n")
y_pred3 = elasticnet.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred3))
print("MSE Score: ", mean_squared_error(y_test, y_pred3))
print("\nRidge:\n")
y_pred4 = ridge.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred4))
print("MSE Score: ", mean_squared_error(y_test, y_pred4))

Linear Regression:

R2 Score:  0.032714302047463506
MSE Score:  31678.196181399704

Lasso:

R2 Score:  0.03266614785328936
MSE Score:  31679.773210826723

Elastic Net:

R2 Score:  0.032633312878478926
MSE Score:  31680.84854231964

Ridge:

R2 Score:  0.0327143525896908
MSE Score:  31678.194526163206


In [31]:
testing_df = pd.read_csv('Merged_Numerical_testing.csv')

In [32]:
columns_to_delete = ['TO', 'FROM', 'IS_HOLIDAY', 'IS_WEEKEND', 'AIRCRAFT']
testing_df = testing_df.drop(columns_to_delete, axis=1)
testing_df.head()

Unnamed: 0,DAY,MONTH,YEAR,DAY OF WEEK,MODEL,AGE,FLIGHT TIME,STD,ATD,STA,ATA,DIFFERENCE,SEASON,QUARTER,FLIGHT_TIME_AGE_RATIO,AGE_FLIGHT_TIME_PRODUCT
0,5,5,2023,6,50,19,31,885,933,970,964,-6.0,2,2,1.631579,589
1,5,5,2023,6,50,19,121,750,803,804,864,60.0,2,2,6.368421,2299
2,3,5,2023,4,50,19,104,450,504,617,668,51.0,2,2,5.473684,1976
3,3,5,2023,4,50,19,22,360,393,384,415,31.0,2,2,1.157895,418
4,30,4,2023,1,50,19,288,885,885,1354,1354,0.0,2,2,15.157895,5472


In [35]:
df_x = testing_df[testing_df.columns[testing_df.columns != 'DIFFERENCE']]
ser_y = testing_df['DIFFERENCE']

X_train, X_test, y_train, y_test = train_test_split(df_x, ser_y, test_size=0.2, random_state=42)

linreg = LinearRegression().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)
elasticnet = ElasticNet().fit(X_train, y_train)
ridge = Ridge().fit(X_train,y_train)

print("Linear Regression:\n")
y_pred1 = linreg.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred1))
print("MSE Score: ", mean_squared_error(y_test, y_pred1))
print("\nLasso:\n")
y_pred2 = lasso.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred2))
print("MSE Score: ", mean_squared_error(y_test, y_pred2))
print("\nElastic Net:\n")
y_pred3 = elasticnet.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred3))
print("MSE Score: ", mean_squared_error(y_test, y_pred3))
print("\nRidge:\n")
y_pred4 = ridge.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred4))
print("MSE Score: ", mean_squared_error(y_test, y_pred4))

Linear Regression:

R2 Score:  0.03232333947796184
MSE Score:  31691.000039662544

Lasso:

R2 Score:  0.032254066574046614
MSE Score:  31693.268697872758

Elastic Net:

R2 Score:  0.032293978837226844
MSE Score:  31691.96158818868

Ridge:

R2 Score:  0.03232337925272988
MSE Score:  31690.99873705577


In [41]:
conditions = [
    (testing_df['DIFFERENCE'] < -120),
    ((testing_df['DIFFERENCE'] < -30) & (testing_df['DIFFERENCE'] > -120)),
    ((testing_df['DIFFERENCE'] < 20) & (testing_df['DIFFERENCE'] > -30)),
    ((testing_df['DIFFERENCE'] > 120) & (testing_df['DIFFERENCE'] > 20)),
    ((testing_df['DIFFERENCE'] < 300) & (testing_df['DIFFERENCE'] > 120)),
    (testing_df['DIFFERENCE'] > 300)]
    
labels = ['2 hours ahead', '20 minutes ahead', 'on time', '2 hours late', '5 hours late', 'very late']
for label, condition in zip(labels, conditions):
    testing_df[label] = np.where(condition, True, False)
    testing_df[label] = testing_df[label].astype(int)

testing_df.head()

Unnamed: 0,DAY,MONTH,YEAR,DAY OF WEEK,MODEL,AGE,FLIGHT TIME,STD,ATD,STA,...,SEASON,QUARTER,FLIGHT_TIME_AGE_RATIO,AGE_FLIGHT_TIME_PRODUCT,2 hours ahead,20 minutes ahead,on time,2 hours late,5 hours late,very late
0,5,5,2023,6,50,19,31,885,933,970,...,2,2,1.631579,589,0,0,1,0,0,0
1,5,5,2023,6,50,19,121,750,803,804,...,2,2,6.368421,2299,0,0,0,0,0,0
2,3,5,2023,4,50,19,104,450,504,617,...,2,2,5.473684,1976,0,0,0,0,0,0
3,3,5,2023,4,50,19,22,360,393,384,...,2,2,1.157895,418,0,0,0,0,0,0
4,30,4,2023,1,50,19,288,885,885,1354,...,2,2,15.157895,5472,0,0,1,0,0,0


In [42]:
df_x = testing_df[testing_df.columns[testing_df.columns != 'DIFFERENCE']]
ser_y = testing_df['DIFFERENCE']

X_train, X_test, y_train, y_test = train_test_split(df_x, ser_y, test_size=0.2, random_state=42)

linreg = LinearRegression().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)
elasticnet = ElasticNet().fit(X_train, y_train)
ridge = Ridge().fit(X_train,y_train)

print("Linear Regression:\n")
y_pred1 = linreg.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred1))
print("MSE Score: ", mean_squared_error(y_test, y_pred1))
print("\nLasso:\n")
y_pred2 = lasso.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred2))
print("MSE Score: ", mean_squared_error(y_test, y_pred2))
print("\nElastic Net:\n")
y_pred3 = elasticnet.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred3))
print("MSE Score: ", mean_squared_error(y_test, y_pred3))
print("\nRidge:\n")
y_pred4 = ridge.predict(X_test)
print("R2 Score: ", r2_score(y_test, y_pred4))
print("MSE Score: ", mean_squared_error(y_test, y_pred4))

Linear Regression:

R2 Score:  0.8936527776911287
MSE Score:  3482.826406694848

Lasso:

R2 Score:  0.8926135194342302
MSE Score:  3516.861673643237

Elastic Net:

R2 Score:  0.20880101837388798
MSE Score:  25911.43093661815

Ridge:

R2 Score:  0.8936469743773208
MSE Score:  3483.01646275967


In [43]:
testing_df.to_csv('Merged_Numerical_testing.csv', index = False)