In [1]:
import pandas as pd
import numpy as np
import os
import sys
import sqlalchemy as sqla
import csv
import matplotlib.pyplot as plt
import plotly.express as px
from shapely import wkb

## DATABASE CONNECTION

In [2]:
DB_USER = "postgres"
DB_PASSWORD = "a"
DB_HOST = "localhost"
DB_PORT = "5432"
DB_NAME = "postgres"

connection_string = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

# Créer un moteur SQLAlchemy
engine = sqla.create_engine(connection_string)

# Tester la connexion
try:
    with engine.connect() as connection:
        print("Connected to PostgreSQL successfully!")
except Exception as e:
    print(f"Error: {e}")

connection = engine.connect()

Connected to PostgreSQL successfully!


## IMPORT DATA

In [9]:
connection.rollback()

In [102]:
query = """
SELECT 
w.date,
EXTRACT(HOUR FROM w.hour) AS hour_of_day,
AVG(w.temperature) AS avg_temperature,
AVG(w.dewpoint) AS avg_dewpoint,
AVG(w.relative_humidity) AS avg_relative_humidity,
AVG(w.precipitation) AS avg_precipitation,
AVG(w.snowfall) AS avg_snowfall,
AVG(w.wind_direction) AS avg_wind_direction,
AVG(w.wind_speed) AS avg_wind_speed,
AVG(w.pressure) AS avg_pressure
FROM WEATHER w
GROUP BY w.date, hour_of_day
ORDER BY w.date, hour_of_day;
"""

weather = pd.read_sql(sqla.text(query), connection)

print(weather.head())
print(weather.shape)


         date  hour_of_day  avg_temperature  avg_dewpoint  \
0  2014-01-01          0.0         5.400000      3.845455   
1  2014-01-01          1.0         5.940000      4.150000   
2  2014-01-01          2.0         5.609091      3.681818   
3  2014-01-01          3.0         5.072727      3.145455   
4  2014-01-01          4.0         5.100000      3.118182   

   avg_relative_humidity  avg_precipitation  avg_snowfall  avg_wind_direction  \
0              90.818182           0.045455           0.0          208.181818   
1              89.300000           0.080000           0.0          210.000000   
2              88.454545           0.009091           0.0          212.727273   
3              88.545455           0.000000           0.0          206.363636   
4              88.000000           0.009091           0.0          200.000000   

   avg_wind_speed  avg_pressure  
0       18.327273   1009.318187  
1       18.360000   1009.379999  
2       18.000000   1009.945451  
3       17

In [103]:
query = """
    SELECT 
        t.departure_date AS date,
        EXTRACT(HOUR FROM t.real_time_departure) AS hour_of_day,
        AVG(t.delay_arrival) AS avg_delay_arrival,
        AVG(t.delay_departure) AS avg_delay_departure
    FROM TRAIN_DATA t
    GROUP BY t.departure_date, hour_of_day
    ORDER BY t.departure_date, hour_of_day;
"""

train_data = pd.read_sql(sqla.text(query), connection)

print(train_data.head())
print(train_data.shape)

         date  hour_of_day  avg_delay_arrival  avg_delay_departure
0  2014-01-01          0.0         294.669767           342.465116
1  2014-01-01          1.0          22.523810            47.571429
2  2014-01-01          4.0          15.243902            29.804878
3  2014-01-01          5.0          21.412399            33.870620
4  2014-01-01          6.0          28.685057            41.846743
(94434, 4)


In [3]:
query = """
    SELECT * FROM TYPE_DAY;
"""

type_day = pd.read_sql(sqla.text(query), connection)

print(type_day.head())
print(type_day.shape)

         date  holiday  weekend  day_after_rest
0  2014-01-01        2    False           False
1  2014-01-02        1    False           False
2  2014-01-03        1    False           False
3  2014-01-04        1     True           False
4  2014-01-05        1     True           False
(4018, 4)


In [105]:
merged_df = train_data.merge(weather, on=["date", "hour_of_day"], how="inner")

merged_df = type_day.merge(merged_df, on=["date"], how="inner")

merged_df['date'] = pd.to_datetime(merged_df['date'])

merged_df['date'] = merged_df['date'].dt.dayofyear

merged_df['day_sin'] = np.sin(2 * np.pi * merged_df['date'] / 365)
merged_df['day_cos'] = np.cos(2 * np.pi * merged_df['date'] / 365)

# Drop the original 'date' column
merged_df.drop(columns=['date'], inplace=True)

print(merged_df.shape)


(90390, 16)


In [106]:
print (merged_df.head())

   holiday  weekend  day_after_rest  hour_of_day  avg_delay_arrival  \
0        2    False           False          0.0         294.669767   
1        2    False           False          1.0          22.523810   
2        2    False           False          4.0          15.243902   
3        2    False           False          5.0          21.412399   
4        2    False           False          6.0          28.685057   

   avg_delay_departure  avg_temperature  avg_dewpoint  avg_relative_humidity  \
0           342.465116         5.400000      3.845455              90.818182   
1            47.571429         5.940000      4.150000              89.300000   
2            29.804878         5.100000      3.118182              88.000000   
3            33.870620         4.945455      2.981818              88.181818   
4            41.846743         4.909091      2.918182              88.000000   

   avg_precipitation  avg_snowfall  avg_wind_direction  avg_wind_speed  \
0           0.0454

In [43]:

x = merged_df[["avg_temperature", "avg_dewpoint", "avg_relative_humidity", "avg_precipitation", "avg_snowfall", "avg_wind_direction", "avg_wind_speed", "avg_pressure"]]
y = merged_df["avg_delay_departure"]

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))

Mean Squared Error:  531337.9124018488
R2 Score:  -0.13529179128208701
Mean Absolute Error:  163.91384669308547


In [25]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))


Mean Squared Error:  507494.5035701079
R2 Score:  -0.08434638405433037
Mean Absolute Error:  157.56706061770157


In [112]:
x = merged_df[["day_cos", "day_sin","holiday", "weekend", "day_after_rest","hour_of_day","avg_temperature", "avg_dewpoint", "avg_relative_humidity", "avg_precipitation", "avg_snowfall", "avg_wind_direction", "avg_wind_speed", "avg_pressure"]]
y = merged_df["avg_delay_departure"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

knn = KNeighborsRegressor(n_neighbors=250)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))


Mean Squared Error:  481390.109166755
R2 Score:  0.013408413587793877
Mean Absolute Error:  138.94037823182717


In [113]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))


Mean Squared Error:  428439.7794977397
R2 Score:  0.12192819568225077
Mean Absolute Error:  107.03895679989093


## PER DAY BY YEAR

In [4]:
query = """
    SELECT 
        t.departure_date AS date,
        AVG(t.delay_arrival) AS avg_delay_arrival,
        AVG(t.delay_departure) AS avg_delay_departure
    FROM TRAIN_DATA t
    GROUP BY t.departure_date
    ORDER BY t.departure_date;
"""

train_data = pd.read_sql(sqla.text(query), connection)

print(train_data.head())
print(train_data.shape)

         date  avg_delay_arrival  avg_delay_departure
0  2014-01-01          38.000104            47.338924
1  2014-01-02          61.479792            69.302843
2  2014-01-03         131.355574           137.877869
3  2014-01-04          73.400835            81.274319
4  2014-01-05          63.745558            71.634720
(4018, 3)


In [5]:
query = """
SELECT 
w.date,
AVG(w.temperature) AS avg_temperature,
AVG(w.dewpoint) AS avg_dewpoint,
AVG(w.relative_humidity) AS avg_relative_humidity,
AVG(w.precipitation) AS avg_precipitation,
AVG(w.snowfall) AS avg_snowfall,
AVG(w.wind_direction) AS avg_wind_direction,
AVG(w.wind_speed) AS avg_wind_speed,
AVG(w.pressure) AS avg_pressure
FROM WEATHER w
GROUP BY w.date
ORDER BY w.date;
"""

weather = pd.read_sql(sqla.text(query), connection)

print(weather.head())
print(weather.shape)

         date  avg_temperature  avg_dewpoint  avg_relative_humidity  \
0  2014-01-01         6.834615      4.222308              84.692308   
1  2014-01-02         8.257252      5.640076              84.694656   
2  2014-01-03         8.838931      5.646947              81.770992   
3  2014-01-04         7.535938      4.859375              84.109375   
4  2014-01-05         4.975379      2.743182              86.128788   

   avg_precipitation  avg_snowfall  avg_wind_direction  avg_wind_speed  \
0           0.005769           0.0          187.846154       24.132693   
1           0.029008           0.0          202.900763       22.245802   
2           0.017557           0.0          204.809160       26.137023   
3           0.000000           0.0          179.062500       19.125000   
4           0.014015           0.0          206.742424       17.959091   

   avg_pressure  
0   1005.357310  
1   1000.038171  
2   1003.428245  
3   1001.156251  
4   1006.122729  
(4019, 9)


In [None]:
merged_df = train_data.merge(weather, on=["date"], how="inner")

merged_df = type_day.merge(merged_df, on=["date"], how="inner")

merged_df['date'] = pd.to_datetime(merged_df['date'])

#merged_df = merged_df[merged_df['date'] < '2020-01-01']

merged_df['date'] = merged_df['date'].dt.dayofyear

merged_df['day_sin'] = np.sin(2 * np.pi * merged_df['date'] / 365)
merged_df['day_cos'] = np.cos(2 * np.pi * merged_df['date'] / 365)

# Drop the original 'date' column
merged_df.drop(columns=['date'], inplace=True)

print(merged_df.shape)
print(merged_df.head())


(4018, 15)
   holiday  weekend  day_after_rest  avg_delay_arrival  avg_delay_departure  \
0        2    False           False          38.000104            47.338924   
1        1    False           False          61.479792            69.302843   
2        1    False           False         131.355574           137.877869   
3        1     True           False          73.400835            81.274319   
4        1     True           False          63.745558            71.634720   

   avg_temperature  avg_dewpoint  avg_relative_humidity  avg_precipitation  \
0         6.834615      4.222308              84.692308           0.005769   
1         8.257252      5.640076              84.694656           0.029008   
2         8.838931      5.646947              81.770992           0.017557   
3         7.535938      4.859375              84.109375           0.000000   
4         4.975379      2.743182              86.128788           0.014015   

   avg_snowfall  avg_wind_direction  avg_wind

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error

x = merged_df[["day_sin","day_cos","holiday","weekend", "day_after_rest", "avg_temperature", "avg_dewpoint", "avg_relative_humidity", "avg_precipitation", "avg_snowfall", "avg_wind_direction", "avg_wind_speed", "avg_pressure"]]
y = merged_df["avg_delay_departure"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

knn = KNeighborsRegressor(n_neighbors=100)
knn.fit(x_train, y_train)

y_pred = knn.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))



Mean Squared Error:  3241.8533173268224
R2 Score:  -0.04886225902176444
Mean Absolute Error:  40.84741081486904


In [98]:

rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))


Mean Squared Error:  2697.6539911035447
R2 Score:  0.1463954654610069
Mean Absolute Error:  33.971029463448154


In [24]:
from sklearn.kernel_ridge import KernelRidge

kr = KernelRidge(alpha=1.0)
kr.fit(x_train, y_train)

y_pred = kr.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))


Mean Squared Error:  2440.1007524901443
R2 Score:  0.21053504369904663
Mean Absolute Error:  35.32359798678172


In [100]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))

Mean Squared Error:  2703.0542932978537
R2 Score:  0.14468667609953012
Mean Absolute Error:  36.4728087372331


In [None]:
from sklearn.ensemble import BaggingRegressor

br = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=100, random_state=0)
br.fit(x_train, y_train)

y_pred = br.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))


Mean Squared Error:  2401.331954487562
R2 Score:  0.24015909673966462
Mean Absolute Error:  33.35014114770965


In [None]:
#Doinh it with date before 2020 01-01
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor

x = merged_df[["day_sin","day_cos","holiday","weekend", "day_after_rest", "avg_temperature", "avg_dewpoint", "avg_relative_humidity", "avg_precipitation", "avg_snowfall", "avg_wind_direction", "avg_wind_speed", "avg_pressure"]]
y = merged_df["avg_delay_departure"]



x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

br = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=100, random_state=0)
br.fit(x_train, y_train)

y_pred = br.predict(x_test)

print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R2 Score: ", r2_score(y_test, y_pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))




Mean Squared Error:  2191.5137173883195
R2 Score:  0.29096235908894197
Mean Absolute Error:  32.651019344569974



## Testing with classifier

In [35]:


x = merged_df[["day_sin","day_cos","holiday","weekend", "day_after_rest", "avg_temperature", "avg_dewpoint", "avg_relative_humidity", "avg_precipitation", "avg_snowfall", "avg_wind_direction", "avg_wind_speed", "avg_pressure"]]
y = merged_df["avg_delay_departure"]

median_y = y.median()
print(median_y)

new_y = y.apply(lambda x: 1 if x > median_y else 0)
print(new_y.shape)


x_train, x_test, y_train, y_test = train_test_split(x, new_y, test_size=0.3, random_state=42)

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

from sklearn.metrics import accuracy_score,confusion_matrix
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))



126.34536788896814
(4018,)
Accuracy:  0.736318407960199
Confusion Matrix: 
 [[449 152]
 [166 439]]


In [36]:
query = """
SELECT 
w.date,
EXTRACT(HOUR FROM w.hour) AS hour_of_day,
AVG(w.temperature) AS avg_temperature,
AVG(w.dewpoint) AS avg_dewpoint,
AVG(w.relative_humidity) AS avg_relative_humidity,
AVG(w.precipitation) AS avg_precipitation,
AVG(w.snowfall) AS avg_snowfall,
AVG(w.wind_direction) AS avg_wind_direction,
AVG(w.wind_speed) AS avg_wind_speed,
AVG(w.pressure) AS avg_pressure
FROM WEATHER w
GROUP BY w.date, hour_of_day
ORDER BY w.date, hour_of_day;
"""

weather = pd.read_sql(sqla.text(query), connection)

print(weather.head())
print(weather.shape)

query = """
    SELECT 
        t.departure_date AS date,
        EXTRACT(HOUR FROM t.real_time_departure) AS hour_of_day,
        AVG(t.delay_arrival) AS avg_delay_arrival,
        AVG(t.delay_departure) AS avg_delay_departure
    FROM TRAIN_DATA t
    GROUP BY t.departure_date, hour_of_day
    ORDER BY t.departure_date, hour_of_day;
"""

train_data = pd.read_sql(sqla.text(query), connection)

print(train_data.head())
print(train_data.shape)

merged_df = train_data.merge(weather, on=["date", "hour_of_day"], how="inner")

merged_df = type_day.merge(merged_df, on=["date"], how="inner")

merged_df['date'] = pd.to_datetime(merged_df['date'])

merged_df['date'] = merged_df['date'].dt.dayofyear

merged_df['day_sin'] = np.sin(2 * np.pi * merged_df['date'] / 365)
merged_df['day_cos'] = np.cos(2 * np.pi * merged_df['date'] / 365)

# Drop the original 'date' column
merged_df.drop(columns=['date'], inplace=True)

print(merged_df.shape)



         date  hour_of_day  avg_temperature  avg_dewpoint  \
0  2014-01-01          0.0         5.400000      3.845455   
1  2014-01-01          1.0         5.940000      4.150000   
2  2014-01-01          2.0         5.609091      3.681818   
3  2014-01-01          3.0         5.072727      3.145455   
4  2014-01-01          4.0         5.100000      3.118182   

   avg_relative_humidity  avg_precipitation  avg_snowfall  avg_wind_direction  \
0              90.818182           0.045455           0.0          208.181818   
1              89.300000           0.080000           0.0          210.000000   
2              88.454545           0.009091           0.0          212.727273   
3              88.545455           0.000000           0.0          206.363636   
4              88.000000           0.009091           0.0          200.000000   

   avg_wind_speed  avg_pressure  
0       18.327273   1009.318187  
1       18.360000   1009.379999  
2       18.000000   1009.945451  
3       17

In [37]:
x = merged_df[["day_sin","day_cos","holiday","weekend", "day_after_rest", "avg_temperature", "avg_dewpoint", "avg_relative_humidity", "avg_precipitation", "avg_snowfall", "avg_wind_direction", "avg_wind_speed", "avg_pressure"]]
y = merged_df["avg_delay_departure"]

median_y = y.median()
print(median_y)

new_y = y.apply(lambda x: 1 if x > median_y else 0)

x_train, x_test, y_train, y_test = train_test_split(x, new_y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

116.14999774464917
Accuracy:  0.7485710071173065
Confusion Matrix: 
 [[10296  3230]
 [ 3588 10003]]
