In [1]:
import sys
print(sys.executable)
# Sanity check: the printed interpreter path should belong to the AI_project environment

C:\Users\Johanna\anaconda3\envs\AI_project\python.exe


In [2]:
#import libraries
from xgboost import XGBRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from datetime import time
from functools import reduce


In [3]:
# Load the preprocessed dataframes from parquet files (paths are relative to the working directory)
df_lagged_all = pd.read_parquet("data/processed/df_lagged_all_v3.parquet")
df_2_full = pd.read_parquet("data/processed/df_2_full_v3.parquet")  # v3 covers three portals


XGBoost training flow with another portal (further away, not the direct neighbour)

In [4]:
# Number of lag steps (lag_1 .. lag_nlags) used per sensor when building the feature column names
nlags=15

In [8]:
# Target sensor and a more distant portal (not the direct neighbour) whose sensors supply the features
target_sensor = 1076

further_portal = "57055"
is_further_portal = df_2_full['PORTAL_clean'] == further_portal
further_sensors = df_2_full.loc[is_further_portal, 'DP_ID'].unique()

# Column names of the lag features, built sensor by sensor with lags 1..nlags for each sensor
lag_steps = range(1, nlags + 1)
flow_features_further = []
speed_features_further = []
for sensor in further_sensors:
    flow_features_further.extend(f'SENSOR_{sensor}_FLOW_lag_{step}' for step in lag_steps)
    speed_features_further.extend(f'SENSOR_{sensor}_SPEED_lag_{step}' for step in lag_steps)
print(further_sensors)


[ 353 1443  749]


In [25]:
# Random 80/20 train-test split with a fixed seed
# NOTE(review): rows are shuffled before splitting, so temporally adjacent rows can land on both
# sides of the split. If neighbouring rows share most of their lag/target window (looks likely for
# lagged time-series features), the test scores may be optimistic — confirm whether a
# chronological split is intended.
df_train, df_test = train_test_split(df_lagged_all, test_size=0.2, random_state=42)


In [26]:
# Train one flow model and one speed model on the lag features of the further-away portal,
# save both to disk and report RMSE / MAE / R² on the held-out test set.

# Hyperparameters shared by both models: best parameters of the grid search for the neighbour portal
xgb_params = dict(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    random_state=42,
)

#model flow
model_flow = XGBRegressor(**xgb_params)
model_flow.fit(df_train[flow_features_further], df_train['FLOW_future_sum'])
model_flow.save_model("model/xgb_flow_furtherneighbour.json")

#model speed
model_speed = XGBRegressor(**xgb_params)
model_speed.fit(df_train[speed_features_further], df_train['SPEED_future_mean'])
model_speed.save_model("model/xgb_speed_furtherneighbour.json")


#predict flow values
y_pred = model_flow.predict(df_test[flow_features_further])
y_test = df_test['FLOW_future_sum']
# RMSE as the square root of the MSE: the `squared=False` keyword of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Regression (FLOW) -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

#predict speed values
y_pred = model_speed.predict(df_test[speed_features_further])
y_test = df_test['SPEED_future_mean']
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Regression (SPEED) -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

XGBoost Regression (FLOW) -> RMSE: 25.015, MAE: 17.155, R²: 0.909
XGBoost Regression (SPEED) -> RMSE: 1.093, MAE: 0.575, R²: 0.530




XGBoost training with NaN values

In [10]:
# Load the dataframe in which NaN values have not yet been filled
df_2_wnan=pd.read_parquet("data/processed/df_2_wnan.parquet")

In [11]:
# Target sensor and the two portals that supply predictor sensors ("same" and "neighbour")
target_sensor = 1076
same_portal = "55620"
neighbour_portal = "56160"

portal_of_row = df_2_wnan['PORTAL_clean']

# Sensors of the "same" portal, with the target sensor itself excluded
same_portal_sensors = df_2_wnan.loc[portal_of_row == same_portal, 'DP_ID'].unique()
same_sensors = [sensor_id for sensor_id in same_portal_sensors if sensor_id != target_sensor]

# Sensors of the neighbour portal, and of both portals combined (again without the target sensor)
neighbour_sensors = df_2_wnan.loc[portal_of_row == neighbour_portal, 'DP_ID'].unique()
bothportals_sensors = df_2_wnan.loc[
    portal_of_row.isin([neighbour_portal, same_portal]), 'DP_ID'
].unique()
except_target_sensors = [sensor_id for sensor_id in bothportals_sensors if sensor_id != target_sensor]




In [15]:
# Number of lag steps per sensor; re-declared here with the same value as earlier in the notebook.
# build_lag_features uses this value as the default of its `lags` parameter.
nlags=15

In [16]:
# Group the feature column names by quantity (flow vs speed) and by portal (same vs neighbour)
def _lag_feature_names(sensors, quantity):
    """Return the names 'SENSOR_<id>_<quantity>_lag_<k>' for every sensor and k = 1..nlags.

    Names are ordered sensor by sensor, with the lags of one sensor in ascending order.
    """
    names = []
    for sensor in sensors:
        for lag in range(1, nlags + 1):
            names.append(f'SENSOR_{sensor}_{quantity}_lag_{lag}')
    return names


flow_features_same = _lag_feature_names(same_sensors, 'FLOW')
flow_features_neighbour = _lag_feature_names(neighbour_sensors, 'FLOW')
speed_features_same = _lag_feature_names(same_sensors, 'SPEED')
speed_features_neighbour = _lag_feature_names(neighbour_sensors, 'SPEED')

In [17]:
#function for building lag features
def build_lag_features(df, target_col, lags=nlags):
    """Return a copy of ``df`` sorted by 'Datetime' with lagged copies of ``target_col`` appended.

    For every k in 1..lags a column ``f'{target_col}_lag_{k}'`` is added that holds the value
    of ``target_col`` k rows earlier within the same 'DP_ID' group (row-based shift, so the
    first k rows of each group are NaN). The input frame itself is not modified.

    Parameters
    ----------
    df : DataFrame with the columns 'Datetime', 'DP_ID' and ``target_col``.
    target_col : name of the column to lag.
    lags : number of lag columns to create. The default is the value ``nlags`` had when this
        function was defined; later changes to ``nlags`` do not affect it.
    """
    ordered = df.sort_values('Datetime')
    per_sensor = ordered.groupby('DP_ID')[target_col]
    lag_columns = {
        f'{target_col}_lag_{step}': per_sensor.shift(step)
        for step in range(1, lags + 1)
    }
    return ordered.assign(**lag_columns)


In [18]:
# Build the modelling table for the un-imputed data: one row per timestamp, lag features of every
# non-target sensor as columns, plus the current and future values of the target sensor.

# Build lag features (analogous to the lag features for the filled-up dataframe in the other notebook)
df_lagged_wnan = build_lag_features(df_2_wnan, 'FLOW', lags=nlags)
df_lagged_wnan = build_lag_features(df_lagged_wnan, "SPEED_MS_AVG", lags=nlags)
# Keep only rows from 04:15 onwards, as the lag features are not fully available before that time
df_lagged_wnan = df_lagged_wnan[df_lagged_wnan["Time_only"] >= time(4, 15)]

# Reorder the lagged features into columns, one set of columns per sensor
sensor_frames = []
for sensor in except_target_sensors:
    # old -> new names: the sensor id is added to the flow and to the speed lag columns
    flow_renames = {f'FLOW_lag_{i}': f'SENSOR_{sensor}_FLOW_lag_{i}' for i in range(1, nlags + 1)}
    speed_renames = {f'SPEED_MS_AVG_lag_{i}': f'SENSOR_{sensor}_SPEED_lag_{i}' for i in range(1, nlags + 1)}
    # keep the timestamp plus this sensor's lag columns (flow lags first, then speed lags)
    df_sensor = (
        df_lagged_wnan
        .loc[df_lagged_wnan['DP_ID'] == sensor, ['Datetime', *flow_renames, *speed_renames]]
        .rename(columns={**flow_renames, **speed_renames})
    )
    sensor_frames.append(df_sensor)

if not sensor_frames:
    raise ValueError("except_target_sensors is empty: no sensor frames to merge")

# Inner join on the timestamp: only timestamps present for every sensor survive
df_lagged_all_wnan = reduce(lambda l, r: pd.merge(l, r, on='Datetime', how='inner'), sensor_frames)

# Attach the current flow and speed of the target sensor
is_target = df_2_wnan['DP_ID'] == target_sensor
df_lagged_all_wnan = df_lagged_all_wnan.merge(
    df_2_wnan.loc[is_target, ["Datetime", "FLOW", "SPEED_MS_AVG"]], on=['Datetime'], how='left'
)

# Targets: aggregate over the current row and the following 14 rows of the same sensor.
# The shift is applied to the concatenated per-sensor result; it cannot pull values across
# sensors because the first (future_window - 1) rolling values of every sensor are NaN.
# NOTE(review): assumes the rows of df_2_wnan are in chronological order within each sensor
# and that its index is unique — TODO confirm.
future_window = 15
#create target value (summed flow)
df_2_wnan['FLOW_future_sum'] = (
    df_2_wnan.groupby('DP_ID')['FLOW']
    .rolling(future_window, min_periods=future_window)
    .sum()
    .shift(-(future_window - 1))
    .reset_index(level=0, drop=True)
)
#create target value (mean speed)
df_2_wnan['SPEED_future_mean'] = (
    df_2_wnan.groupby('DP_ID')['SPEED_MS_AVG']
    .rolling(future_window, min_periods=future_window)
    .mean()
    .shift(-(future_window - 1))
    .reset_index(level=0, drop=True)
)
df_lagged_all_wnan = df_lagged_all_wnan.merge(
    df_2_wnan.loc[is_target, ["Datetime", "FLOW_future_sum", "SPEED_future_mean"]],
    on=['Datetime'], how='left'
)
print(df_lagged_all_wnan.shape)
# Drop rows after 09:45 (presumably their future window would run past the end of the daily
# recording period — TODO confirm)
df_lagged_all_wnan = df_lagged_all_wnan[df_lagged_all_wnan['Datetime'].dt.time <= time(9, 45)]
print(df_lagged_all_wnan.shape)


# Both targets are used as training labels, so rows missing either of them are removed
df_lagged_all_wnan = df_lagged_all_wnan.dropna(subset=["FLOW_future_sum", "SPEED_future_mean"])
print(df_lagged_all_wnan.isna().sum())




(73006, 155)
(69841, 155)
Datetime                       0
SENSOR_539_FLOW_lag_1        346
SENSOR_539_FLOW_lag_2        373
SENSOR_539_FLOW_lag_3        388
SENSOR_539_FLOW_lag_4        398
                            ... 
SENSOR_1254_SPEED_lag_15    4201
FLOW                           0
SPEED_MS_AVG                   0
FLOW_future_sum                0
SPEED_future_mean              0
Length: 155, dtype: int64


In [22]:
# Random 80/20 train-test split with a fixed seed
# NOTE(review): rows are shuffled before splitting, so temporally adjacent rows can land on both
# sides of the split. Neighbouring rows share most of their lag and future windows, so the test
# scores may be optimistic — confirm whether a chronological split is intended.
df_train, df_test = train_test_split(df_lagged_all_wnan, test_size=0.2, random_state=42)


In [23]:
# Prediction from sensors in the same portal (features may contain NaN values):
# train a flow and a speed model, save both and report RMSE / MAE / R² on the test set.

#model flow
model_flow = XGBRegressor(  # best parameters from the random search on the data without NaNs
    n_estimators=200,
    max_depth=7,
    learning_rate=0.05,
    subsample=1,
    colsample_bytree=0.8,
    random_state=42  # explicit seed, consistent with the other models in this notebook
)
model_flow.fit(df_train[flow_features_same], df_train['FLOW_future_sum'])
model_flow.save_model("model/xgb_flow_same_nan.json")

#model speed
model_speed = XGBRegressor(  # best parameters from the random search on the data without NaNs
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.6,
    colsample_bytree=0.6,
    random_state=42
)
model_speed.fit(df_train[speed_features_same], df_train['SPEED_future_mean'])
model_speed.save_model("model/xgb_speed_same_nan.json")



#predict flow values
y_pred = model_flow.predict(df_test[flow_features_same])
y_test = df_test['FLOW_future_sum']
# RMSE as the square root of the MSE: the `squared=False` keyword of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Regression (FLOW) -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

#predict speed values
y_pred = model_speed.predict(df_test[speed_features_same])
y_test = df_test['SPEED_future_mean']
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Regression (SPEED) -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

XGBoost Regression (FLOW) -> RMSE: 25.293, MAE: 19.391, R²: 0.886
XGBoost Regression (SPEED) -> RMSE: 0.706, MAE: 0.369, R²: 0.789




In [24]:
#prediction from sensors in the neighbour portal
#model flow
model_flow = XGBRegressor(   #use best parameters from randomsearch without nann
    n_estimators=200,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    random_state=42
)
model_flow.fit(df_train[flow_features_neighbour], df_train['FLOW_future_sum'])
model_flow.save_model("model/xgb_flow_neighbour_nan.json")

#model speed
model_speed = XGBRegressor(   #use best parameters from randomsearch without nann
    n_estimators=100,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    random_state=42
)
model_speed.fit(df_train[speed_features_neighbour], df_train['SPEED_future_mean'])
model_speed.save_model("model/xgb_speed_neighbour_nan.json")



#predict flow value
y_pred = model_flow.predict(df_test[flow_features_neighbour])
y_test = df_test['FLOW_future_sum']
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Regression (FLOW) -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

#predict speed values
y_pred = model_speed.predict(df_test[speed_features_neighbour])
y_test = df_test['SPEED_future_mean']
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"XGBoost Regression (SPEED) -> RMSE: {rmse:.3f}, MAE: {mae:.3f}, R²: {r2:.3f}")

XGBoost Regression (FLOW) -> RMSE: 21.319, MAE: 16.166, R²: 0.919
XGBoost Regression (SPEED) -> RMSE: 0.857, MAE: 0.390, R²: 0.690


