Author: Justus Heilingbrunner
***

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

**Disclaimer:**

This notebook is basically just a proof of concept. The model training is not done in the "right" way because the dataset is not split into training, validation, and testing setsadequately, and hyperparameters are not optimized. The selected features may not capture all relevant information needed for accurate predictions. Moreover, data preprocessing steps such as handling missing values or outliers are not thoroughly addressed in this notebook. Thus, the results should be interpreted with caution and not be used for making critical business decisions.

This model is used and was generated to first implement one model for the prediction process in the Sprottenflotte prediction tool application. Later the model continued to be used for a simple implementation and a "baseline".

In [None]:
df = pd.read_csv('Data/FelixData_final_main_kiel.csv')
df

Unnamed: 0,index,entityId,entityType,name,availableBikeNumber,freeSlotNumber,totalSlotNumber,pickups,dropoffs
0,2024-06-17T11:00:00.000+00:00,24367,BikeHireDockingStation,Anleger Dietrichsdorf,5.032787,25.000000,30.0,4627,4683
1,2024-06-17T12:00:00.000+00:00,24367,BikeHireDockingStation,Anleger Dietrichsdorf,6.762712,24.237288,30.0,4627,4683
2,2024-06-17T13:00:00.000+00:00,24367,BikeHireDockingStation,Anleger Dietrichsdorf,6.864407,24.000000,30.0,4627,4683
3,2024-06-17T14:00:00.000+00:00,24367,BikeHireDockingStation,Anleger Dietrichsdorf,5.846154,24.153846,30.0,4627,4683
4,2024-06-17T15:00:00.000+00:00,24367,BikeHireDockingStation,Anleger Dietrichsdorf,5.305556,24.694444,30.0,4627,4683
...,...,...,...,...,...,...,...,...,...
86496,2024-09-30T19:00:00.000+00:00,26889,BikeHireDockingStation,Wilhelmplatz,1.066667,28.833333,30.0,14328,14285
86497,2024-09-30T20:00:00.000+00:00,26889,BikeHireDockingStation,Wilhelmplatz,0.000000,30.000000,30.0,14328,14285
86498,2024-09-30T21:00:00.000+00:00,26889,BikeHireDockingStation,Wilhelmplatz,0.000000,30.000000,30.0,14328,14285
86499,2024-09-30T22:00:00.000+00:00,26889,BikeHireDockingStation,Wilhelmplatz,0.000000,30.000000,30.0,14328,14285


In [None]:
# make dataframe for training

df['time_utc'] = pd.to_datetime(df['index'])

# Extract year, month, day, and hour
df['Year'] = df['time_utc'].dt.year
df['Month'] = df['time_utc'].dt.month
df['Day'] = df['time_utc'].dt.day
df['Hour'] = df['time_utc'].dt.hour

# Now create a DataFrame with month, day, hour, and available bike number
result_df = df[['entityId', 'Month', 'Day', 'Hour', 'availableBikeNumber']]
result_df

Unnamed: 0,entityId,Month,Day,Hour,availableBikeNumber
0,24367,6,17,11,5.032787
1,24367,6,17,12,6.762712
2,24367,6,17,13,6.864407
3,24367,6,17,14,5.846154
4,24367,6,17,15,5.305556
...,...,...,...,...,...
86496,26889,9,30,19,1.066667
86497,26889,9,30,20,0.000000
86498,26889,9,30,21,0.000000
86499,26889,9,30,22,0.000000


In [None]:
result_df.shape

(86501, 5)

In [None]:
# Skaliere die Features
scaler = StandardScaler()

scaled_features = scaler.fit_transform(result_df[['Month', 'Day', 'Hour', 'availableBikeNumber']])
scaled_features

array([[-1.70788759,  0.00947177, -0.08192369,  0.00828915],
       [-1.70788759,  0.00947177,  0.06316622,  0.26236968],
       [-1.70788759,  0.00947177,  0.20825613,  0.27730599],
       ...,
       [ 1.22024371,  1.54709698,  1.36897543, -0.73089503],
       [ 1.22024371,  1.54709698,  1.51406534, -0.73089503],
       [ 1.22024371,  1.54709698,  1.65915526, -0.73089503]])

In [None]:
X, y = [], []
# Loop durch eindeutige Stations-IDs
for station_id in result_df['entityId'].unique():
    station_data = scaled_features[result_df['entityId'] == station_id]

    # Beginnen mit dem Fensterungsprozess
    for i in range(len(station_data) - 29):  # 29 = 24 Stunden für Eingabe + 5 Stunden für Vorhersage
        # Die letzten 24 Stunden in den Eingabedaten
        X.append(station_data[i:i + 24])  # Nutze die letzten 24 Stunden als Eingabe
        # Die nächsten 5 Stunden für ‘availableBikeNumber‘ als Ausgabe
        y.append(station_data[i + 24:i + 29, 3]) # 3 stellt den Index für 'availableBikeNumber' dar

X, y = np.array(X), np.array(y)

In [None]:
# X

In [None]:
X.shape

(85457, 24, 4)

In [None]:
# y

In [None]:
y.shape

(85457, 5)

In [None]:
# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.shape

(68365, 24, 4)

In [None]:
y_train.shape

(68365, 5)

In [None]:
X_train_flat = X_train.reshape(X_train.shape[0], -1)  # Flatten der 3D Eingaben für das Training
X_train_flat.shape

(68365, 96)

In [None]:
# Training des Random Forest Regressors
model = RandomForestRegressor(n_estimators=30, random_state=42)

model.fit(X_train_flat, y_train)

In [None]:
import joblib
# Modell speichern
joblib.dump(model, 'model_rf.joblib', compress=3)

# Scaler speichern
joblib.dump(scaler, 'scaler_rf.joblib')

['scaler_rf.joblib']

testing the model

In [None]:
model_rf = joblib.load('model_rf.joblib')

scaler_rf = joblib.load('scaler_rf.joblib')

In [None]:
# Beispiel-Daten für 24 Stunden
np.random.seed(42)  # Für reproduzierbare Ergebnisse
day = 15  # Ein bestimmter Tag des Monats
month = 6  # Ein bestimmter Monat (Juni)

new_data = pd.DataFrame({
    'Month': [month] * 24,  # Monat bleibt konstant
    'Day': [day] * 24,  # Tag bleibt konstant
    'Hour': np.arange(24),  # Stunden von 0 bis 23 für einen vollständigen Tag
    'availableBikeNumber': np.random.randint(0, 50, size=24)  # Zufallszahlen für verfügbare Fahrräder
})

print("New Data:")
new_data


New Data:


Unnamed: 0,Month,Day,Hour,availableBikeNumber
0,6,15,0,38
1,6,15,1,28
2,6,15,2,14
3,6,15,3,42
4,6,15,4,7
5,6,15,5,20
6,6,15,6,38
7,6,15,7,18
8,6,15,8,22
9,6,15,9,10


In [None]:
# Daten vorverarbeiten (z. B. Skalierung)
new_data_scaled = scaler_rf.transform(new_data)

# Modell laden (falls es gespeichert ist)
# model = load_model('model_path')

# Vorhersagen generieren
new_data_scaled_flat = new_data_scaled.flatten().reshape(1, -1)  # Modell benötigt flache Eingabeform

predictions_scaled = model_rf.predict(new_data_scaled_flat)

# Inverse Transformation zur Originalskala
preds = predictions_scaled.flatten()

feature_index = 3  
num_features = new_data.shape[1]

dummy_matrix = np.zeros((preds.shape[0], num_features))
dummy_matrix[:, feature_index] = preds

predictions_original_scale = scaler_rf.inverse_transform(dummy_matrix)[:, feature_index]

data_original_scale = new_data['availableBikeNumber'].tolist()

In [None]:
print("\nData (Original Scale):")
print(data_original_scale)
print('')
print("\nPredictions (Original Scale):")
print(predictions_original_scale)


Data (Original Scale):
[38, 28, 14, 42, 7, 20, 38, 18, 22, 10, 10, 23, 35, 39, 23, 2, 21, 1, 23, 43, 29, 37, 1, 20]


Predictions (Original Scale):
[20.14520818 18.98843789 18.62812881 17.88744821 16.68366491]


In [None]:
predictions_original_scale.tolist()

[20.14520818035137,
 18.988437885576985,
 18.628128809334626,
 17.887448210922784,
 16.683664906918587]

In [None]:
new_data_scaled.shape

(24, 4)

In [None]:
new_data_scaled_flat.shape

(1, 96)

In [None]:
predictions_scaled.shape

(1, 5)

In [None]:
preds.shape

(5,)

In [None]:
dummy_matrix

array([[0.        , 0.        , 0.        , 2.22790681],
       [0.        , 0.        , 0.        , 2.05800764],
       [0.        , 0.        , 0.        , 2.0050877 ],
       [0.        , 0.        , 0.        , 1.89630118],
       [0.        , 0.        , 0.        , 1.71949704]])