<a href="https://colab.research.google.com/github/LuisaPolicarpo/Berlin-Bike-Sharing/blob/main/Berlin_Bike_Sharing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
# --- Connect to CityBikes API ---
# Download the current snapshot of the nextbike Berlin network.
url = "http://api.citybik.es/v2/networks/nextbike-berlin"

# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error instead of silently skipping the load, which
# would leave `df_stations` undefined and surface later as a confusing NameError.
response.raise_for_status()
data = response.json()

# Extract station data (a list with one record per station)
stations = data['network']['stations']

# Create a DataFrame with one row per station
df_stations = pd.DataFrame(stations)


**Exploratory Data Analysis (EDA)**

In [3]:
# Overview of the raw station snapshot: column dtypes and non-null counts.
df_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3827 entries, 0 to 3826
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           3827 non-null   object 
 1   name         3827 non-null   object 
 2   latitude     3827 non-null   float64
 3   longitude    3827 non-null   float64
 4   timestamp    3827 non-null   object 
 5   free_bikes   3827 non-null   int64  
 6   empty_slots  3826 non-null   float64
 7   extra        3827 non-null   object 
dtypes: float64(3), int64(1), object(4)
memory usage: 239.3+ KB


In [4]:
# Summary statistics for the numeric columns (coordinates and bike/slot counts).
df_stations.describe()

Unnamed: 0,latitude,longitude,free_bikes,empty_slots
count,3827.0,3827.0,3827.0,3826.0
mean,52.502331,13.394178,0.715443,4.01333
std,0.036156,0.095359,1.64753,2.492778
min,52.387091,13.124609,0.0,0.0
25%,52.479977,13.32791,0.0,3.0
50%,52.504095,13.388229,0.0,4.0
75%,52.526268,13.450316,1.0,4.0
max,52.636179,13.721039,63.0,33.0


In [5]:
# Count missing values per column to decide what needs imputing.
df_stations.isna().sum()

Unnamed: 0,0
id,0
name,0
latitude,0
longitude,0
timestamp,0
free_bikes,0
empty_slots,1
extra,0


In [6]:
# Treat a missing bike/slot count as zero so the derived totals stay numeric.
for count_col in ('free_bikes', 'empty_slots'):
    df_stations[count_col] = df_stations[count_col].fillna(0)

In [7]:
# Capacity of a station = bikes currently docked + slots currently empty.
df_stations['total_slots'] = df_stations['free_bikes'].add(df_stations['empty_slots'])

# Share of the capacity that is occupied by bikes, rounded to two decimals.
# Stations whose total is 0 produce NaN here (0 / 0).
df_stations['occupancy_rate'] = (
    df_stations['free_bikes'].div(df_stations['total_slots']).round(2)
)

In [8]:
# Preview the first rows, including the new total_slots / occupancy_rate columns.
df_stations.head()

Unnamed: 0,id,name,latitude,longitude,timestamp,free_bikes,empty_slots,extra,total_slots,occupancy_rate
0,000c1394c49338dd136955cfa83f0f02,virtuell - U Ullsteinstraße (Staubenrauchbrücke),52.453146,13.384738,2025-05-02T19:57:36.044065+00:00Z,0,6.0,"{'uid': '134099782', 'number': '10209', 'slots...",6.0,0.0
1,000d6738d33fd09713f9ba3d645eb083,virtuell - Olivaer Platz (Nordwest),52.499709,13.313215,2025-05-02T19:57:36.080965+00:00Z,0,4.0,"{'uid': '136246186', 'number': '10445', 'slots...",4.0,0.0
2,00262119a775f009135c009c642de971,Katzbachstraße,52.487904,13.376718,2025-05-02T19:57:35.963132+00:00Z,4,11.0,"{'uid': '3173794', 'number': '1449', 'slots': ...",15.0,0.27
3,0029bd4fba89c502af0c1732591080cd,virtuell - Greifswalder Straße/Lehderstraße,52.54714,13.44764,2025-05-02T19:57:36.101939+00:00Z,0,4.0,"{'uid': '140006670', 'number': '11039', 'slots...",4.0,0.0
4,00455239920d92cdc60461f8b8619fac,virtuell - Linienstraße/Weydingerstraße,52.527005,13.414112,2025-05-02T19:57:36.090127+00:00Z,2,2.0,"{'uid': '138056052', 'number': '10740', 'slots...",4.0,0.5


In [9]:
# List the available columns before choosing which ones to drop.
df_stations.columns

Index(['id', 'name', 'latitude', 'longitude', 'timestamp', 'free_bikes',
       'empty_slots', 'extra', 'total_slots', 'occupancy_rate'],
      dtype='object')

In [10]:
# Drop the opaque station hash and the nested `extra` payload.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df_stations = df_stations.drop(columns=['id', 'extra'], errors='ignore')

In [11]:
# Display the trimmed frame; the rendered output shows only the first and last rows.
df_stations

Unnamed: 0,name,latitude,longitude,timestamp,free_bikes,empty_slots,total_slots,occupancy_rate
0,virtuell - U Ullsteinstraße (Staubenrauchbrücke),52.453146,13.384738,2025-05-02T19:57:36.044065+00:00Z,0,6.0,6.0,0.00
1,virtuell - Olivaer Platz (Nordwest),52.499709,13.313215,2025-05-02T19:57:36.080965+00:00Z,0,4.0,4.0,0.00
2,Katzbachstraße,52.487904,13.376718,2025-05-02T19:57:35.963132+00:00Z,4,11.0,15.0,0.27
3,virtuell - Greifswalder Straße/Lehderstraße,52.547140,13.447640,2025-05-02T19:57:36.101939+00:00Z,0,4.0,4.0,0.00
4,virtuell - Linienstraße/Weydingerstraße,52.527005,13.414112,2025-05-02T19:57:36.090127+00:00Z,2,2.0,4.0,0.50
...,...,...,...,...,...,...,...,...
3822,virtuell - Schellingstraße (Reichpietschufer),52.505303,13.372947,2025-05-02T19:57:36.041083+00:00Z,0,4.0,4.0,0.00
3823,virtuell - Gerichtstraße/Adolfstraße,52.544916,13.367097,2025-05-02T19:57:36.141230+00:00Z,0,4.0,4.0,0.00
3824,virtuell - Zehlendorf Eiche (Clayallee),52.434834,13.260059,2025-05-02T19:57:36.235195+00:00Z,0,4.0,4.0,0.00
3825,virtuell - Schwedter Straße/Choriner Straße,52.534172,13.408529,2025-05-02T19:57:36.178359+00:00Z,0,4.0,4.0,0.00


In [12]:
# Build a synthetic availability dataset: 30 noisy samples for every
# (station, day of week, hour) combination. The seed fixes the random draws
# so the generated frame is reproducible.
np.random.seed(42)
stations = [
    "Alexanderplatz",
    "Hauptbahnhof",
    "Friedrichstrasse",
    "Kreuzberg Gorlitzer",
    "Prenzlauer Allee",
]


def _simulate_count(day, hour):
    """Return one simulated free-bike count for the given day and hour.

    The count is a sinusoidal hourly level plus Gaussian noise, lowered by
    one on days 5 and 6, clipped at zero and rounded to an integer.
    """
    hourly_level = 5 + np.sin(hour / 3.0) * 3
    noise = np.random.normal(0, 1.5)
    weekend_shift = -1 if day in (5, 6) else 0
    return round(max(0, hourly_level + noise + weekend_shift))


# The iteration order (station -> day -> hour -> repeat) determines which
# random draw lands in which row, so it must stay exactly as written.
data = [
    [station, day, hour, _simulate_count(day, hour)]
    for station in stations
    for day in range(7)
    for hour in range(24)
    for _ in range(30)
]

df = pd.DataFrame(data, columns=["station", "day_of_week", "hour", "free_bikes"])
df.head()

Unnamed: 0,station,day_of_week,hour,free_bikes
0,Alexanderplatz,0,0,6
1,Alexanderplatz,0,0,5
2,Alexanderplatz,0,0,6
3,Alexanderplatz,0,0,7
4,Alexanderplatz,0,0,5


In [13]:
# Check the row count and dtypes of the generated training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25200 entries, 0 to 25199
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   station      25200 non-null  object
 1   day_of_week  25200 non-null  int64 
 2   hour         25200 non-null  int64 
 3   free_bikes   25200 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 787.6+ KB


In [14]:
# Sanity check: distinct day-of-week values present in the generated data.
df['day_of_week'].unique()

array([0, 1, 2, 3, 4, 5, 6])

In [15]:
# Sanity check: distinct hour values present in the generated data.
df['hour'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [22]:
# Replace each station name with an integer code for the model.
# `station_classes` holds the unique names, so station_classes[code] gives
# back the name for a given code.
df["station_encoded"], station_classes = pd.factorize(df["station"])

In [23]:
# 🧪 Train/test split
# Features: encoded station, day of week and hour. Target: free bike count.
X = df.loc[:, ["station_encoded", "day_of_week", "hour"]]
y = df.loc[:, "free_bikes"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
# Fit a random forest regressor (100 trees) on the training split; the fixed
# random_state makes the fitted model reproducible across runs.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [25]:
# 📈 Evaluate the model on the held-out split
y_pred = model.predict(X_test)

# Mean squared error (lower is better) and R² (share of variance explained).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.2f}",
    sep="\n",
)

Mean Squared Error: 2.33
R² Score: 0.64


In [26]:
# 💾 7. Save the model & encoder (optional, for Streamlit)
# Persist the fitted model to a file in the current working directory.
joblib.dump(model, "bike_model.pkl")
# Write the station names in code order (the i-th name corresponds to
# station_encoded == i) so a consumer can map names back to the codes.
pd.Series(station_classes).to_csv("station_classes.csv", index=False)