# Pre-process the data before model training

In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the source CSV into a DataFrame and preview its first rows
data_path = "../data/london_merged.csv"
df = pd.read_csv(filepath_or_buffer=data_path)
df.head()  # head() defaults to the first 5 rows

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


In [16]:
# Replace the terse source column names with self-describing ones
df = df.rename(
    columns=dict(
        cnt="number_of_rentals",
        t1="temp_C",
        t2="perceived_temp_C",
        hum="humidity_percent",
        wind_speed="wind_speed_kmh",
    )
)
df.head()

Unnamed: 0,timestamp,number_of_rentals,temp_C,perceived_temp_C,humidity_percent,wind_speed_kmh,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0


# Convert data types

In [18]:
# Parse the timestamp strings, derive calendar features from them, and then
# discard the timestamp column itself.
# Note: the year is not extracted, so it is no longer available afterwards.
df = df.assign(
    timestamp=pd.to_datetime(df["timestamp"]),
    month=lambda frame: frame["timestamp"].dt.month,
    day=lambda frame: frame["timestamp"].dt.day,
    weekday=lambda frame: frame["timestamp"].dt.weekday,  # Monday=0, Sunday=6
    hour=lambda frame: frame["timestamp"].dt.hour,
).drop(columns=["timestamp"])

df.head()

Unnamed: 0,number_of_rentals,temp_C,perceived_temp_C,humidity_percent,wind_speed_kmh,weather_code,is_holiday,is_weekend,season,month,day,weekday,hour
0,182,3.0,2.0,93.0,6.0,3,0,1,3,1,4,6,0
1,138,3.0,2.5,93.0,5.0,1,0,1,3,1,4,6,1
2,134,2.5,2.5,96.5,0.0,1,0,1,3,1,4,6,2
3,72,2.0,2.0,100.0,0.0,1,0,1,3,1,4,6,3
4,47,2.0,0.0,93.0,6.5,1,0,1,3,1,4,6,4


In [19]:
# Columns to convert
columns_to_convert = ["season", "is_holiday", "is_weekend", "weather_code"]

# Cast each listed column to int
df = df.astype({column: int for column in columns_to_convert})

# Check the data types after the conversion
print("Data types after conversion:")
print(df.dtypes)

Data types after conversion:
number_of_rentals      int64
temp_C               float64
perceived_temp_C     float64
humidity_percent     float64
wind_speed_kmh       float64
weather_code           int64
is_holiday             int64
is_weekend             int64
season                 int64
month                  int32
day                    int32
weekday                int32
hour                   int32
dtype: object


In [20]:
# Desired column order
new_order = [
    "number_of_rentals",
    "season", "month", "day", "weekday", "hour",
    "is_holiday", "is_weekend",
    "temp_C", "perceived_temp_C", "humidity_percent", "wind_speed_kmh",
    "weather_code",
]

# Reorder the columns
df = df.loc[:, new_order]
df.head()

Unnamed: 0,number_of_rentals,season,month,day,weekday,hour,is_holiday,is_weekend,temp_C,perceived_temp_C,humidity_percent,wind_speed_kmh,weather_code
0,182,3,1,4,6,0,0,1,3.0,2.0,93.0,6.0,3
1,138,3,1,4,6,1,0,1,3.0,2.5,93.0,5.0,1
2,134,3,1,4,6,2,0,1,2.5,2.5,96.5,0.0,1
3,72,3,1,4,6,3,0,1,2.0,2.0,100.0,0.0,1
4,47,3,1,4,6,4,0,1,2.0,0.0,93.0,6.5,1


In [21]:
# Write the DataFrame to CSV, omitting the index column
output_path = "../data/transformed_london_merged.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"DataFrame saved to {output_path}")

DataFrame saved to ../data/transformed_london_merged.csv
