In [24]:
#imports

import pandas as pd
import numpy as np

Now that we have processed the HSL bike data, the next step is to process the weather data and merge it with the HSL data into a single dataframe. 

In [25]:
# Read one weather CSV per year (2016 through 2021) and stack them into a
# single frame; ignore_index=True gives the result a fresh 0..n-1 index.
yearly_frames = [
    pd.read_csv(f"./data/weather/{yr}-weather.csv")
    for yr in range(2016, 2022)
]

weather_data = pd.concat(yearly_frames, ignore_index=True)
display(weather_data)


Unnamed: 0,Year,m,d,Time,Time zone,Precipitation amount (mm),Air temperature (degC),Maximum temperature (degC),Minimum temperature (degC)
0,2016,5,1,00:00,UTC,-1.0,11.1,16.4,5.1
1,2016,5,2,00:00,UTC,-1.0,12.1,17.8,5.9
2,2016,5,3,00:00,UTC,-1.0,11.9,17.7,4.6
3,2016,5,4,00:00,UTC,-1.0,14.3,20.4,5.7
4,2016,5,5,00:00,UTC,-1.0,15.5,21.5,8.9
...,...,...,...,...,...,...,...,...,...
1219,2021,10,27,00:00,UTC,7.5,7.8,9.3,6.2
1220,2021,10,28,00:00,UTC,1.8,8.2,10.6,4.3
1221,2021,10,29,00:00,UTC,-1.0,9.8,10.9,8.3
1222,2021,10,30,00:00,UTC,-1.0,8.7,9.4,7.3


Now that we have processed the weather data into a dataframe, the next step is to get rid of unnecessary columns and to combine the year, month, and day columns into a single "Date" column.

In [26]:
# Clean the weather frame in one chain:
#   1. drop the 'Time' and 'Time zone' columns,
#   2. build a 'Date' column (datetime.date objects) from Year / m / d,
#   3. re-base 'Year' so that 2016 becomes 0,
#   4. give the month/day columns readable names,
#   5. discard the first row.
# NOTE: this cell overwrites weather_data, so re-run the load cell before
# re-running it (otherwise the drop fails because the columns are already gone).
# NOTE(review): the reason for discarding the first row (step 5) is not visible
# here -- presumably the bike data has no rows for that date; confirm.
weather_data = (
    weather_data
    .drop(columns=['Time', 'Time zone'])
    .assign(
        # assign() evaluates keywords in order, so Date is built from the
        # original calendar year before Year is re-based below.
        Date=lambda w: pd.to_datetime(
            w['Year'].astype(str) + '-' + w['m'].astype(str) + '-' + w['d'].astype(str),
            format='%Y-%m-%d',
        ).dt.date,
        Year=lambda w: w['Year'] - 2016,
    )
    .rename(columns={'m': 'Month', 'd': 'Day'})
    .iloc[1:]
)

display(weather_data)

Unnamed: 0,Year,Month,Day,Precipitation amount (mm),Air temperature (degC),Maximum temperature (degC),Minimum temperature (degC),Date
1,0,5,2,-1.0,12.1,17.8,5.9,2016-05-02
2,0,5,3,-1.0,11.9,17.7,4.6,2016-05-03
3,0,5,4,-1.0,14.3,20.4,5.7,2016-05-04
4,0,5,5,-1.0,15.5,21.5,8.9,2016-05-05
5,0,5,6,-1.0,14.8,19.7,8.1,2016-05-06
...,...,...,...,...,...,...,...,...
1219,5,10,27,7.5,7.8,9.3,6.2,2021-10-27
1220,5,10,28,1.8,8.2,10.6,4.3,2021-10-28
1221,5,10,29,-1.0,9.8,10.9,8.3,2021-10-29
1222,5,10,30,-1.0,8.7,9.4,7.3,2021-10-30


Now that we have both the processed HSL bike data and the processed weather data, we can combine the two dataframes by joining them on the date.

In [27]:
# Load the processed HSL bike data (one row per station and date).
bike_data = pd.read_csv('./data/HSL/processed_bike_data.csv')

# Convert the join key to datetime.date objects so it has the same type as
# weather_data['Date'].
bike_data['Date'] = pd.to_datetime(bike_data['Date'], format='%Y-%m-%d').dt.date

# Left join: every bike row is kept, and dates with no weather record get NaN
# in the weather columns. validate='many_to_one' makes pandas raise a
# MergeError if any date appears more than once in weather_data, instead of
# silently duplicating bike rows.
finalized_data = pd.merge(
    bike_data,
    weather_data,
    on='Date',
    how='left',
    validate='many_to_one',
)

display(finalized_data)


Unnamed: 0,Station,Date,Departure Count,Return Count,Year,Month,Day,Precipitation amount (mm),Air temperature (degC),Maximum temperature (degC),Minimum temperature (degC)
0,Linnanmäki,2016-05-02,11.0,15.0,0,5,2,-1.0,12.1,17.8,5.9
1,Workshop Helsinki,2016-05-02,14.0,13.0,0,5,2,-1.0,12.1,17.8,5.9
2,Kampin metroasema,2016-05-02,17.0,12.0,0,5,2,-1.0,12.1,17.8,5.9
3,Laivasillankatu,2016-05-02,3.0,3.0,0,5,2,-1.0,12.1,17.8,5.9
4,Merisotilaantori,2016-05-02,2.0,3.0,0,5,2,-1.0,12.1,17.8,5.9
...,...,...,...,...,...,...,...,...,...,...,...
317556,Koivusaari (M),2021-10-31,7.0,10.0,5,10,31,-1.0,8.7,9.6,7.1
317557,Komeetankatu,2021-10-31,8.0,8.0,5,10,31,-1.0,8.7,9.6,7.1
317558,Toppelundintie,2021-10-31,1.0,4.0,5,10,31,-1.0,8.7,9.6,7.1
317559,Torpanranta,2021-10-31,8.0,9.0,5,10,31,-1.0,8.7,9.6,7.1


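Now we have our dataframe that includes data regarding the HSL bikes and weather. The last thing we want to do is to add columns to our dataframe that can be used to help our model: here we add a `Weekend` indicator (1 for Saturday or Sunday, 0 otherwise) and then drop the `Date` column.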

In [28]:
# Weekend indicator: 1 for Saturday/Sunday, 0 otherwise.
# dayofweek numbers Monday as 0 and Sunday as 6, so >= 5 selects the weekend.
# This is vectorized and does not depend on the locale, unlike comparing
# strftime("%A") output against English day names.
finalized_data['Weekend'] = (
    pd.to_datetime(finalized_data['Date'])
    .dt.dayofweek
    .ge(5)
    .astype('int64')
)

# Year, Month, Day and Weekend now carry the calendar information, so the
# Date column itself is dropped.
finalized_data = finalized_data.drop(columns=['Date'])

display(finalized_data)


Unnamed: 0,Station,Departure Count,Return Count,Year,Month,Day,Precipitation amount (mm),Air temperature (degC),Maximum temperature (degC),Minimum temperature (degC),Weekend
0,Linnanmäki,11.0,15.0,0,5,2,-1.0,12.1,17.8,5.9,0
1,Workshop Helsinki,14.0,13.0,0,5,2,-1.0,12.1,17.8,5.9,0
2,Kampin metroasema,17.0,12.0,0,5,2,-1.0,12.1,17.8,5.9,0
3,Laivasillankatu,3.0,3.0,0,5,2,-1.0,12.1,17.8,5.9,0
4,Merisotilaantori,2.0,3.0,0,5,2,-1.0,12.1,17.8,5.9,0
...,...,...,...,...,...,...,...,...,...,...,...
317556,Koivusaari (M),7.0,10.0,5,10,31,-1.0,8.7,9.6,7.1,1
317557,Komeetankatu,8.0,8.0,5,10,31,-1.0,8.7,9.6,7.1,1
317558,Toppelundintie,1.0,4.0,5,10,31,-1.0,8.7,9.6,7.1,1
317559,Torpanranta,8.0,9.0,5,10,31,-1.0,8.7,9.6,7.1,1


Then we can save it to a CSV file.

In [29]:
# Write the merged bike + weather table to disk; index=False leaves the
# row index out of the file.
output_path = './data/finalized_data.csv'
finalized_data.to_csv(output_path, index=False)