In [1]:
# imports
import requests
import pandas as pd
import numpy as np
import datetime

This notebook was used to research the code need to get the weather forecast data to be able to give future predictions on bike availability

Two foresecasts are available from the open weather maps API:

- 2 day forecast, with 1 hour intervals
- 5 day forecast, with 3 hour intervals

In [2]:
# get 2 day forecast, data is at 1 hour intervals
weatherTwoDay = requests.get("https://api.openweathermap.org/data/2.5/onecall?lat=53.346&lon=6.26986&exclude=current,minutely&appid=98310ef86bbb250277915291623ed079")
twoDayJSON = weatherTwoDay.json()
twoDayForecast = pd.json_normalize(twoDayJSON, record_path =['hourly'])
twoDayForecast = twoDayForecast[['temp', 'wind_speed', 'humidity', 'dt']]
twoDayForecast['dt'] = pd.to_datetime(twoDayForecast['dt'], unit='s')

# get 5 day forecast, data is at 3 hour intervals
weatherFiveDay = requests.get("http://api.openweathermap.org/data/2.5/forecast?lat=53.346&lon=-6.26986&appid=98310ef86bbb250277915291623ed079")
fiveDayJSON = weatherFiveDay.json()
fiveDayForecast = pd.json_normalize(fiveDayJSON, record_path =['list'])
fiveDayForecast = fiveDayForecast[['main.temp','wind.speed', 'main.humidity', 'dt']]
forcast_fiveday = pd.json_normalize(fiveDayJSON, record_path = ['list', 'weather'])['main']
fiveDayForecast = pd.concat([forcast_fiveday, fiveDayForecast], axis=1)
fiveDayForecast = fiveDayForecast.rename(columns={"main.temp": "temp", "wind.speed": "wind_speed", "main.humidity": "humidity"})
fiveDayForecast['dt'] = pd.to_datetime(fiveDayForecast['dt'], unit='s')

In [3]:
twoDayForecast.head()

Unnamed: 0,temp,wind_speed,humidity,dt
0,283.14,5.33,61,2021-04-15 12:00:00
1,282.66,5.72,59,2021-04-15 13:00:00
2,282.09,5.91,57,2021-04-15 14:00:00
3,281.33,5.84,56,2021-04-15 15:00:00
4,280.3,5.59,57,2021-04-15 16:00:00


In [4]:
fiveDayForecast.head()

Unnamed: 0,main,temp,wind_speed,humidity,dt
0,Clouds,284.44,3.83,64,2021-04-15 15:00:00
1,Clouds,282.81,3.01,70,2021-04-15 18:00:00
2,Clouds,279.41,2.5,81,2021-04-15 21:00:00
3,Clouds,276.88,2.2,84,2021-04-16 00:00:00
4,Clear,275.77,1.89,83,2021-04-16 03:00:00


In [5]:
fiveDayForecast

Unnamed: 0,main,temp,wind_speed,humidity,dt
0,Clouds,284.44,3.83,64,2021-04-15 15:00:00
1,Clouds,282.81,3.01,70,2021-04-15 18:00:00
2,Clouds,279.41,2.5,81,2021-04-15 21:00:00
3,Clouds,276.88,2.2,84,2021-04-16 00:00:00
4,Clear,275.77,1.89,83,2021-04-16 03:00:00
5,Clouds,276.2,1.68,80,2021-04-16 06:00:00
6,Clouds,281.8,2.76,61,2021-04-16 09:00:00
7,Clouds,284.48,3.73,53,2021-04-16 12:00:00
8,Clouds,285.14,4.05,55,2021-04-16 15:00:00
9,Clouds,283.39,3.04,65,2021-04-16 18:00:00


## interpolating data

Our model is trained at 30 minute intervals, so we need to interpolate the forescast data to have it too at 30 min intervals

The original plan had been to use the 2 day forecast for the next 2 days, and the 5 day forecast for the remaining 3. However, it turns out the 2 day and 5 day forecasts are very different for the same timestamps, so rather than collating the data as I had planned I will intead just interpolate rows on the 5 day forecast to get a forecast at 30 min intervals

In [6]:
thirtyMinuteDF = pd.DataFrame(columns=['main', 'temp', 'wind_speed', 'humidity', 'dt'])

for index, row in fiveDayForecast.iterrows():
    df = pd.DataFrame([[row['main'], row['temp'], row['wind_speed'], row['humidity'], row['dt']]], columns=['main', 'temp', 'wind_speed', 'humidity', 'dt'])
    thirtyMinuteDF = thirtyMinuteDF.append(df, ignore_index=True)
    for i in range(30, 180, 30):
        new_time = row['dt'] + datetime.timedelta(minutes=i)
        df = pd.DataFrame([[row['main'], np.nan, np.nan, np.nan, new_time]], columns=['main', 'temp', 'wind_speed', 'humidity', 'dt'])
        thirtyMinuteDF = thirtyMinuteDF.append(df, ignore_index=True)
        

In [7]:
thirtyMinuteDF

Unnamed: 0,main,temp,wind_speed,humidity,dt
0,Clouds,284.44,3.83,64,2021-04-15 15:00:00
1,Clouds,,,,2021-04-15 15:30:00
2,Clouds,,,,2021-04-15 16:00:00
3,Clouds,,,,2021-04-15 16:30:00
4,Clouds,,,,2021-04-15 17:00:00
...,...,...,...,...,...
235,Clouds,,,,2021-04-20 12:30:00
236,Clouds,,,,2021-04-20 13:00:00
237,Clouds,,,,2021-04-20 13:30:00
238,Clouds,,,,2021-04-20 14:00:00


There was an issue with type of 'humidity' column here, preventing interpolating. so it has to be set to numeric

In [8]:
thirtyMinuteDF.dtypes

main                  object
temp                 float64
wind_speed           float64
humidity              object
dt            datetime64[ns]
dtype: object

In [9]:
thirtyMinuteDF['humidity'] = pd.to_numeric(thirtyMinuteDF['humidity'])

In [10]:
thirtyMinuteDF['temp'] = thirtyMinuteDF['temp'].interpolate()
thirtyMinuteDF['wind_speed'] = thirtyMinuteDF['wind_speed'].interpolate()
thirtyMinuteDF['humidity'] = thirtyMinuteDF['humidity'].interpolate()

In [11]:
thirtyMinuteDF = thirtyMinuteDF.round({'temp': 2, 'wind_speed': 2, 'humidity': 1})

In [12]:
thirtyMinuteDF

Unnamed: 0,main,temp,wind_speed,humidity,dt
0,Clouds,284.44,3.83,64.0,2021-04-15 15:00:00
1,Clouds,284.17,3.69,65.0,2021-04-15 15:30:00
2,Clouds,283.90,3.56,66.0,2021-04-15 16:00:00
3,Clouds,283.62,3.42,67.0,2021-04-15 16:30:00
4,Clouds,283.35,3.28,68.0,2021-04-15 17:00:00
...,...,...,...,...,...
235,Clouds,283.39,5.71,54.0,2021-04-20 12:30:00
236,Clouds,283.39,5.71,54.0,2021-04-20 13:00:00
237,Clouds,283.39,5.71,54.0,2021-04-20 13:30:00
238,Clouds,283.39,5.71,54.0,2021-04-20 14:00:00


### encoding data

Next we need to encode the data so that it matches the training data used

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [24]:
categories = np.array(['Clear', 'Clouds', 'Drizzle', 'Mist', 'Rain', 'Snow']).reshape(-1,1)
type_encoder = OneHotEncoder().fit(categories)
type_encoded = type_encoder.transform(np.array(thirtyMinuteDF["main"]).reshape(-1,1))
type_encoded = pd.DataFrame(type_encoded.toarray(), columns = [category for category in type_encoder.categories_[0]])
temp = thirtyMinuteDF.reset_index(drop=True)
encodedThirtyMinuteDF = pd.concat([type_encoded, temp[["temp", "wind_speed", "humidity", "dt"]]], axis = 1)

In [25]:
encodedThirtyMinuteDF

Unnamed: 0,Clear,Clouds,Drizzle,Mist,Rain,Snow,temp,wind_speed,humidity,dt
0,0.0,1.0,0.0,0.0,0.0,0.0,284.44,3.83,64.0,2021-04-15 15:00:00
1,0.0,1.0,0.0,0.0,0.0,0.0,284.17,3.69,65.0,2021-04-15 15:30:00
2,0.0,1.0,0.0,0.0,0.0,0.0,283.90,3.56,66.0,2021-04-15 16:00:00
3,0.0,1.0,0.0,0.0,0.0,0.0,283.62,3.42,67.0,2021-04-15 16:30:00
4,0.0,1.0,0.0,0.0,0.0,0.0,283.35,3.28,68.0,2021-04-15 17:00:00
...,...,...,...,...,...,...,...,...,...,...
235,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 12:30:00
236,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 13:00:00
237,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 13:30:00
238,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 14:00:00


In [26]:
encodedThirtyMinuteDF["dayOfWeek"] = encodedThirtyMinuteDF["dt"].dt.weekday
encodedThirtyMinuteDF["hour"] = encodedThirtyMinuteDF["dt"].dt.hour
encodedThirtyMinuteDF["minute"] = encodedThirtyMinuteDF["dt"].dt.minute


In [27]:
encodedThirtyMinuteDF

Unnamed: 0,Clear,Clouds,Drizzle,Mist,Rain,Snow,temp,wind_speed,humidity,dt,dayOfWeek,hour,minute
0,0.0,1.0,0.0,0.0,0.0,0.0,284.44,3.83,64.0,2021-04-15 15:00:00,3,15,0
1,0.0,1.0,0.0,0.0,0.0,0.0,284.17,3.69,65.0,2021-04-15 15:30:00,3,15,30
2,0.0,1.0,0.0,0.0,0.0,0.0,283.90,3.56,66.0,2021-04-15 16:00:00,3,16,0
3,0.0,1.0,0.0,0.0,0.0,0.0,283.62,3.42,67.0,2021-04-15 16:30:00,3,16,30
4,0.0,1.0,0.0,0.0,0.0,0.0,283.35,3.28,68.0,2021-04-15 17:00:00,3,17,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 12:30:00,1,12,30
236,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 13:00:00,1,13,0
237,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 13:30:00,1,13,30
238,0.0,1.0,0.0,0.0,0.0,0.0,283.39,5.71,54.0,2021-04-20 14:00:00,1,14,0


In [19]:
encodedThirtyMinuteDF.head()

Unnamed: 0,Clear,Clouds,Drizzle,Mist,Rain,Snow,temp,wind_speed,humidity,dt,dayOfWeek,hour,minute
0,0.0,1.0,0.0,0.0,0.0,0.0,284.44,3.83,64.0,2021-04-15 15:00:00,3,15,0
1,0.0,1.0,0.0,0.0,0.0,0.0,284.17,3.69,65.0,2021-04-15 15:30:00,3,15,30
2,0.0,1.0,0.0,0.0,0.0,0.0,283.9,3.56,66.0,2021-04-15 16:00:00,3,16,0
3,0.0,1.0,0.0,0.0,0.0,0.0,283.62,3.42,67.0,2021-04-15 16:30:00,3,16,30
4,0.0,1.0,0.0,0.0,0.0,0.0,283.35,3.28,68.0,2021-04-15 17:00:00,3,17,0


In [20]:
encodedThirtyMinuteDF = encodedThirtyMinuteDF[["Clear","Clouds", "Drizzle","Mist","Rain","Snow","dayOfWeek","hour","minute","temp","humidity","wind_speed"]]

In [21]:
encodedThirtyMinuteDF.head()

Unnamed: 0,Clear,Clouds,Drizzle,Mist,Rain,Snow,dayOfWeek,hour,minute,temp,humidity,wind_speed
0,0.0,1.0,0.0,0.0,0.0,0.0,3,15,0,284.44,64.0,3.83
1,0.0,1.0,0.0,0.0,0.0,0.0,3,15,30,284.17,65.0,3.69
2,0.0,1.0,0.0,0.0,0.0,0.0,3,16,0,283.9,66.0,3.56
3,0.0,1.0,0.0,0.0,0.0,0.0,3,16,30,283.62,67.0,3.42
4,0.0,1.0,0.0,0.0,0.0,0.0,3,17,0,283.35,68.0,3.28


In [22]:
encodedThirtyMinuteDF

Unnamed: 0,Clear,Clouds,Drizzle,Mist,Rain,Snow,dayOfWeek,hour,minute,temp,humidity,wind_speed
0,0.0,1.0,0.0,0.0,0.0,0.0,3,15,0,284.44,64.0,3.83
1,0.0,1.0,0.0,0.0,0.0,0.0,3,15,30,284.17,65.0,3.69
2,0.0,1.0,0.0,0.0,0.0,0.0,3,16,0,283.90,66.0,3.56
3,0.0,1.0,0.0,0.0,0.0,0.0,3,16,30,283.62,67.0,3.42
4,0.0,1.0,0.0,0.0,0.0,0.0,3,17,0,283.35,68.0,3.28
...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.0,1.0,0.0,0.0,0.0,0.0,1,12,30,283.39,54.0,5.71
236,0.0,1.0,0.0,0.0,0.0,0.0,1,13,0,283.39,54.0,5.71
237,0.0,1.0,0.0,0.0,0.0,0.0,1,13,30,283.39,54.0,5.71
238,0.0,1.0,0.0,0.0,0.0,0.0,1,14,0,283.39,54.0,5.71
