# Imports ⛴

In [3]:
import geopandas as gpd
import pandas as pd
import fiona
import os
import matplotlib.pyplot as plt
import folium
from zipfile import ZipFile
from folium.plugins import MarkerCluster, HeatMap, BeautifyIcon
from folium.map import LayerControl, Layer, FeatureGroup
import seaborn as sns
from shapely.geometry import Point, LineString, MultiPoint
import numpy as np
import contextily as ctx
import requests
from io import StringIO, BytesIO
import json
import datetime as dt
from ast import literal_eval
from shapely.wkt import loads
import plotly.express as px
from dotenv import load_dotenv, find_dotenv


# Data

In [4]:
weather = pd.read_csv('raw_data/weather/weather_1.csv',
#                       nrows=1000, #rm later
                      sep=';',
#                       decimal=',',
                      skiprows=4,
#                       parse_dates=[[0, 1]],
#                       dayfirst=True,
                      header=0,
                      names=['date', 'hour', 'rainfall_mm', 'snowfall_mm'],
                     )

# workaround for hour concat issue
weather['date'] = pd.to_datetime(weather['date'], format='%d-%m-%Y', errors='coerce')
weather['date_hour'] = weather.apply(lambda x: pd.to_datetime(str(x.date) + ' ' + str(x.hour), errors='coerce'), axis=1)

# workaround for decimal issue
weather['rainfall_mm'] = weather.rainfall_mm.apply(lambda x: str(x).replace(',','.'))
weather['snowfall_mm'] = weather.snowfall_mm.apply(lambda x: str(x).replace(',','.'))

In [5]:
weather2 = pd.read_csv('raw_data/weather/weather_2.csv', 
                 sep=';', 
                 skiprows=4, 
                 header=0, 
#                  decimal=',',
#                 converters={2:lambda x: x.replace(',', '.')},
#                 parse_dates=[[0, 1]],
                names=['date', 'hour', 'winds'],
                na_values={2:'',
                            3:''},
                dayfirst=True,
                )
# workaround for hour concat issue
weather2['date'] = pd.to_datetime(weather2['date'], format='%d-%m-%Y', errors='coerce')
weather2['date_hour'] = weather2.apply(lambda x: pd.to_datetime(str(x.date) + ' ' + str(x.hour), errors='coerce'), axis=1)

weather2['winds'] = weather2.winds.apply(lambda x: str(x).replace(',','.'))

In [6]:
# weather['date_hour'] = pd.to_datetime(weather['date_hour'], errors='coerce')
weather_1 = weather.dropna(subset=['date_hour'])

# weather2['date_hour'] = pd.to_datetime(weather2['date_hour'], errors='coerce')
weather_2 = weather2.dropna(subset=['date_hour'])

In [7]:
merged_weather = weather_2.merge(weather_1,
                                right_on='date_hour',
                                left_on='date_hour',
                                )

In [8]:
merged_weather.sort_values(by='date_hour').tail()

Unnamed: 0,date_x,hour_x,winds,date_hour,date_y,hour_y,rainfall_mm,snowfall_mm
443515,2021-06-22,23:20,,2021-06-22 23:20:00,2021-06-22,23:20,0.0,
443516,2021-06-22,23:30,,2021-06-22 23:30:00,2021-06-22,23:30,0.0,
443517,2021-06-22,23:30,,2021-06-22 23:30:00,2021-06-22,23:30,0.35,16.9
443518,2021-06-22,23:40,,2021-06-22 23:40:00,2021-06-22,23:40,0.4,
443519,2021-06-22,23:50,,2021-06-22 23:50:00,2021-06-22,23:50,4.2,


In [9]:
merged_weather['hourly_date'] = merged_weather.date_hour.apply(lambda x: x.floor('h'))

In [10]:
merged_weather = merged_weather.astype({'winds': float,
                      'rainfall_mm':float,
                      'snowfall_mm':float})


In [12]:
hourly_weather = merged_weather.groupby('hourly_date').mean()

In [13]:
hourly_weather.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 44352 entries, 2016-06-01 00:00:00 to 2021-06-22 23:00:00
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   winds        13304 non-null  float64
 1   rainfall_mm  43113 non-null  float64
 2   snowfall_mm  43114 non-null  float64
dtypes: float64(3)
memory usage: 1.4 MB


In [14]:
hourly_weather.to_csv('raw_data/hourly_weather.csv')

In [15]:
hourly_weather.head()

Unnamed: 0_level_0,winds,rainfall_mm,snowfall_mm
hourly_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-06-01 00:00:00,0.716667,-0.003333,14.2
2016-06-01 01:00:00,0.51,-0.013,14.166667
2016-06-01 02:00:00,0.33,-0.007,14.3
2016-06-01 03:00:00,0.18,-0.014,14.266667
2016-06-01 04:00:00,0.3,0.008,14.233333


In [16]:
hourly_weather.tail()

Unnamed: 0_level_0,winds,rainfall_mm,snowfall_mm
hourly_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-06-22 19:00:00,,0.051111,16.8
2021-06-22 20:00:00,,0.083,18.4
2021-06-22 21:00:00,,0.082,17.533333
2021-06-22 22:00:00,,0.081,16.933333
2021-06-22 23:00:00,,0.61875,16.833333


# OWM

In [17]:
# API KEY
load_dotenv(find_dotenv())
OWM_API = os.environ.get("OWM_API")

In [18]:
# init time range
range_2019 = pd.DataFrame(pd.date_range('2016-06-01', '2021-06-12', freq='h'), columns=['hour'])
range_2019.tail().hour

44084   2021-06-11 20:00:00
44085   2021-06-11 21:00:00
44086   2021-06-11 22:00:00
44087   2021-06-11 23:00:00
44088   2021-06-12 00:00:00
Name: hour, dtype: datetime64[ns]

In [19]:
req = 'http://history.openweathermap.org/data/2.5/history/wdl'
start = range_2019.hour.min().value
inter = range_2019.hour.max().value
end = range_2019.hour.max().value
# tail1 = tail.min().value
# tail2 = tail.max().value
params = {
    'id':'3165524', # ID of Turin
    'type':'hour',
    'start':str(start)[:10], # unix time
    'end':str(end)[:10],
    'appid': OWM_API
}

r = requests.get(req, params=params)


# with open('data/weather.txt', 'w') as outfile:
#     json.dump(r.json(), outfile)
    
weather = r.json()
lst = weather.get('list')
dct = {x.get('dt'):x.get('weather')[0].get('main') for x in lst}
weather_df = pd.DataFrame.from_dict(dct, 
                                    orient='index', 
                                    columns=['weather']).reset_index().rename(columns={'index':'time'})
weather_df['rain'] = weather_df.weather == 'Rain'

In [31]:
lst = weather.get('list')
dct = {x.get('dt'):x.get('main').get('temp') for x in lst}

In [32]:
weather_df = pd.DataFrame.from_dict(dct, 
                                    orient='index', 
                                    columns=['temp']).reset_index().rename(columns={'index':'time'})
weather_df['temp'] = weather_df.temp-273.15
weather_df['time'] = pd.to_datetime(weather_df.time, unit='s')

In [33]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30189 entries, 0 to 30188
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   time    30189 non-null  datetime64[ns]
 1   temp    30189 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 471.8 KB


In [34]:
merge_all = weather_df.merge(hourly_weather, left_on='time', right_index=True)

In [40]:
merge_all.to_csv('raw_data/all_weather.csv')

In [37]:
merge_all

Unnamed: 0,time,temp,winds,rainfall_mm,snowfall_mm
0,2018-01-01 00:00:00,1.04,0.366667,-0.010,2.600000
1,2018-01-01 01:00:00,1.09,0.590000,0.009,2.600000
2,2018-01-01 02:00:00,1.05,0.450000,0.008,2.266667
3,2018-01-01 03:00:00,0.89,0.400000,0.006,2.266667
4,2018-01-01 04:00:00,0.73,0.780000,-0.011,2.300000
...,...,...,...,...,...
30184,2021-06-11 20:00:00,22.36,,-0.004,26.900000
30185,2021-06-11 21:00:00,22.02,,0.142,24.266667
30186,2021-06-11 22:00:00,20.44,,0.325,20.433333
30187,2021-06-11 23:00:00,20.14,,0.506,19.666667


In [39]:
px.line(merge_all)

ValueError: Plotly Express cannot process wide-form data with columns of different type.