In [876]:
import pandas as pd
import datetime

In [877]:
# Load the hourly records; the path is relative to the notebook's working directory.
df = pd.read_csv("datasets/hour.csv")

In [878]:
# Row and column counts of the freshly loaded frame.
df.shape

(17379, 17)

In [879]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [880]:
# Per-column missing-value check: True means the column holds at least one NaN.
df.isna().any()

instant       False
dteday        False
season        False
yr            False
mnth          False
hr            False
holiday       False
weekday       False
workingday    False
weathersit    False
temp          False
atemp         False
hum           False
windspeed     False
casual        False
registered    False
cnt           False
dtype: bool

In [881]:
# Parse the calendar date, then build the full hourly timestamp by adding the
# hour-of-day (`hr`) as a timedelta to midnight of that date. This is a single
# vectorized operation instead of a row-wise `apply(..., axis=1)` that
# constructs one `datetime` object per row in Python.
df['dteday'] = pd.to_datetime(df['dteday'])
df['dtime'] = df['dteday'].dt.normalize() + pd.to_timedelta(df['hr'], unit='h')



In [882]:
# Move `dtime` so it sits directly after `dteday`. Columns are selected by
# name rather than by hard-coded positions, so the cell keeps working if the
# column count changes and gives the same result when it is run again.
column_order = [col for col in df.columns if col != 'dtime']
column_order.insert(column_order.index('dteday') + 1, 'dtime')
df = df[column_order]

In [883]:
# Confirm the column order and that the date columns are parsed as datetimes.
df.dtypes

instant                int64
dteday        datetime64[ns]
dtime         datetime64[ns]
season                 int64
yr                     int64
mnth                   int64
hr                     int64
holiday                int64
weekday                int64
workingday             int64
weathersit             int64
temp                 float64
atemp                float64
hum                  float64
windspeed            float64
casual                 int64
registered             int64
cnt                    int64
dtype: object

In [884]:
# Shorten the date column name and mark the season column as the numeric code.
df = df.rename(columns={'dteday': 'dte', 'season': 'season_num'})

In [885]:
# Translate the numeric season code into a readable label. Codes 1, 2 and 3
# have explicit labels; every other value falls back to 'fall'.
season_labels = {1: 'winter', 2: 'spring', 3: 'summer'}
df['season_name'] = df['season_num'].map(lambda code: season_labels.get(code, 'fall'))

In [886]:
# Preview the frame after the renames and the added `season_name` column.
df.head()

Unnamed: 0,instant,dte,dtime,season_num,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,season_name
0,1,2011-01-01,2011-01-01 00:00:00,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,winter
1,2,2011-01-01,2011-01-01 01:00:00,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,winter
2,3,2011-01-01,2011-01-01 02:00:00,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,winter
3,4,2011-01-01,2011-01-01 03:00:00,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13,winter
4,5,2011-01-01,2011-01-01 04:00:00,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1,winter


In [887]:
# NOTE(review): repeats the preview from the previous cell; `df` is not modified in between.
df.head()

Unnamed: 0,instant,dte,dtime,season_num,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,season_name
0,1,2011-01-01,2011-01-01 00:00:00,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,winter
1,2,2011-01-01,2011-01-01 01:00:00,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,winter
2,3,2011-01-01,2011-01-01 02:00:00,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,winter
3,4,2011-01-01,2011-01-01 03:00:00,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13,winter
4,5,2011-01-01,2011-01-01 04:00:00,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1,winter


In [888]:
# Number of rows per season label.
df['season_name'].value_counts()

summer    4496
spring    4409
winter    4242
fall      4232
Name: season_name, dtype: int64

In [889]:
# Group the rows by season label so per-season aggregates can be computed from `seasons`.
seasons = df.groupby('season_name')

In [890]:
# Mean of `cnt` within each season group.
seasons['cnt'].mean()

season_name
fall      198.868856
spring    208.344069
summer    236.016237
winter    111.114569
Name: cnt, dtype: float64

#### Unnormalizing weather data


temp : Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)

atemp: Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)

hum: Normalized humidity. The values are divided by 100 (max)

windspeed: Normalized wind speed. The values are divided by 67 (max)


In [891]:
# Scale the normalized humidity by 100 into a new `humidity` column, then
# remove the four normalized weather columns from the frame in place.
df['humidity'] = df['hum'].mul(100)
df.drop(columns=['temp', 'atemp', 'hum', 'windspeed'], inplace=True)

In [892]:
# Preview the frame after the normalized weather columns were replaced by `humidity`.
df.head()

Unnamed: 0,instant,dte,dtime,season_num,yr,mnth,hr,holiday,weekday,workingday,weathersit,casual,registered,cnt,season_name,humidity
0,1,2011-01-01,2011-01-01 00:00:00,1,0,1,0,0,6,0,1,3,13,16,winter,81.0
1,2,2011-01-01,2011-01-01 01:00:00,1,0,1,1,0,6,0,1,8,32,40,winter,80.0
2,3,2011-01-01,2011-01-01 02:00:00,1,0,1,2,0,6,0,1,5,27,32,winter,80.0
3,4,2011-01-01,2011-01-01 03:00:00,1,0,1,3,0,6,0,1,3,10,13,winter,75.0
4,5,2011-01-01,2011-01-01 04:00:00,1,0,1,4,0,6,0,1,0,1,1,winter,75.0


In [893]:
import requests
import json 

class WeatherData():
    """Minimal client for the Open-Meteo historical weather archive endpoint.

    The location (latitude 38.8951, longitude -77.0364) and the timezone
    (America/New_York) are fixed at construction time; the date range and the
    hourly variables are chosen per call to `get_weather`.
    """

    def __init__(self):
        # Pre-encoded URL pieces; get_weather() concatenates them verbatim.
        self.base_url = "https://archive-api.open-meteo.com/v1/archive"
        self.coordinates = '?latitude=38.8951&longitude=-77.0364'
        self.timezone = '&timezone=America%2FNew_York'

    def get_weather(self, start_date='2011-01-01', end_date='2012-12-31',
                    hourly_list=('precipitation', 'cloudcover', 'windspeed_10m', 'temperature_2m'),
                    daily_list=('rain_sum', 'winddirection_10m_dominant'),
                    timeout=60):
        """Download hourly weather data and return the decoded JSON payload.

        Parameters
        ----------
        start_date, end_date : str
            Inclusive date range, formatted as 'YYYY-MM-DD'.
        hourly_list : iterable of str, or str
            Hourly variable names; they are comma-joined into the `hourly`
            query parameter. A single string is sent as is instead of being
            joined character by character.
        daily_list : iterable of str
            Accepted for backward compatibility only: the request built here
            asks for hourly variables and never sends this argument.
        timeout : float
            Seconds to wait for the server, passed to `requests.get`, so a
            stalled connection cannot hang the notebook indefinitely.

        Returns
        -------
        dict
            The parsed JSON response.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status, instead of handing an
            error payload to the caller as if it were weather data.
        """
        # Immutable defaults above avoid sharing one mutable list between calls.
        if isinstance(hourly_list, str):
            hourly_list = [hourly_list]

        timeframe = '&start_date=' + start_date + '&end_date=' + end_date
        hourly = '&hourly=' + ",".join(hourly_list)
        api_url = self.base_url + self.coordinates + timeframe + hourly + self.timezone

        resp = requests.get(api_url, timeout=timeout)
        resp.raise_for_status()
        return resp.json()

    def concat_to_df(self, df, my_weather):
        """Add one column per hourly measure of `my_weather` to `df`, in place.

        Each new column is named '<measure>_<unit>', with the unit taken from
        `my_weather['hourly_units']`. Values are matched by position: the row
        whose index label is `i` receives element `i` of the measure's list.
        That only lines up with the timestamps when `df` has exactly one row
        per entry of the hourly series, in the same order; otherwise join on
        the timestamp instead.

        Raises
        ------
        IndexError
            If an index label of `df` lies beyond the end of an hourly list.
        """
        row_positions = list(df.index)

        for measure, hourly_measure in my_weather['hourly'].items():
            unit_of_measure = my_weather['hourly_units'][measure]
            name = measure + '_' + unit_of_measure
            # Build the whole column at once rather than writing one cell at
            # a time with iterrows()/df.at.
            df[name] = [hourly_measure[position] for position in row_positions]
            
# Create the client and fetch the weather data with the default arguments
# (get_weather performs an HTTP request, so this cell needs network access).
dc_weather = WeatherData()
my_weather = dc_weather.get_weather()

In [894]:
# Extract the hourly data from the JSON object
hourly_data = my_weather['hourly']

# Create a DataFrame from the hourly data
df_api = pd.DataFrame(hourly_data)

In [895]:
# Swap the API's textual 'time' column for a parsed `dtime` timestamp column,
# matching the name of the timestamp column in `df`.
df_api['dtime'] = pd.to_datetime(df_api.pop('time'))
# Remove the date-part columns from `df` in place.
df.drop(columns=['dte', 'mnth', 'hr', 'yr'], inplace=True)

In [896]:
# Left-join the API weather onto the existing rows by timestamp: every row of
# `df` is kept, and rows without a matching API timestamp get NaN weather values.
final_df = pd.merge(df, df_api, how='left', on='dtime')
df = final_df.copy()

In [897]:
# Preview the merged frame with the API weather columns appended.
df.head()

Unnamed: 0,instant,dtime,season_num,holiday,weekday,workingday,weathersit,casual,registered,cnt,season_name,humidity,precipitation,cloudcover,windspeed_10m,temperature_2m
0,1,2011-01-01 00:00:00,1,0,6,0,1,3,13,16,winter,81.0,0.0,20,8.3,1.8
1,2,2011-01-01 01:00:00,1,0,6,0,1,8,32,40,winter,80.0,0.0,7,8.8,1.7
2,3,2011-01-01 02:00:00,1,0,6,0,1,5,27,32,winter,80.0,0.0,8,8.4,1.6
3,4,2011-01-01 03:00:00,1,0,6,0,1,3,10,13,winter,75.0,0.0,0,8.3,1.3
4,5,2011-01-01 04:00:00,1,0,6,0,1,0,1,1,winter,75.0,0.0,30,9.3,1.3


In [901]:
# Bucket cloud cover into five labelled bands with right-inclusive edges:
# <=5, <=30, <=50, <=90, and everything above 90.
# `pd.cut` leaves missing cloud cover (possible after the left merge) as NaN;
# a chain of `x <= ...` comparisons is False for NaN at every step and would
# label such rows 'Fully Covered'.
cloud_bins = [-float('inf'), 5, 30, 50, 90, float('inf')]
cloud_labels = ['No Clouds', 'Low Clouds', 'Medium Clouds', 'Very Cloudy', 'Fully Covered']
# Cast back to plain objects so the column holds ordinary strings, not a categorical.
df['cloudyness'] = pd.cut(df['cloudcover'], bins=cloud_bins, labels=cloud_labels).astype(object)