In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the hourly records from disk and peek at a few random rows
hourly_data = pd.read_csv(filepath_or_buffer='hour.csv')
hourly_data.sample(n=4)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
11978,11979,2012-05-19,2,1,5,7,0,6,0,1,0.46,0.4545,0.67,0.1045,28,67,95
17065,17066,2012-12-18,4,1,12,20,0,2,1,1,0.36,0.3333,0.5,0.2537,4,264,268
652,653,2011-01-30,1,0,1,12,0,0,0,1,0.3,0.3182,0.52,0.1045,10,87,97
12388,12389,2012-06-05,2,1,6,9,0,2,1,2,0.52,0.5,0.72,0.2537,50,285,335


In [2]:
# Summarize the raw frame: column names, non-null counts, dtypes and memory use
hourly_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [3]:
# Total number of missing values: count them per column, then sum the counts
missing_per_column = hourly_data.isna().sum()
missing_per_column.sum()

0

In [4]:
# Descriptive statistics, transposed so each column of the data becomes a row
hourly_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,17379.0,8690.0,5017.0295,1.0,4345.5,8690.0,13034.5,17379.0
season,17379.0,2.50164,1.106918,1.0,2.0,3.0,3.0,4.0
yr,17379.0,0.502561,0.500008,0.0,0.0,1.0,1.0,1.0
mnth,17379.0,6.537775,3.438776,1.0,4.0,7.0,10.0,12.0
hr,17379.0,11.546752,6.914405,0.0,6.0,12.0,18.0,23.0
holiday,17379.0,0.02877,0.167165,0.0,0.0,0.0,0.0,1.0
weekday,17379.0,3.003683,2.005771,0.0,1.0,3.0,5.0,6.0
workingday,17379.0,0.682721,0.465431,0.0,0.0,1.0,1.0,1.0
weathersit,17379.0,1.425283,0.639357,1.0,1.0,1.0,2.0,4.0
temp,17379.0,0.496987,0.192556,0.02,0.34,0.5,0.66,1.0


### 1.01 Exercise 101: Preprocessing temporal and weather features

In [5]:
# Work on an independent (deep) copy so the raw hourly_data frame stays untouched
preprocessed_data = hourly_data.copy(deep=True)

In [6]:
# Translate the numeric season codes into readable labels.
# The codes are read from the untouched hourly_data frame rather than from
# preprocessed_data itself, so the cell is idempotent: re-running it no longer
# maps the already-translated strings through the dict (which would yield NaN).
seasons_mapping = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'}
preprocessed_data['season'] = hourly_data['season'].map(seasons_mapping)

In [7]:
# Replace the 0/1 year flag with the calendar year it stands for.
# Mapping from the raw hourly_data column keeps the cell idempotent: a second
# run would otherwise look up 2011/2012 in the dict and produce NaN.
yr_mapping = {0: 2011, 1: 2012}
preprocessed_data['yr'] = hourly_data['yr'].map(yr_mapping)

In [8]:
# Translate the numeric weekday codes (0 = Sunday ... 6 = Saturday) into names.
# No backslash continuations are needed inside the braces.
weekday_mapping = {
    0: 'Sunday',
    1: 'Monday',
    2: 'Tuesday',
    3: 'Wednesday',
    4: 'Thursday',
    5: 'Friday',
    6: 'Saturday',
}
# Read the codes from the raw hourly_data frame so re-running this cell does
# not map the already-translated day names to NaN.
preprocessed_data['weekday'] = hourly_data['weekday'].map(weekday_mapping)

In [9]:
# Translate the numeric weather-situation codes into readable labels.
# The labels carry no leading/trailing whitespace, so later equality filters
# and group labels (e.g. == 'clear weather') match as expected.
weathersit_mapping = {
    1: 'clear weather',
    2: 'cloudy weather',
    3: 'light snow or rain',
    4: 'heavy snow or rain',
}
# Legacy misspelled name, kept as an alias in case other cells refer to it.
weathersiit_mapping = weathersit_mapping
# Read the codes from the raw hourly_data frame so re-running this cell does
# not map the already-translated labels to NaN.
preprocessed_data['weathersit'] = hourly_data['weathersit'].map(weathersit_mapping)

In [10]:
# Rescale humidity and wind speed.
# NOTE(review): the factors 100 and 67 presumably undo the dataset's
# normalization of these columns — confirm against the dataset README.
HUM_SCALE = 100
WINDSPEED_SCALE = 67
# Compute from the raw hourly_data columns so the cell is idempotent:
# re-running it no longer multiplies the already-rescaled values again.
preprocessed_data['hum'] = hourly_data['hum'] * HUM_SCALE
preprocessed_data['windspeed'] = hourly_data['windspeed'] * WINDSPEED_SCALE

In [11]:
# Preview the transformed columns; the fixed seed keeps the sample reproducible
cols = [
    'season', 'yr', 'weekday',
    'weathersit', 'hum', 'windspeed',
]
preprocessed_data.loc[:, cols].sample(n=10, random_state=123)

Unnamed: 0,season,yr,weekday,weathersit,hum,windspeed
5792,Summer,2011,Saturday,clear weather,74.0,8.9981
7823,Fall,2011,Sunday,clear weather,43.0,31.0009
15426,Fall,2012,Tuesday,cloudy weather,77.0,6.0032
15028,Fall,2012,Sunday,clear weather,51.0,22.0028
12290,Spring,2012,Friday,cloudy weather,89.0,12.998
3262,Spring,2011,Friday,clear weather,64.0,7.0015
10763,Spring,2012,Thursday,clear weather,42.0,23.9994
12384,Spring,2012,Tuesday,light snow or rain,82.0,11.0014
6051,Summer,2011,Wednesday,clear weather,52.0,19.0012
948,Winter,2011,Saturday,clear weather,80.0,0.0
