# Loading Data set and Analysis Features

<br>
Associated task:<br>
Predication of bike rental count hourly or daily based on the environmental and seasonal settings.<br>
The grund truth is "cnt" which is a dependent variable and all of the other columns are potentially independent variables as an input<br>


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Mydata=pd.read_csv('hour.csv',delimiter=",")
Mydata.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [2]:
df=Mydata.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv
	
	- instant: record index
	- dteday : date
	- season : season (1:springer, 2:summer, 3:fall, 4:winter)
	- yr : year (0: 2011, 1:2012)
	- mnth : month ( 1 to 12)
	- hr : hour (0 to 23)
	- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	- hum: Normalized humidity. The values are divided to 100 (max)
	- windspeed: Normalized wind speed. The values are divided to 67 (max)
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered

# First step: Removing Irrelevant input- feature data that has no effect on our prediction/Feature selection

In [3]:
df=df.drop(['instant'],axis=1)
df=df.drop(['yr'],axis=1)
df=df.drop(['dteday'],axis=1)
df.head()

Unnamed: 0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


The numbers in columns: "season,month,weekday,weathersit" are categorical numbers which inherently represent some reasons from the list and they do not have  mathematically values to perform comparison between them e.g 1 is not greater than 12 here, we just prefer to use numbers instead of characters and text.
The solution is Dummy variables and one hot coding, we use .get_dummies()

In [4]:
df.shape

(17379, 14)

In [5]:
df['season'].unique()

array([1, 2, 3, 4])

In [6]:
season_columns=pd.get_dummies(df['season'])
season_columns.head()

Unnamed: 0,1,2,3,4
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [7]:
season_columns['check']=season_columns.sum(axis=1)
season_columns.head()

Unnamed: 0,1,2,3,4,check
0,1,0,0,0,1
1,1,0,0,0,1
2,1,0,0,0,1
3,1,0,0,0,1
4,1,0,0,0,1


In [8]:
season_columns.shape

(17379, 5)

We just check again wheather the sum of all elements in ckeck column is equal to the total rows number to make sure that our one-hot-coding was successful

In [9]:
season_columns['check'].sum(axis=0)

17379

Now we can drop check column

In [10]:
season_columns=season_columns.drop(['check'],axis=1)
season_columns.head()

Unnamed: 0,1,2,3,4
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [11]:
season_1=season_columns.loc[:,1]
season_2=season_columns.loc[:,2]
season_3=season_columns.loc[:,3]
season_4=season_columns.loc[:,4]

In [12]:
df=pd.concat([df,season_1,season_2,season_3,season_4],axis=1)
df.head()

Unnamed: 0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,1,2,3,4
0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,1,0,0,0
1,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,1,0,0,0
2,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,1,0,0,0
3,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13,1,0,0,0
4,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1,1,0,0,0


In [13]:
column_names=['season', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual',
       'registered', 'cnt', 'season_1','season_2','season_3','season_4']
df.columns=column_names
df.head()

Unnamed: 0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,season_1,season_2,season_3,season_4
0,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,1,0,0,0
1,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,1,0,0,0
2,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,1,0,0,0
3,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13,1,0,0,0
4,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1,1,0,0,0


# Reorder Columns and eliminating season column

In [14]:
df=df.drop(['season'],axis=1)
column_names_reordered=['season_1','season_2','season_3','season_4', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual',
       'registered', 'cnt']
df=df[column_names_reordered]
df.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,0,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,1,0,0,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,1,0,0,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,1,0,0,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,1,0,0,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


# Create Checkpoint for first feature modification<br>
Create a copy of the current state of the df data frame

In [84]:
df_season_mod=df.copy()

# The next feature that has to be modified is "mnth"

In [85]:
df_season_mod['mnth'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [86]:
df_season_mod['mnth'].value_counts()

7     1488
5     1488
12    1483
8     1475
3     1473
10    1451
6     1440
11    1437
9     1437
4     1437
1     1429
2     1341
Name: mnth, dtype: int64

In [87]:
mnth_columns=pd.get_dummies(df['mnth'])
mnth_columns.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12
0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0


In [88]:
mnth_columns.shape

(17379, 12)

In [89]:
mnth_columns['check']=mnth_columns.sum(axis=1)
mnth_columns.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,check
0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,0,0,1


In [90]:
mnth_columns['check'].sum(axis=0)

17379

In [91]:
mnth_columns=mnth_columns.drop(['check'],axis=1)
mnth_1=mnth_columns.loc[:,1]
mnth_2=mnth_columns.loc[:,2]
mnth_3=mnth_columns.loc[:,3]
mnth_4=mnth_columns.loc[:,4]
mnth_5=mnth_columns.loc[:,5]
mnth_6=mnth_columns.loc[:,6]
mnth_7=mnth_columns.loc[:,7]
mnth_8=mnth_columns.loc[:,8]
mnth_9=mnth_columns.loc[:,9]
mnth_10=mnth_columns.loc[:,10]
mnth_11=mnth_columns.loc[:,11]
mnth_12=mnth_columns.loc[:,12]
df_season_mod=pd.concat([df_season_mod,mnth_1,mnth_2,mnth_3,mnth_4,
                         mnth_5,mnth_6,mnth_7,mnth_8,mnth_9,mnth_10,mnth_11,mnth_12],axis=1)
df_season_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth,hr,holiday,weekday,workingday,weathersit,...,3,4,5,6,7,8,9,10,11,12
0,1,0,0,0,1,0,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,1,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,2,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,3,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,4,0,6,0,1,...,0,0,0,0,0,0,0,0,0,0


In [28]:
df_season_mod.columns.values

array(['season_1', 'season_2', 'season_3', 'season_4', 'mnth', 'hr',
       'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
       'hum', 'windspeed', 'casual', 'registered', 'cnt', 1, 2, 3, 4, 5,
       6, 7, 8, 9, 10, 11, 12], dtype=object)

In [29]:
df_season_mod=df_season_mod.drop(['mnth'],axis=1)
column_names=['season_1', 'season_2', 'season_3', 'season_4','hr',
       'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
'hum', 'windspeed', 'casual', 'registered', 'cnt', 'mnth_1','mnth_2','mnth_3','mnth_4',
            'mnth_5','mnth_6','mnth_7','mnth_8','mnth_9','mnth_10','mnth_11','mnth_12']
df_season_mod.columns=column_names
column_names_reordered=['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1','mnth_2',
            'mnth_3','mnth_4','mnth_5','mnth_6','mnth_7','mnth_8','mnth_9','mnth_10',
           'mnth_11','mnth_12', 'hr',
        'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp',
           'hum', 'windspeed', 'casual', 'registered', 'cnt']

In [30]:
df_season_mod=df_season_mod[column_names_reordered]
df_season_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,0,0,1,0,0,0,0,0,...,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,1,0,0,0,1,0,0,0,0,0,...,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,1,0,0,0,1,0,0,0,0,0,...,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,1,0,0,0,1,0,0,0,0,0,...,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,1,0,0,0,1,0,0,0,0,0,...,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


# Creating third checkpoint after "mnth" modification

In [47]:
df_season_mnth_mod=df_season_mod.copy()

# The next feature : "Weekday"

In [48]:
df_season_mnth_mod['weekday'].unique()

array([6, 0, 1, 2, 3, 4, 5])

In [49]:
df_season_mnth_mod['weekday'].value_counts()

6    2512
0    2502
5    2487
1    2479
3    2475
4    2471
2    2453
Name: weekday, dtype: int64

In [50]:
weekday_columns=pd.get_dummies(df['weekday'])
weekday_columns.shape

(17379, 7)

In [51]:
weekday_columns['check']=weekday_columns.sum(axis=1)
weekday_columns.head()

Unnamed: 0,0,1,2,3,4,5,6,check
0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,1,1
2,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,1,1
4,0,0,0,0,0,0,1,1


In [52]:
weekday_columns['check'].sum(axis=0)

17379

In [53]:
weekday_columns=weekday_columns.drop(['check'],axis=1)

In [54]:
weekday_columns.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


In [55]:
weekday_1=weekday_columns.loc[:,0]
weekday_2=weekday_columns.loc[:,1]
weekday_3=weekday_columns.loc[:,2]
weekday_4=weekday_columns.loc[:,3]
weekday_5=weekday_columns.loc[:,4]
weekday_6=weekday_columns.loc[:,5]
weekday_7=weekday_columns.loc[:,6]
df_season_mnth_mod=pd.concat([df_season_mnth_mod,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,
                             weekday_7],axis=1)
df_season_mnth_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,casual,registered,cnt,0,1,2,3,4,5,6
0,1,0,0,0,1,0,0,0,0,0,...,3,13,16,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,...,8,32,40,0,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,0,...,5,27,32,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,0,0,0,...,3,10,13,0,0,0,0,0,0,1
4,1,0,0,0,1,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1


In [56]:
df_season_mnth_mod=df_season_mnth_mod.drop(['weekday'],axis=1)
df_season_mnth_mod.columns.values

array(['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8',
       'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hr', 'holiday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 0, 1, 2, 3, 4, 5, 6], dtype=object)

In [57]:

column_names=['season_1','season_2','season_3','season_4','mnth_1','mnth_2',
       'mnth_3','mnth_4','mnth_5','mnth_6','mnth_7','mnth_8',
       'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hr','holiday',
        'workingday', 'weathersit', 'temp', 'atemp', 'hum',
       'windspeed', 'casual', 'registered', 'cnt','weekday_1','weekday_2',
              'weekday_3','weekday_4','weekday_5','weekday_6','weekday_7']
df_season_mnth_mod.columns=column_names

df_season_mnth_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,casual,registered,cnt,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1,0,0,0,1,0,0,0,0,0,...,3,13,16,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,...,8,32,40,0,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,0,...,5,27,32,0,0,0,0,0,0,1
3,1,0,0,0,1,0,0,0,0,0,...,3,10,13,0,0,0,0,0,0,1
4,1,0,0,0,1,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,1


In [58]:
column_names_reordered=['season_1','season_2','season_3','season_4','mnth_1','mnth_2',
            'mnth_3','mnth_4','mnth_5','mnth_6','mnth_7','mnth_8','mnth_9','mnth_10',
           'mnth_11','mnth_12','hr','holiday','weekday_1','weekday_2',
              'weekday_3','weekday_4','weekday_5','weekday_6','weekday_7','workingday','weathersit','temp','atemp',
           'hum','windspeed','casual','registered','cnt']
df_season_mnth_mod=df_season_mnth_mod[column_names_reordered]
df_season_mnth_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weekday_7,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.24,0.2879,0.75,0.0,0,1,1


Creating checkpoint 


In [104]:
df_season_mnth_weekday_mod=df_season_mnth_mod.copy()

# The next feature : "hr"

In [105]:
df_season_mnth_weekday_mod.shape

(17379, 34)

In [106]:
df_season_mnth_weekday_mod['hr'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23])

In [107]:
df_season_mnth_weekday_mod['hr'].value_counts()

16    730
17    730
15    729
13    729
14    729
22    728
18    728
19    728
20    728
21    728
23    728
12    728
7     727
8     727
9     727
10    727
11    727
0     726
6     725
1     724
5     717
2     715
4     697
3     697
Name: hr, dtype: int64

# In all hours of a day we have considerable renting cases so we continue to one-hot-coding for "hr" without clastering 

In [108]:
hours_columns=pd.get_dummies(df_season_mnth_weekday_mod['hr'])
hours_columns.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [109]:
hours_columns.shape

(17379, 24)

In [110]:
hours_columns['check']=hours_columns.sum(axis=1)

In [111]:
hours_columns.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,check
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [112]:
hours_columns['check'].sum(axis=0)

17379

In [113]:
hours_columns=hours_columns.drop(['check'],axis=1)

When we use dummy variables then the problem would be numerous columns, we are interested to reduce this huge number of columns therefore we use grouping and classifying technique to summerize some of these columns<br>
Regarding numerous columns we can split the hours of day into 3 groups :first group 0-7,second group 8-15,third group 16-23 but as you notices in previous step in all hours a day we have large instances so we can not squeeze 
the number of columns now :-(

In [114]:
hour_0=hours_columns.loc[:,0]
hour_1=hours_columns.loc[:,1]
hour_2=hours_columns.loc[:,2]
hour_3=hours_columns.loc[:,3]
hour_4=hours_columns.loc[:,4]
hour_5=hours_columns.loc[:,5]
hour_6=hours_columns.loc[:,6]
hour_7=hours_columns.loc[:,7]
hour_8=hours_columns.loc[:,8]
hour_9=hours_columns.loc[:,9]
hour_10=hours_columns.loc[:,10]
hour_11=hours_columns.loc[:,11]
hour_12=hours_columns.loc[:,12]
hour_13=hours_columns.loc[:,13]
hour_14=hours_columns.loc[:,14]
hour_15=hours_columns.loc[:,15]
hour_16=hours_columns.loc[:,16]
hour_17=hours_columns.loc[:,17]
hour_18=hours_columns.loc[:,18]
hour_19=hours_columns.loc[:,19]
hour_20=hours_columns.loc[:,20]
hour_21=hours_columns.loc[:,21]
hour_22=hours_columns.loc[:,22]
hour_23=hours_columns.loc[:,23]

df_season_mnth_weekday_mod=pd.concat([df_season_mnth_weekday_mod,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5
                                     ,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14
                        ,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23],axis=1)
df_season_mnth_weekday_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,14,15,16,17,18,19,20,21,22,23
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
df_season_mnth_weekday_mod=df_season_mnth_weekday_mod.drop(['hr'],axis=1)
df_season_mnth_weekday_mod.columns.values

array(['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8',
       'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'holiday', 'weekday_1',
       'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
       'weekday_7', 'workingday', 'weathersit', 'temp', 'atemp', 'hum',
       'windspeed', 'casual', 'registered', 'cnt', 0, 1, 2, 3, 4, 5, 6, 7,
       8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
      dtype=object)

In [116]:
column_names=['season_1','season_2','season_3','season_4','mnth_1','mnth_2',
            'mnth_3','mnth_4','mnth_5','mnth_6','mnth_7','mnth_8','mnth_9','mnth_10',
           'mnth_11','mnth_12','holiday','weekday_1','weekday_2',
              'weekday_3','weekday_4','weekday_5','weekday_6','weekday_7','workingday','weathersit','temp','atemp',
           'hum','windspeed','casual','registered','cnt','hour_0','hour_1','hour_2','hour_3','hour_4','hour_5'
                                     ,'hour_6','hour_7','hour_8','hour_9','hour_10','hour_11','hour_12','hour_13'
            ,'hour_14','hour_15','hour_16','hour_17','hour_18','hour_19','hour_20','hour_21','hour_22','hour_23']
df_season_mnth_weekday_mod.columns=column_names
df_season_mnth_weekday_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [117]:
column_names_reordered=['season_1','season_2','season_3','season_4','mnth_1','mnth_2',
            'mnth_3','mnth_4','mnth_5','mnth_6','mnth_7','mnth_8','mnth_9','mnth_10',
           'mnth_11','mnth_12','hour_0','hour_1','hour_2','hour_3','hour_4','hour_5'
            ,'hour_6','hour_7','hour_8','hour_9','hour_10','hour_11','hour_12','hour_13'
            ,'hour_14','hour_15','hour_16','hour_17','hour_18','hour_19','hour_20','hour_21','hour_22','hour_23','holiday','weekday_1','weekday_2',
              'weekday_3','weekday_4','weekday_5','weekday_6','weekday_7','workingday','weathersit','temp','atemp',
           'hum','windspeed','casual','registered','cnt']
df_season_mnth_weekday_mod=df_season_mnth_weekday_mod[column_names_reordered]
df_season_mnth_weekday_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,weekday_7,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,1,0,0,0,1,0,0,0,0,0,...,1,0,1,0.24,0.2879,0.75,0.0,0,1,1


# Creating another checkpoint after successfully modification of 'hr'

In [118]:
df_season_mnth_weekday_hours_mod=df_season_mnth_weekday_mod.copy()

# The next feature : "weathersit"

In [119]:
df_season_mnth_weekday_hours_mod['weathersit'].unique()

array([1, 2, 3, 4])

In [120]:
df_season_mnth_weekday_hours_mod['weathersit'].value_counts()

1    11413
2     4544
3     1419
4        3
Name: weathersit, dtype: int64

In [121]:
weather_columns=pd.get_dummies(df_season_mnth_weekday_hours_mod['weathersit'])
weather_columns.head()

Unnamed: 0,1,2,3,4
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [122]:
weather_columns['check']=weather_columns.sum(axis=1)
weather_columns.head()

Unnamed: 0,1,2,3,4,check
0,1,0,0,0,1
1,1,0,0,0,1
2,1,0,0,0,1
3,1,0,0,0,1
4,1,0,0,0,1


In [123]:
weather_columns=weather_columns.drop(['check'],axis=1)
weather_0=hours_columns.loc[:,1]
weather_1=hours_columns.loc[:,2]
weather_2=hours_columns.loc[:,3]
weather_3=hours_columns.loc[:,4]

In [124]:
df_season_mnth_weekday_hours_mod=df_season_mnth_weekday_hours_mod.drop(['weathersit'],axis=1)
df_season_mnth_weekday_hours_mod.columns.values

array(['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8',
       'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_21', 'hour_22', 'hour_23', 'holiday', 'weekday_1',
       'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
       'weekday_7', 'workingday', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt'], dtype=object)

In [125]:
df_season_mnth_weekday_hours_mod=pd.concat([df_season_mnth_weekday_hours_mod,weather_0,
                                            weather_1,weather_2,weather_3],axis=1)
df_season_mnth_weekday_hours_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,atemp,hum,windspeed,casual,registered,cnt,1,2,3,4
0,1,0,0,0,1,0,0,0,0,0,...,0.2879,0.81,0.0,3,13,16,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0.2727,0.8,0.0,8,32,40,1,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0.2727,0.8,0.0,5,27,32,0,1,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0.2879,0.75,0.0,3,10,13,0,0,1,0
4,1,0,0,0,1,0,0,0,0,0,...,0.2879,0.75,0.0,0,1,1,0,0,0,1


In [126]:
columns=['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8',
       'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'holiday', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'weekday_7', 'workingday', 'temp',
       'atemp', 'hum','windspeed','casual', 'registered', 'cnt','weather_0',
                                            'weather_1','weather_2','weather_3']
df_season_mnth_weekday_hours_mod.columns=columns
reordered_columns=['season_1', 'season_2', 'season_3', 'season_4', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8',
       'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15','hour_16', 'hour_17',
       'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23',
       'holiday', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4',
       'weekday_5', 'weekday_6', 'weekday_7', 'workingday', 'temp',
       'atemp', 'hum','weather_0',
        'weather_1','weather_2','weather_3','casual', 'registered', 'cnt']
df_season_mnth_weekday_hours_mod=df_season_mnth_weekday_hours_mod[reordered_columns]
df_season_mnth_weekday_hours_mod.head()

Unnamed: 0,season_1,season_2,season_3,season_4,mnth_1,mnth_2,mnth_3,mnth_4,mnth_5,mnth_6,...,temp,atemp,hum,weather_0,weather_1,weather_2,weather_3,casual,registered,cnt
0,1,0,0,0,1,0,0,0,0,0,...,0.24,0.2879,0.81,0,0,0,0,3,13,16
1,1,0,0,0,1,0,0,0,0,0,...,0.22,0.2727,0.8,1,0,0,0,8,32,40
2,1,0,0,0,1,0,0,0,0,0,...,0.22,0.2727,0.8,0,1,0,0,5,27,32
3,1,0,0,0,1,0,0,0,0,0,...,0.24,0.2879,0.75,0,0,1,0,3,10,13
4,1,0,0,0,1,0,0,0,0,0,...,0.24,0.2879,0.75,0,0,0,1,0,1,1


# Final checkpoint

In [127]:
df_preprocessed=df_season_mnth_weekday_hours_mod.copy()

# Now it turns out our data frame is ready for statistical Anlaysis,and we export our datafarame as a csv file

In [128]:
df_preprocessed.to_csv('bike_sharing_preprocessed.csv', index=False)

End of Preprocessing