## Preprocessing

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Notebook-wide configuration.
# Seed NumPy's legacy global RNG so any NumPy-based randomness is repeatable.
np.random.seed(42)
# Default seaborn theme with 7x7 figures and slightly enlarged fonts.
sns.set(
    font_scale=1.2,
    rc={'figure.figsize': [7, 7]},
)

In [2]:
# Load the pickled frame that the rest of this notebook uses as training data.
# NOTE(review): presumably produced by an earlier notebook stage - confirm.
# Unpickling executes arbitrary code, so only load files from a trusted source.
df = pd.read_pickle('processed_1.pickle')

In [3]:
# Read the train/test CSV splits from the working directory.
# NOTE(review): df_train is not referenced again in the visible cells; the
# pickled `df` is what gets used as training data - confirm this is intended.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('training_data.csv', 'testing_data.csv')
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month_name,month,day_of_week,hour,is_rush_hour,is_weekend
10378,2012-11-17 20:00:00,winter,0,0,Cloudy,13.94,17.425,66,6.0032,30,142,172,2012,November,11,Saturday,20,0,1
2815,2011-07-06 05:00:00,fall,0,1,Clear,27.88,31.82,83,6.0032,5,30,35,2011,July,7,Wednesday,5,0,0
8695,2012-08-04 16:00:00,fall,0,0,Clear,36.9,40.91,39,19.9995,197,253,450,2012,August,8,Saturday,16,1,1
9026,2012-08-18 11:00:00,fall,0,0,Clear,29.52,33.335,51,0.0,180,356,536,2012,August,8,Saturday,11,0,1
1543,2011-04-10 04:00:00,summer,0,0,Cloudy,14.76,18.18,93,7.0015,3,2,5,2011,April,4,Sunday,4,0,1


In [5]:
# Columns treated as categorical (cast to 'category' and ordinal-encoded below).
cat_features = [
    'season',
    'holiday',
    'workingday',
    'weather',
    'hour',
    'month_name',
    'day_of_week',
    'is_rush_hour',
    'is_weekend',
]

In [6]:
# Columns treated as continuous numeric features (standard-scaled below).
num_features = [
    'temp',
    'humidity',
    'windspeed',
]

In [7]:
# Keep only the model inputs (numeric first, then categorical) plus the two
# target columns. The explicit .copy() makes `df` an independent frame, so the
# column assignments in the following cells modify it directly instead of
# writing into a slice of the loaded frame (which makes pandas emit
# SettingWithCopyWarning).
df = df[num_features + cat_features + ['casual', 'registered']].copy()

In [8]:
# Correct dtypes: store every numeric feature as float64.
# Values are parsed with errors='coerce' *before* the cast: a bare
# astype('float64') raises on the first non-numeric entry, whereas coercion
# turns such entries into NaN so the missing-data check further down can
# report them. assign() returns a new frame, so no slice is written in place.
df = df.assign(**{
    col: pd.to_numeric(df[col], errors='coerce').astype('float64')
    for col in num_features
})

In [9]:
# Make sure every numeric feature holds real numbers: anything that cannot be
# parsed is replaced by NaN (errors='coerce') instead of raising.
df = df.assign(**{
    numeric_col: pd.to_numeric(df[numeric_col], errors='coerce')
    for numeric_col in num_features
})

In [10]:
# Correct dtypes: store every categorical feature with pandas' 'category' dtype,
# using a single astype call driven by a column -> dtype mapping.
df = df.astype({cat_col: 'category' for cat_col in cat_features})

In [11]:
# Print the frequency of every level of each categorical column so that
# invalid or unexpected values can be spotted by eye.
for feature in cat_features:
    print(
        f'>>> {feature} <<<',
        df[feature].value_counts(),
        '================',
        sep='\n',
    )

>>> season <<<
summer    2201
winter    2196
fall      2182
spring    2129
Name: season, dtype: int64
>>> holiday <<<
0    8458
1     250
Name: holiday, dtype: int64
>>> workingday <<<
1    5924
0    2784
Name: workingday, dtype: int64
>>> weather <<<
Clear     5757
Cloudy    2266
Snow       685
Name: weather, dtype: int64
>>> hour <<<
23    379
17    378
15    376
22    372
0     370
2     369
10    369
5     366
16    366
18    366
9     365
19    364
14    364
11    360
8     360
12    358
13    357
7     356
4     355
20    354
6     353
21    352
1     351
3     348
Name: hour, dtype: int64
>>> month_name <<<
November     749
May          745
August       737
June         729
July         728
April        727
October      727
February     726
December     720
September    717
March        710
January      693
Name: month_name, dtype: int64
>>> day_of_week <<<
Sunday       1269
Saturday     1265
Friday       1247
Tuesday      1244
Thursday     1231
Wednesday    1229
Monday       12

- Done: none of the categorical columns contains an invalid value.

**Work With Missing Data**

In [12]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

temp            0
humidity        0
windspeed       0
season          0
holiday         0
workingday      0
weather         0
hour            0
month_name      0
day_of_week     0
is_rush_hour    0
is_weekend      0
casual          0
registered      0
dtype: int64

- Done: there are no NaN values in any column.

**working with categorical data**

In [13]:
# Encoder that maps each category level to an integer code.
# NOTE(review): with default settings the codes follow the sorted category
# values (the transformed preview shows April -> 0.0), not calendar order -
# confirm that is acceptable for month_name / day_of_week.
ord_encoder = OrdinalEncoder()

In [14]:
# Single-step pipeline applied to the categorical columns: ordinal encoding.
categorical_transformer = Pipeline(
    [('ordinal_encoding', ord_encoder)]
)

**working with numerical data**

In [15]:
# Scaler used for the numeric columns (default StandardScaler settings).
scaler = StandardScaler()

In [16]:
# Single-step pipeline applied to the numeric columns: standard scaling.
numeric_transformer = Pipeline(
    [('scaling', scaler)]
)

**making pipeline**

In [17]:
# Column-wise preprocessing: scale the numeric features and ordinal-encode the
# categorical ones.
# Columns are selected by name via num_features / cat_features instead of
# hard-coded positions, so the right transformer is applied even if the column
# order of the input frame changes, and the feature lists stay the single
# source of truth. Name-based selection requires a DataFrame input.
# The output keeps the transformer order (numeric block, then categorical
# block); any column not listed is passed through unchanged at the end.
preprocessor = make_column_transformer(
    (numeric_transformer, num_features),
    (categorical_transformer, cat_features),
    remainder='passthrough',
)

**Split data**

In [18]:
# Training split taken from the pickled frame: everything except the two
# targets is a feature; the targets are the casual and registered counts.
X_train = df.drop(columns=['casual', 'registered'])
y_train = df.loc[:, ['casual', 'registered']]

In [19]:
# Test split from testing_data.csv: drop the targets and their total.
# NOTE(review): unlike X_train, this frame is not restricted to
# num_features + cat_features - confirm the remaining columns match the
# training features before passing X_test to the preprocessor.
X_test = df_test.drop(columns=['casual', 'registered', 'count'])
y_test = df_test.loc[:, ['casual', 'registered']]

In [20]:
# Fit the preprocessor on the training features and wrap the transformed
# array in a DataFrame labelled numeric-first, then categorical.
# NOTE(review): the new frame gets a fresh 0..n-1 index while y_train keeps
# df's original index - pair them by position, not by label.
X_train_trans = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
    columns=[*num_features, *cat_features],
)

In [21]:
# Preview the first five transformed rows.
X_train_trans.head(n=5)

Unnamed: 0,temp,humidity,windspeed,season,holiday,workingday,weather,hour,month_name,day_of_week,is_rush_hour,is_weekend
0,-0.810848,0.218929,-0.834323,3.0,0.0,0.0,1.0,20.0,9.0,2.0,0.0,1.0
1,0.982374,1.100479,-0.834323,0.0,0.0,1.0,0.0,5.0,5.0,6.0,0.0,0.0
2,2.142695,-1.181181,0.868389,0.0,0.0,0.0,0.0,16.0,1.0,2.0,1.0,1.0
3,1.193342,-0.55891,-1.564639,0.0,0.0,0.0,0.0,11.0,1.0,2.0,0.0,1.0
4,-0.705364,1.619038,-0.712876,2.0,0.0,0.0,1.0,4.0,0.0,3.0,0.0,1.0


- The data is now ready for the machine learning model.

In [22]:
# Persist the transformed training features (features only - the targets are
# not part of this frame).
X_train_trans.to_pickle('processed_2.pickle')