# Bikes Analysis Dataset
## Purpose
> A dataset containing all the required fields to build AI/ML models to find the profit 
## Description
> ### --> 10886 records of bikes dataset, containing:

        -   Datetime
        -   Season
        -   Holiday
        -   Workingday
        -   Weather
        -   Temp
        -   Humidity
        -   Windspeed
        -   Casual
        -   Registered
        -   Rented_bikes_count

In [2]:
import numpy as np
import pandas as pd

In [5]:
raw_df=pd.read_csv("../data/raw/bikes.csv")
df=raw_df.copy()

In [7]:
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,rented_bikes_count
0,2011-01-01 00:00:00,Spring,0.0,0.0,Clear,9.84,81.0,,3,13,16
1,2011-01-01 01:00:00,Spring,0.0,0.0,,9.02,80.0,0.0,8,32,40
2,2011-01-01 02:00:00,Spring,0.0,0.0,Clear,9.02,,0.0,5,27,32
3,2011-01-01 03:00:00,Spring,0.0,0.0,Clear,9.84,75.0,0.0,3,10,13
4,2011-01-01 04:00:00,,0.0,0.0,Clear,,75.0,,0,1,1


# EDA (Exploratory Data Analyst)

In [8]:
df.shape

(10886, 11)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   datetime            10886 non-null  object 
 1   season              10672 non-null  object 
 2   holiday             10030 non-null  float64
 3   workingday          9388 non-null   float64
 4   weather             8746 non-null   object 
 5   temp                8104 non-null   float64
 6   humidity            7462 non-null   float64
 7   windspeed           6820 non-null   float64
 8   casual              10886 non-null  int64  
 9   registered          10886 non-null  int64  
 10  rented_bikes_count  10886 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 935.6+ KB


In [13]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
holiday,10030.0,0.029113,0.168131,0.0,0.0,0.0,0.0,1.0
workingday,9388.0,0.678206,0.467189,0.0,0.0,1.0,1.0,1.0
temp,8104.0,20.317665,7.818568,0.82,13.94,20.5,26.24,41.0
humidity,7462.0,61.790673,19.262084,0.0,47.0,62.0,77.0,100.0
windspeed,6820.0,12.708806,8.131154,0.0,7.0015,12.998,16.9979,56.9969
casual,10886.0,36.021955,49.960477,0.0,4.0,17.0,49.0,367.0
registered,10886.0,155.552177,151.039033,0.0,36.0,118.0,222.0,886.0
rented_bikes_count,10886.0,191.574132,181.144454,1.0,42.0,145.0,284.0,977.0


In [14]:
df.isnull().sum()

datetime                 0
season                 214
holiday                856
workingday            1498
weather               2140
temp                  2782
humidity              3424
windspeed             4066
casual                   0
registered               0
rented_bikes_count       0
dtype: int64

# Data Cleaning

### Cleaning of Holiday & Workingday

In [16]:
def map_holiday(x):
    if str(x['holiday'])=='nan' and str(x['workingday'])!='nan':
        if x['workingday']==0:
            return 1
        else:
            return 0
    else:
        return x['holiday']
    
df['holiday']=df[['holiday','workingday']].apply(map_holiday,axis=1)

In [17]:
def map_workingday(x):
    if str(x['workingday'])=='nan' and str(x['holiday'])!='nan':
        if x['holiday']==0:
            return 1
        else:
            return 0
    else:
        return x['workingday']
    
df['workingday']=df[['holiday','workingday']].apply(map_workingday,axis=1)

In [18]:
df.isnull().sum()

datetime                 0
season                 214
holiday                120
workingday             120
weather               2140
temp                  2782
humidity              3424
windspeed             4066
casual                   0
registered               0
rented_bikes_count       0
dtype: int64

In [19]:
df[df.holiday.isnull()==True]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,rented_bikes_count
17,2011-01-01 17:00:00,Spring,,,Mist,18.04,82.0,19.0012,15,52,67
273,2011-01-12 20:00:00,,,,Clear,6.56,50.0,,2,54,56
307,2011-01-14 07:00:00,Spring,,,Clear,,,11.0014,0,70,70
308,2011-01-14 08:00:00,Spring,,,Clear,4.92,68.0,,2,156,158
535,2011-02-05 11:00:00,Spring,,,Rainy,9.02,100.0,8.9981,1,46,47
...,...,...,...,...,...,...,...,...,...,...,...
10404,2012-11-18 22:00:00,Winter,,,Mist,14.76,66.0,,8,66,74
10583,2012-12-07 09:00:00,Winter,,,Mist,12.30,81.0,8.9981,10,291,301
10585,2012-12-07 11:00:00,Winter,,,Mist,13.12,,12.9980,20,183,203
10782,2012-12-15 16:00:00,Winter,,,,16.40,,8.9981,85,328,413


In [20]:
df.datetime=pd.to_datetime(df['datetime'])

In [21]:
df['year']=df.datetime.dt.year
df['month']=df.datetime.dt.month
df['day']=df['datetime'].dt.day_name()

In [22]:
def day_holiday(x):
    if x['day']=='Saturday' or x['day']=='Sunday':
        return 1 
    else:
        return 0
    
df['holiday']=df[['holiday','day']].apply(day_holiday,axis=1)

In [23]:
df['workingday']=df[['holiday','workingday']].apply(map_workingday,axis=1)

In [24]:
df[df.holiday.isnull()==True]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,casual,registered,rented_bikes_count,year,month,day


In [25]:
df.isnull().sum()

datetime                 0
season                 214
holiday                  0
workingday               0
weather               2140
temp                  2782
humidity              3424
windspeed             4066
casual                   0
registered               0
rented_bikes_count       0
year                     0
month                    0
day                      0
dtype: int64

In [26]:
def map_season(x):
    if str(x['season'])=='nan':
        if x['month'] in ([12,1,2]):
            return 'Winter'
        elif x['month'] in ([3,4,5]):
            return 'Spring'
        elif x['month'] in ([6,7,8]):
            return 'Summer'
        else:
            return 'Fall'
    else:
        return x['season']
    
df['season']=df.apply(map_season,axis=1)

In [27]:
df[['season','holiday','workingday','month','day']]

Unnamed: 0,season,holiday,workingday,month,day
0,Spring,1,0.0,1,Saturday
1,Spring,1,0.0,1,Saturday
2,Spring,1,0.0,1,Saturday
3,Spring,1,0.0,1,Saturday
4,Winter,1,0.0,1,Saturday
...,...,...,...,...,...
10881,Winter,0,1.0,12,Wednesday
10882,Winter,0,1.0,12,Wednesday
10883,Winter,0,1.0,12,Wednesday
10884,Winter,0,1.0,12,Wednesday


In [28]:
def season(x):
    if x['month'] in ([12,1,2]):
        return 'Winter'
    elif x['month'] in ([3,4,5]):
        return 'Spring'
    elif x['month'] in ([6,7,8]):
        return 'Summer'
    else:
        return 'Fall'
    
df['season']=df.apply(season,axis=1)

In [29]:
df.isnull().sum()

datetime                 0
season                   0
holiday                  0
workingday               0
weather               2140
temp                  2782
humidity              3424
windspeed             4066
casual                   0
registered               0
rented_bikes_count       0
year                     0
month                    0
day                      0
dtype: int64

In [30]:
df.dropna(axis=0,inplace=True)

In [31]:
df.isnull().sum()

datetime              0
season                0
holiday               0
workingday            0
weather               0
temp                  0
humidity              0
windspeed             0
casual                0
registered            0
rented_bikes_count    0
year                  0
month                 0
day                   0
dtype: int64

In [33]:
df.to_csv('../data/processed/EDA & Cleaning.csv')