### Implement the Data Quality Checks

In [1]:
import pandas as pd
## Load the raw trip records into a DataFrame.
## NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
## like a previously saved index -- confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer="chicago_bikes.csv")

In [2]:
## Peek at the first two rows to see which columns are available.
df.head(n=2)

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,9080545,3/31/2016 23:30,3/31/2016 23:46,2295,926,156,Clark St & Wellington Ave,166,Ashland Ave & Wrightwood Ave,Subscriber,Male,1990.0
1,9080521,3/31/2016 22:59,3/31/2016 23:02,3439,198,259,California Ave & Francis Pl,276,California Ave & North Ave,Subscriber,Male,1974.0


In [3]:
## Express tripduration in minutes by dividing the stored values by 60.
## NOTE: the column is overwritten, so running this cell a second time
## divides by 60 again -- run it exactly once per fresh load.
df['tripduration'] = df['tripduration'].div(60)

In [4]:
## Summary statistics for tripduration; the max is far above the upper
## quartile, which points to a few extreme values.
df.loc[:, 'tripduration'].describe()

count    72131.000000
mean        16.563629
std         32.848301
min          1.000000
25%          6.800000
50%         11.683333
75%         19.566667
max       1439.416667
Name: tripduration, dtype: float64

In [5]:
## Inspect the upper tail of the distribution: 99th percentile of tripduration.
df['tripduration'].quantile(q=0.99)

89.68499999999986

In [6]:
## Go further into the tail: 99.9th percentile of tripduration.
df['tripduration'].quantile(q=0.999)

333.6881666667107

In [7]:
## Number of trips longer than 333 minutes (roughly the 99.9th percentile).
## These instances aren't very frequent, but is this usage more prominent
## among a certain type of user?
len(df.loc[df['tripduration'] > 333])

73

In [8]:
## Share of each user type among trips longer than 333 minutes.
## These are probably instances where bikes are used for long rides.
df.loc[df['tripduration'] > 333, 'usertype'].value_counts(normalize=True)

Customer      0.534247
Subscriber    0.465753
Name: usertype, dtype: float64

In [9]:
## Baseline for comparison: share of each user type across all trips.
df.loc[:, 'usertype'].value_counts(normalize=True)

Subscriber    0.762252
Customer      0.237748
Name: usertype, dtype: float64

In [10]:
## Are the extreme observations shifting the mean a lot?
## Mean over every trip that is NOT in the extreme set used earlier
## (tripduration > 333). Using `<=` makes this the exact complement, so a
## trip of exactly 333 minutes is kept instead of falling into neither group.
df.loc[df['tripduration'] <= 333, 'tripduration'].mean()

15.734538843709236

In [11]:
## Mean over all trips, extremes included. It is close to the trimmed mean,
## so we need not worry about the few extreme observations.
df.loc[:, 'tripduration'].mean()

16.563629368787343

In [12]:
## Summary statistics for the birthyear column.
df.loc[:, 'birthyear'].describe()

count    54986.000000
mean      1980.423799
std         10.823393
min       1899.000000
25%       1975.000000
50%       1984.000000
75%       1989.000000
max       2000.000000
Name: birthyear, dtype: float64

In [13]:
## Look at the lower end of the tail: 10th percentile of birthyear.
df['birthyear'].quantile(q=0.1)

1963.0

In [17]:
## 1st percentile of birthyear -- one can have very old people in the data.
df['birthyear'].quantile(q=0.01)

1951.0

In [18]:
## Missing values: number of rows with no gender recorded.
df['gender'].isna().sum()

17154

In [19]:
## Number of rows with no birthyear recorded.
df['birthyear'].isna().sum()

17145

In [20]:
## Are the rows missing gender the same rows that are missing birthyear?
## Start by previewing two rows where gender is missing.
df.loc[df['gender'].isna()].head(n=2)

Unnamed: 0,trip_id,starttime,stoptime,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
23,9079139,3/31/2016 18:43,3/31/2016 19:05,4232,21.816667,85,Michigan Ave & Oak St,210,Ashland Ave & Division St,Customer,,
39,9078000,3/31/2016 17:35,3/31/2016 17:58,1997,22.316667,2,Michigan Ave & Balbo Ave,6,Dusable Harbor,Customer,,


In [21]:
## (rows, columns) of the subset where gender is missing.
df.loc[df['gender'].isna()].shape

(17154, 12)

In [22]:
## Among rows missing gender, count how many are also missing birthyear.
df.loc[df['gender'].isna(), 'birthyear'].isna().sum()

17145

In [23]:
## Almost every row missing gender is also missing birthyear.
## Are these a special type of user? Share of each user type in that subset:
df.loc[df['gender'].isna(), 'usertype'].value_counts(normalize=True)

Customer      0.999475
Subscriber    0.000525
Name: usertype, dtype: float64

In [24]:
## Overall user-type shares again, for comparison with the missing-gender subset.
df.loc[:, 'usertype'].value_counts(normalize=True)

Subscriber    0.762252
Customer      0.237748
Name: usertype, dtype: float64

In [25]:
## What can we do? Mark these rows with an explicit 'missing' flag.
## Assign the filled column back to df: calling fillna(..., inplace=True) on
## the df['gender'] selection acts on an intermediate object, which pandas
## warns about and which leaves df unchanged once Copy-on-Write is enabled.
df['gender'] = df['gender'].fillna('missing')

In [26]:
## birthyear is deliberately left un-imputed: there is no point filling it in,
## and rows without it will need to be dropped at analysis time.
## Persist the frame without writing the row index.
df.to_csv(path_or_buf='cleaned_bikes.csv', index=False)