In [None]:
# Import necessary libraries

import os
import datetime
import string

import pandas as pd
import numpy as np

import sqlalchemy


In [3]:
# Start from an empty dataframe that will hold the combined bike rental data
df_bike = pd.DataFrame()

In [4]:
# Create a unified dataframe for bike rental data.
# Every file in 'data' whose name starts with 'JC' is read in sorted filename
# order. The frames are collected in a list and concatenated once: calling
# pd.concat inside the loop would re-copy all previously loaded rows for
# every additional file.
bike_frames = [
    pd.read_csv(os.path.join('data', fname))
    for fname in sorted(os.listdir('data'))
    if fname.startswith('JC')
]

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no file matched. ignore_index=True gives a fresh 0..n-1 index, which
# replaces the separate reset_index(drop=True) step.
df_bike = (
    pd.concat(bike_frames, axis=0, ignore_index=True)
    if bike_frames
    else pd.DataFrame()
)

# add an ID column. Purpose: this will be useful for counts and eventually a primary key
df_bike['id'] = df_bike.index

In [5]:
# Let's inspect the first five rows of the combined dataframe
df_bike.head()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,id
0,362,2016-01-01 00:02:52,2016-01-01 00:08:54,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24647,Subscriber,1964.0,2,0
1,200,2016-01-01 00:18:22,2016-01-01 00:21:42,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24605,Subscriber,1962.0,1,1
2,202,2016-01-01 00:18:25,2016-01-01 00:21:47,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24689,Subscriber,1962.0,2,2
3,248,2016-01-01 00:23:13,2016-01-01 00:27:21,3209,Brunswick St,40.724176,-74.050656,3203,Hamilton Park,40.727596,-74.044247,24693,Subscriber,1984.0,1,3
4,903,2016-01-01 01:03:20,2016-01-01 01:18:24,3195,Sip Ave,40.730743,-74.063784,3210,Pershing Field,40.742677,-74.051789,24573,Customer,,0,4


In [6]:
# ...and the last five rows; the final index value also shows the row count
df_bike.tail()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,id
247579,557,2016-12-31 23:10:16,2016-12-31 23:19:33,3214,Essex Light Rail,40.712774,-74.036486,3203,Hamilton Park,40.727596,-74.044247,24465,Subscriber,1981.0,2,247579
247580,2749,2016-12-31 23:29:39,2017-01-01 00:15:29,3183,Exchange Place,40.716247,-74.033459,3183,Exchange Place,40.716247,-74.033459,24389,Customer,,0,247580
247581,173,2016-12-31 23:44:37,2016-12-31 23:47:31,3186,Grove St PATH,40.719586,-74.043117,3270,Jersey & 6th St,40.725289,-74.045572,24641,Subscriber,1978.0,1,247581
247582,2424,2016-12-31 23:44:50,2017-01-01 00:25:14,3214,Essex Light Rail,40.712774,-74.036486,3214,Essex Light Rail,40.712774,-74.036486,26219,Subscriber,1960.0,2,247582
247583,2419,2016-12-31 23:44:50,2017-01-01 00:25:10,3214,Essex Light Rail,40.712774,-74.036486,3214,Essex Light Rail,40.712774,-74.036486,24471,Subscriber,1956.0,1,247583


From a first look at the data dictionary and the output of `df_bike.tail`:
- There are about 250k records (this can be read from the index).
- According to the data dictionary:
    - `Trip Duration` is in seconds.
    - `Gender` can be `0=unknown`, `1=male`, `2=female`.
    - `User Type` has values `Customer=24 hour pass or 3 day user` and `Subscriber=Annual Member`.

Let's explore by looking at numeric fields using `.describe()`.

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df_bike.describe()

Unnamed: 0,Trip Duration,Start Station ID,Start Station Latitude,Start Station Longitude,End Station ID,End Station Latitude,End Station Longitude,Bike ID,Birth Year,Gender,id
count,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,247584.0,228585.0,247584.0,247584.0
mean,885.6305,3207.065206,40.723121,-74.046438,3203.572553,40.722594,-74.045855,24935.260481,1979.335276,1.123534,123791.5
std,35937.98,26.955103,0.008199,0.011211,61.579494,0.007958,0.011283,748.469712,9.596809,0.518687,71471.488861
min,61.0,3183.0,40.69264,-74.096937,147.0,40.692216,-74.096937,14552.0,1900.0,0.0,0.0
25%,248.0,3186.0,40.717732,-74.050656,3186.0,40.71654,-74.050444,24491.0,1974.0,1.0,61895.75
50%,390.0,3201.0,40.721525,-74.044247,3199.0,40.721124,-74.043117,24609.0,1981.0,1.0,123791.5
75%,666.0,3211.0,40.727596,-74.038051,3211.0,40.727224,-74.036486,24719.0,1986.0,1.0,185687.25
max,16329810.0,3426.0,40.752559,-74.032108,3426.0,40.801343,-73.95739,27274.0,2000.0,2.0,247583.0


Some observations:
- `Trip Duration` jumps by more than four orders of magnitude between the 75th percentile (666 seconds) and the maximum (16,329,810 seconds, roughly 189 days).
- `Birth Year` has a minimum value of 1900, which would correspond to an age of 116.

Let's check if there are missing values or duplicated rows.

In [8]:
# Count the missing (NaN) values in each column
df_bike.isnull().sum()

Trip Duration                  0
Start Time                     0
Stop Time                      0
Start Station ID               0
Start Station Name             0
Start Station Latitude         0
Start Station Longitude        0
End Station ID                 0
End Station Name               0
End Station Latitude           0
End Station Longitude          0
Bike ID                        0
User Type                    380
Birth Year                 18999
Gender                         0
id                             0
dtype: int64

In [11]:
# Flag rows that are exact copies of an earlier row, then count them
duplicate_mask = df_bike.duplicated()
duplicates = duplicate_mask.sum()
print(duplicates)

0


We'll need to investigate both `User Type` and `Birth Year` to see what data is missing. We'll also want to look into `Gender`: there's no missing data, but we know from the data dictionary that 0 corresponds to `Unknown`.


In [12]:
# Show the dtype pandas assigned to each column
df_bike.dtypes

Trip Duration                int64
Start Time                  object
Stop Time                   object
Start Station ID             int64
Start Station Name          object
Start Station Latitude     float64
Start Station Longitude    float64
End Station ID               int64
End Station Name            object
End Station Latitude       float64
End Station Longitude      float64
Bike ID                      int64
User Type                   object
Birth Year                 float64
Gender                       int64
id                           int64
dtype: object

For now, let's finish the initial exploration by checking data types (above). It looks like `Start Time` and `Stop Time` need to be datetimes, which we can fix now. Other incorrect data types, like `Birth Year`, will have to wait until we look at NaNs.

In [14]:
# Convert the two timestamp columns from strings (object dtype) to datetimes.
# Note: infer_datetime_format is deliberately not passed; it was deprecated in
# pandas 2.0, where format inference became the default behaviour.
for time_col in ('Start Time', 'Stop Time'):
    df_bike[time_col] = pd.to_datetime(df_bike[time_col])

# Confirm the new dtypes
df_bike.dtypes

Trip Duration                       int64
Start Time                 datetime64[ns]
Stop Time                  datetime64[ns]
Start Station ID                    int64
Start Station Name                 object
Start Station Latitude            float64
Start Station Longitude           float64
End Station ID                      int64
End Station Name                   object
End Station Latitude              float64
End Station Longitude             float64
Bike ID                             int64
User Type                          object
Birth Year                        float64
Gender                              int64
id                                  int64
dtype: object

## Cleaning and Transforming Citibike Data

Columns to investigate further:
- `Birth Year` (suspiciously small minimum, missing data)
- `User Type` (missing data)
- `Trip Duration` (suspiciously large maximum)
- `Gender` (what are the unknowns)


First some housekeeping to remove spaces and uppercase letters from the column names.