In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import glob
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
files = glob.glob('data/JC-2016*-citibike-tripdata.csv')
df_list = []
for filename in files: 
    data = pd.read_csv(filename)
    df_list.append(data)
df = pd.concat(df_list)    

In [5]:
df.dtypes

Trip Duration                int64
Start Time                  object
Stop Time                   object
Start Station ID             int64
Start Station Name          object
Start Station Latitude     float64
Start Station Longitude    float64
End Station ID               int64
End Station Name            object
End Station Latitude       float64
End Station Longitude      float64
Bike ID                      int64
User Type                   object
Birth Year                 float64
Gender                       int64
dtype: object

Start Time, Stop Time, and Birth Year seem like they should be timestamps and dates, but are objects. 

In [8]:
df.describe

<bound method NDFrame.describe of        Trip Duration           Start Time            Stop Time  \
0                361  2016-02-01 00:31:18  2016-02-01 00:37:19   
1                297  2016-02-01 01:55:05  2016-02-01 02:00:02   
2               1155  2016-02-01 02:40:05  2016-02-01 02:59:20   
3               1769  2016-02-01 05:11:28  2016-02-01 05:40:58   
4                935  2016-02-01 05:48:24  2016-02-01 06:03:59   
...              ...                  ...                  ...   
19483            249  2016-05-31 23:16:00  2016-05-31 23:20:10   
19484            650  2016-05-31 23:18:32  2016-05-31 23:29:23   
19485           2048  2016-05-31 23:25:28  2016-05-31 23:59:36   
19486            455  2016-05-31 23:31:57  2016-05-31 23:39:32   
19487            239  2016-05-31 23:47:38  2016-05-31 23:51:37   

       Start Station ID Start Station Name  Start Station Latitude  \
0                  3202       Newport PATH               40.727224   
1                  3195          

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 247584 entries, 0 to 19487
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Trip Duration            247584 non-null  int64  
 1   Start Time               247584 non-null  object 
 2   Stop Time                247584 non-null  object 
 3   Start Station ID         247584 non-null  int64  
 4   Start Station Name       247584 non-null  object 
 5   Start Station Latitude   247584 non-null  float64
 6   Start Station Longitude  247584 non-null  float64
 7   End Station ID           247584 non-null  int64  
 8   End Station Name         247584 non-null  object 
 9   End Station Latitude     247584 non-null  float64
 10  End Station Longitude    247584 non-null  float64
 11  Bike ID                  247584 non-null  int64  
 12  User Type                247204 non-null  object 
 13  Birth Year               228585 non-null  float64
 14  Gender    

User Type has 380 null values, <br> and Birth Year has 18,999 null values

In [13]:
df.shape

(247584, 15)

In [15]:
df['Trip Duration'] = pd.to_timedelta(df['Trip Duration'], unit='s')
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['Stop Time'] = pd.to_datetime(df['Stop Time'])

In [17]:
df.head()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender
0,0 days 00:06:01,2016-02-01 00:31:18,2016-02-01 00:37:19,3202,Newport PATH,40.727224,-74.033759,3203,Hamilton Park,40.727596,-74.044247,24393,Subscriber,1975.0,1
1,0 days 00:04:57,2016-02-01 01:55:05,2016-02-01 02:00:02,3195,Sip Ave,40.730743,-74.063784,3194,McGinley Square,40.72534,-74.067622,24394,Subscriber,1985.0,2
2,0 days 00:19:15,2016-02-01 02:40:05,2016-02-01 02:59:20,3183,Exchange Place,40.716247,-74.033459,3210,Pershing Field,40.742677,-74.051789,24676,Subscriber,1976.0,1
3,0 days 00:29:29,2016-02-01 05:11:28,2016-02-01 05:40:58,3214,Essex Light Rail,40.712774,-74.036486,3203,Hamilton Park,40.727596,-74.044247,24700,Subscriber,1974.0,2
4,0 days 00:15:35,2016-02-01 05:48:24,2016-02-01 06:03:59,3203,Hamilton Park,40.727596,-74.044247,3214,Essex Light Rail,40.712774,-74.036486,24639,Subscriber,1974.0,2


In [19]:
df.tail(20)

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender
19468,0 days 00:05:23,2016-05-31 22:06:16,2016-05-31 22:11:39,3220,5 Corners Library,40.734961,-74.059503,3215,Central Ave,40.74673,-74.049251,24614,Subscriber,1983.0,1
19469,0 days 00:01:59,2016-05-31 22:06:37,2016-05-31 22:08:37,3202,Newport PATH,40.727224,-74.033759,3199,Newport Pkwy,40.728745,-74.032108,24482,Subscriber,1990.0,1
19470,0 days 00:42:26,2016-05-31 22:06:50,2016-05-31 22:49:16,3214,Essex Light Rail,40.712774,-74.036486,3214,Essex Light Rail,40.712774,-74.036486,24453,Subscriber,1988.0,2
19471,0 days 00:41:45,2016-05-31 22:07:13,2016-05-31 22:48:58,3214,Essex Light Rail,40.712774,-74.036486,3214,Essex Light Rail,40.712774,-74.036486,24429,Subscriber,1984.0,1
19472,0 days 00:02:59,2016-05-31 22:08:22,2016-05-31 22:11:22,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24448,Subscriber,1986.0,1
19473,0 days 00:06:18,2016-05-31 22:13:10,2016-05-31 22:19:29,3202,Newport PATH,40.727224,-74.033759,3199,Newport Pkwy,40.728745,-74.032108,24479,Subscriber,1988.0,2
19474,0 days 00:02:22,2016-05-31 22:22:33,2016-05-31 22:24:56,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,24598,Subscriber,1969.0,1
19475,0 days 00:05:12,2016-05-31 22:23:00,2016-05-31 22:28:12,3186,Grove St PATH,40.719586,-74.043117,3213,Van Vorst Park,40.718489,-74.047727,24651,Subscriber,1959.0,2
19476,0 days 00:04:35,2016-05-31 22:25:42,2016-05-31 22:30:18,3186,Grove St PATH,40.719586,-74.043117,3209,Brunswick St,40.724176,-74.050656,24552,Subscriber,1972.0,1
19477,0 days 00:37:32,2016-05-31 22:26:30,2016-05-31 23:04:02,3193,Lincoln Park,40.724605,-74.078406,3220,5 Corners Library,40.734961,-74.059503,24561,Subscriber,1987.0,1


In [21]:
birth_year_test = {'X': [1975.0, 1985.0, 1976.0, 1974.0, 1974.0, np.nan, 1990.0, 1988.0, 1984.0, 1986.0]}

In [31]:
df_test = pd.DataFrame(data = birth_year_test)

In [33]:
imp = IterativeImputer(max_iter=10, random_state=0)

In [35]:
imp.fit(df_test)

In [37]:
df['Birth Year'] = np.round(imp.fit_transform(df[['Birth Year']]), 1)

In [45]:
user_type = df['User Type'].unique()
user_type

array(['Subscriber', 'Customer', nan], dtype=object)

In [49]:
df = df.dropna()

In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 247204 entries, 0 to 19487
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype          
---  ------                   --------------   -----          
 0   Trip Duration            247204 non-null  timedelta64[ns]
 1   Start Time               247204 non-null  datetime64[ns] 
 2   Stop Time                247204 non-null  datetime64[ns] 
 3   Start Station ID         247204 non-null  int64          
 4   Start Station Name       247204 non-null  object         
 5   Start Station Latitude   247204 non-null  float64        
 6   Start Station Longitude  247204 non-null  float64        
 7   End Station ID           247204 non-null  int64          
 8   End Station Name         247204 non-null  object         
 9   End Station Latitude     247204 non-null  float64        
 10  End Station Longitude    247204 non-null  float64        
 11  Bike ID                  247204 non-null  int64          
 12  User Typ