In [1]:
import pandas as pd
# Disable pandas' display truncation: show every row and every column of displayed frames.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
# Read the two monthly trip exports (201912 and 202001) in order.
dec, jan = map(
    pd.read_csv,
    (
        "Resources/original_data/201912-citibike-tripdata.csv",
        "Resources/original_data/202001-citibike-tripdata.csv",
    ),
)

In [3]:
# Label every trip with the month it came from (added as a column on each monthly frame).
dec["period"] = "Dec-19"
jan["period"] = "Jan-20"

# Stack both months row-wise under a fresh 0..n-1 index.
seasonal_df = pd.concat(objs=(dec, jan), axis="index", ignore_index=True)

print(seasonal_df.shape)
seasonal_df.head()

(2195806, 16)


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,period
0,602,2019-12-01 00:00:05.5640,2019-12-01 00:10:07.8180,3382,Carroll St & Smith St,40.680611,-73.994758,3304,6 Ave & 9 St,40.668127,-73.983776,41932,Subscriber,1970,1,Dec-19
1,1206,2019-12-01 00:00:10.9630,2019-12-01 00:20:17.8820,362,Broadway & W 37 St,40.751726,-73.987535,500,Broadway & W 51 St,40.762288,-73.983362,18869,Customer,1999,1,Dec-19
2,723,2019-12-01 00:00:11.8180,2019-12-01 00:12:14.8310,146,Hudson St & Reade St,40.71625,-74.009106,238,Bank St & Washington St,40.736197,-74.008592,15334,Subscriber,1997,1,Dec-19
3,404,2019-12-01 00:00:12.2200,2019-12-01 00:06:56.8860,3834,Irving Ave & Halsey St,40.69467,-73.90663,3827,Halsey St & Broadway,40.68565,-73.91564,41692,Customer,1995,1,Dec-19
4,1059,2019-12-01 00:00:14.7230,2019-12-01 00:17:54.1860,500,Broadway & W 51 St,40.762288,-73.983362,3323,W 106 St & Central Park West,40.798186,-73.960591,40156,Subscriber,1961,1,Dec-19


## Clean data 1: check NA values

In [4]:
# Count missing values per column (isnull is an alias of isna).
seasonal_df.isnull().sum()

tripduration               0
starttime                  0
stoptime                   0
start station id           0
start station name         0
start station latitude     0
start station longitude    0
end station id             0
end station name           0
end station latitude       0
end station longitude      0
bikeid                     0
usertype                   0
birth year                 0
gender                     0
period                     0
dtype: int64

## Clean data 2: ensure station id/name/geocode consistent

In [5]:
# Start stations first: number of distinct ids, names, latitudes and longitudes.
seasonal_df.loc[
    :,
    [
        "start station id",
        "start station name",
        "start station latitude",
        "start station longitude",
    ],
].nunique()

start station id           895
start station name         897
start station latitude     927
start station longitude    922
dtype: int64

In [6]:
# The station info is somewhat inconsistent: the counts show that some station ids have several geocodes available.
# Check the details per station id below:
# For every start station id, count the distinct names, latitudes and longitudes.
# GroupBy.nunique() yields the same table as the per-column agg mapping of
# pd.Series.nunique, using the built-in grouped reduction.
start_station_df = seasonal_df.groupby("start station id")[
    ["start station name", "start station latitude", "start station longitude"]
].nunique()
start_station_df

Unnamed: 0_level_0,start station name,start station latitude,start station longitude
start station id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
72,1,1,1
79,1,1,1
82,1,1,1
83,1,1,1
116,1,1,1
119,1,1,1
120,1,1,1
127,1,1,1
128,1,1,1
143,1,1,1


In [17]:
# find an example to see how to deal with different lat & long
# Trips starting at station 251, then the frequency of each (latitude, longitude) pair.
test = seasonal_df[seasonal_df["start station id"].eq(251)]
test.value_counts(subset=["start station latitude", "start station longitude"])

start station latitude  start station longitude
40.72318                -73.9948                   5098
40.72200                -73.9950                      2
dtype: int64

In [22]:
# The example above suggests that having several geocodes for one station is unlikely to impact the results.
# Loop through all stations with the same issue to check whether that holds for each of them;
# if the number of deviating records is not big, leave the data as it is.
def check_station(id, df=None, kind="start"):
    """Print how often each (latitude, longitude) pair occurs for one station.

    Parameters
    ----------
    id : int
        Station id to inspect. The name shadows the built-in ``id``; it is kept
        so existing positional and keyword callers keep working.
    df : pandas.DataFrame, optional
        Trip table to inspect. When omitted, the notebook-level ``seasonal_df``
        is used, as before.
    kind : str, optional
        Column prefix selecting which station columns to read:
        ``"<kind> station id"``, ``"<kind> station latitude"`` and
        ``"<kind> station longitude"``. Defaults to ``"start"``; pass ``"end"``
        to run the same check on end stations.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Resolve the global lazily so the default keeps tracking the current seasonal_df.
    trips = seasonal_df if df is None else df
    geo_cols = [f"{kind} station latitude", f"{kind} station longitude"]
    station_trips = trips.loc[trips[f"{kind} station id"] == id]
    print(f"For station {id}")
    print(station_trips[geo_cols].value_counts())
    print("============================")

# Stations to inspect: every id whose distinct-latitude or distinct-longitude count is not exactly 1.
check_df = start_station_df.loc[
    start_station_df[["start station latitude", "start station longitude"]]
    .ne(1)
    .any(axis=1)
]
check_list = check_df.index.tolist()

# Print the geocode frequency report for each of those stations.
for station in check_list:
    check_station(station)

For station 167
start station latitude  start station longitude
40.748901               -73.976049                 6282
40.749000               -73.977000                    1
dtype: int64
For station 251
start station latitude  start station longitude
40.72318                -73.9948                   5098
40.72200                -73.9950                      2
dtype: int64
For station 257
start station latitude  start station longitude
40.719392               -74.002472                 5089
40.719000               -74.001000                    1
dtype: int64
For station 267
start station latitude  start station longitude
40.750977               -73.987654                 3790
40.752000               -73.989000                    1
dtype: int64
For station 281
start station latitude  start station longitude
40.764397               -73.973715                 6257
40.764000               -73.974000                    1
dtype: int64
For station 285
start station latitude  start station l

start station latitude  start station longitude
40.73705                -73.990093                 13605
40.73700                -73.989000                     1
dtype: int64
For station 503
start station latitude  start station longitude
40.738274               -73.98752                  6453
40.737000               -73.98900                     2
dtype: int64
For station 504
start station latitude  start station longitude
40.732219               -73.981656                 8386
40.731000               -73.983000                    1
dtype: int64
For station 505
start station latitude  start station longitude
40.749013               -73.988484                 11826
40.749000               -73.989000                     1
dtype: int64
For station 508
start station latitude  start station longitude
40.763414               -73.996674                 4456
40.764000               -73.998000                    1
dtype: int64
For station 513
start station latitude  start station longitude
40.

For station 3606
start station latitude  start station longitude
40.74252                -73.948852                 644
40.74300                -73.950000                   1
dtype: int64
For station 3613
start station latitude  start station longitude
40.745038               -73.957539                 702
40.746000               -73.959000                   1
dtype: int64
For station 3630
start station latitude  start station longitude
40.803865               -73.955931                 1320
40.803000               -73.956000                    1
dtype: int64
For station 3656
start station latitude  start station longitude
40.723077               -73.985836                 7020
40.722000               -73.986000                    1
dtype: int64
For station 3668
start station latitude  start station longitude
40.723957               -73.949844                 2189
40.725000               -73.950000                    1
dtype: int64
For station 3690
start station latitude  start station

#### So we would leave the start station as it is, as it would not impact the final conclusion

In [23]:
# Same overview for end stations: number of distinct ids, names, latitudes and longitudes.
seasonal_df.loc[
    :,
    [
        "end station id",
        "end station name",
        "end station latitude",
        "end station longitude",
    ],
].nunique()

end station id           899
end station name         901
end station latitude     930
end station longitude    925
dtype: int64

In [26]:
# For end stations the picture is similar to start stations, with roughly 20 more geocodes available;
# compared to the scale of the data, we assume this would not impact the final result either.

# Persist the combined Dec-19 / Jan-20 frame to CSV.
# DataFrame.to_csv returns None when it is given a path, so the result is not
# bound to a name (the former `ana_data` variable was always None).
# NOTE: the row index is written as an unnamed first column (pandas default);
# read the file back with index_col=0 to avoid an "Unnamed: 0" column.
seasonal_df.to_csv("Resources/ana_data.csv")