# Clean and transform Bikeshare data to more useful formats

In [8]:
import sys
from pathlib import Path

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from folium.plugins import HeatMap

# Load the raw bikeshare event log and report how many rows it holds.
df = pd.read_csv('../data/mbike.csv')
print("DF size is {}".format(len(df)))
df.head()


DF size is 1851924


Unnamed: 0,Coords_Latitude,Coords_Longitude,Trip_ID,Bike_Event,User_ID,Date_Time,Date,Time
0,38.978141,-76.928956,Hf3aj78RcGaJJMGRn,StartTrip,24Tswou857XKT9R65,2017-06-14T19:02:21.593Z,2017-06-14,19:02:21
1,38.9782,-76.928471,tp7QKuiJX9DvLKDcf,StartTrip,24Tswou857XKT9R65,2017-06-15T20:07:14.437Z,2017-06-15,20:07:14
2,38.972647,-76.938315,tp7QKuiJX9DvLKDcf,EndTripInsideGeofence,24Tswou857XKT9R65,2017-06-15T20:19:43.891Z,2017-06-15,20:19:43
3,38.978369,-76.928679,j7xEWoy65rDsJfk7M,StartTrip,24Tswou857XKT9R65,2017-06-17T22:25:55.592Z,2017-06-17,22:25:55
4,38.972581,-76.938422,j7xEWoy65rDsJfk7M,EndTripInsideGeofence,24Tswou857XKT9R65,2017-06-17T22:36:07.021Z,2017-06-17,22:36:07


## Cleaning Data 

### 1) Remove anomalies (i.e. events outside the region of interest)

In [9]:
# Bounding box of the region of interest (ROI), as (latitude, longitude):
#   top-right corner   - Beltsville, Powder Mill Rd & Edmonston Rd (39.031398, -76.899542)
#   bottom-left corner - near Prince George's Plaza (38.967931, -76.968837)
LatTR, LongTR = 39.031398, -76.899542
LatBL, LongBL = 38.967931, -76.968837

# Keep only the events whose coordinates lie inside the box (edges included).
latInROI = df['Coords_Latitude'].between(LatBL, LatTR)
longInROI = df['Coords_Longitude'].between(LongBL, LongTR)
dfz = df[latInROI & longInROI].copy()

# Check the min/max to make sure they're reasonable now:
longitudes = dfz['Coords_Longitude']
latitudes = dfz['Coords_Latitude']
print(longitudes.max(), latitudes.max())
print(longitudes.min(), latitudes.min())

# Share of the original rows that fell outside the ROI.
nDropped = abs(len(dfz) - len(df))
pctDropped = round((nDropped / len(df)) * 100, 2)
print(str(pctDropped) + '% of data was not in ROI.')

-76.89956394962675 39.03136761862695
-76.96882684758906 38.9679312
3.96% of data was not in ROI.


### 2) Remove duplicate events with the same Trip_ID and Bike_Event

In [10]:
# Collapse repeated events within a trip: for every (Trip_ID, Bike_Event) pair only
# the first row is kept. This removes the rows corresponding to multiple
# foreground/background events.
print(dfz['Trip_ID'].nunique(dropna=False))  # 79,222 unique trips in the ROI
dfz_noDupes = dfz.drop_duplicates(subset=['Trip_ID', 'Bike_Event'])

# Spot-check a single trip after de-duplication.
sampleTrip = dfz_noDupes.loc[dfz_noDupes['Trip_ID'] == 'L3QADAJtdmgLdenDj']
print(sampleTrip)

79222
******************************************************
       Coords_Latitude  Coords_Longitude            Trip_ID  \
54191        38.992974        -76.937454  L3QADAJtdmgLdenDj   
54192        38.992974        -76.937454  L3QADAJtdmgLdenDj   
54194        38.993227        -76.941564  L3QADAJtdmgLdenDj   

                  Bike_Event            User_ID                 Date_Time  \
54191              StartTrip  36xxpQuhZWX7oAwdD  2017-07-20T13:46:48.005Z   
54192             Foreground  36xxpQuhZWX7oAwdD  2017-07-20T13:46:58.407Z   
54194  EndTripInsideGeofence  36xxpQuhZWX7oAwdD  2017-07-20T14:35:43.494Z   

             Date      Time  
54191  2017-07-20  13:46:48  
54192  2017-07-20  13:46:58  
54194  2017-07-20  14:35:43  


### 3) Convert lat/long to the closest station, so each event is associated with the station it occurs at.

In [11]:
# Only trip starts and trip ends are kept, since other types of events can be
# far from the station.
print(len(dfz_noDupes))  # 499354 non-duplicate events

startEndEvents = [
    'StartTrip',
    'EndTripInsideGeofence',
    'EndTripStoppedByGeofence',
    'EndTripBypassGeofence',
]
dfx = dfz_noDupes[dfz_noDupes['Bike_Event'].isin(startEndEvents)]
print(len(dfx))  # 156602

499354
156602


In [19]:
# Load the station table (Name, Altitude, Longitude, Latitude) and preview it.
dfStations = pd.read_csv('../data/stationsInfo.csv')
print(dfStations.iloc[:2])

                        Name  Altitude  Longitude   Latitude
0  Hollywood Shopping Center        42 -76.921340  39.013710
1       Greenbelt Metro West        30 -76.913859  39.010546


In [20]:
# Copy of the start/end events with an empty StationName column, to be filled in later.
df_withStation = dfx.assign(StationName=None)
print(df_withStation.iloc[:2])

   Coords_Latitude  Coords_Longitude            Trip_ID Bike_Event  \
0        38.978141        -76.928956  Hf3aj78RcGaJJMGRn  StartTrip   
1        38.978200        -76.928471  tp7QKuiJX9DvLKDcf  StartTrip   

             User_ID                 Date_Time        Date      Time  \
0  24Tswou857XKT9R65  2017-06-14T19:02:21.593Z  2017-06-14  19:02:21   
1  24Tswou857XKT9R65  2017-06-15T20:07:14.437Z  2017-06-15  20:07:14   

  StationName  
0        None  
1        None  


In [14]:
def getDist(long1,lat1,long2,lat2):
    """Return the straight-line (Euclidean) distance between two points.

    The result is in the same units as the inputs. Arguments may be scalars or
    NumPy arrays; arrays are combined element-wise under NumPy broadcasting.
    """
    return np.sqrt((long1-long2)**2 + (lat1-lat2)**2)

def getClosestStation(row,dfStations):
    """Find the station nearest to one event.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Coords_Longitude' and 'Coords_Latitude' in degrees.
    dfStations : pd.DataFrame
        Must provide 'Name', 'Longitude' and 'Latitude' columns.

    Returns
    -------
    pd.Series
        [station latitude, station longitude, station name] of the nearest
        station. On ties the station that appears first in dfStations wins.

    Raises
    ------
    ValueError
        If dfStations has no rows, or no distance could be computed
        (every candidate distance is NaN).
    """
    stationLongs = np.asarray(dfStations['Longitude'], dtype=float)
    stationLats = np.asarray(dfStations['Latitude'], dtype=float)
    if stationLongs.size == 0:
        raise ValueError("dfStations is empty; cannot determine the closest station")

    long = row['Coords_Longitude']
    lat = row['Coords_Latitude']

    # Away from the equator one degree of longitude covers less ground than one
    # degree of latitude (by a factor of cos(latitude)). Shrinking the longitudes
    # by that factor makes the planar distance proportional to ground distance,
    # so the comparison between stations is not biased towards north/south ones.
    longScale = np.cos(np.radians(lat))

    # One vectorised call gives the distance to every station at once.
    dists = getDist(long * longScale, lat, stationLongs * longScale, stationLats)
    if np.isnan(dists).all():
        raise ValueError("no valid distance could be computed for this row")

    # nanargmin skips stations with missing coordinates and returns the first minimum.
    nearest = int(np.nanargmin(dists))
    return pd.Series([stationLats[nearest], stationLongs[nearest], dfStations['Name'].iloc[nearest]])

# Look up the closest station for every start/end event, then overwrite each event's
# coordinates with the station's and record the station name alongside.
nearestStations = dfx.apply(getClosestStation, axis=1, args=(dfStations,))
df_withStation = dfx.assign(StationName=None)
df_withStation[['Coords_Latitude','Coords_Longitude','StationName']] = nearestStations


## Save Start/End trips with associated stations to CSV

In [None]:
# Persist the station-tagged start/end events. to_csv does not create missing
# folders, so make sure the output directory exists before writing.
outputPath = Path('transformedData/mbike_NamedStations_StartEndOnly.csv')
outputPath.parent.mkdir(parents=True, exist_ok=True)
df_withStation.to_csv(outputPath)
df_withStation.head(2)

## Plotting

### Plot each station using Folium

In [18]:
# Centre the map on the mean station position, then draw a 50-unit-radius
# crimson circle at every station.
mapCenter = [dfStations.Latitude.mean(), dfStations.Longitude.mean()]
map_CP = folium.Map(location=mapCenter, zoom_start=15)

stationCoords = zip(dfStations.Latitude.values, dfStations.Longitude.values)
for stationLat, stationLong in stationCoords:
    stationMarker = folium.Circle(
        location=[stationLat, stationLong],
        radius=50,
        color='crimson',
        fill=True,
        fill_color='crimson',
    )
    stationMarker.add_to(map_CP)
map_CP