# 2. Bike stations and usage data

In this notebook, preprocessing and the exploration of the bike stations and bike check-ins data is done. Data from Washington D.C. downloaded from https://www.capitalbikeshare.com/system-data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.appName('BigData').getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [3]:
stations = spark.read.option("header", True).csv("hdfs://localhost:9000/Data/D.C/New/stations.csv")

In [4]:
stations.limit(10)

id,name,terminalName,lastCommWithServer,lat,long,installed,locked,installDate,removalDate,temporary,public,nbBikes,nbEmptyDocks,latestUpdateTime
1,Eads St & 15th St S,31000,1590835639505,38.858971,-77.05323,True,False,0,,False,True,12,2,1590827288583
2,18th St & S Eads St,31001,1590835561779,38.85725,-77.05332,True,False,0,,False,True,2,9,1590826086864
3,Crystal Dr & 20th...,31002,1590835597498,38.856425,-77.049232,True,False,0,,False,True,13,4,1590826087288
4,Crystal Dr & 15th...,31003,1590835631929,38.86017,-77.049593,True,False,0,,False,True,10,0,1590801415972
5,Aurora Hills Cmty...,31004,1590835590411,38.857866,-77.05949,True,False,0,,False,True,0,11,1590825786529
6,Pentagon City Met...,31005,1590835594427,38.862303,-77.059936,True,False,0,,False,True,19,0,1590828803946
7,Army Navy Dr & S ...,31006,1590835550630,38.8637,-77.0633,True,False,0,,False,True,9,6,1590826087311
8,Crystal City Metr...,31007,1590835577737,38.85740496261504,-77.05113172531128,True,False,0,,False,True,18,1,1590829423555
10,Crystal Dr & 27th...,31009,1590835553998,38.848466,-77.051514,True,False,0,,False,True,4,6,1590826386239
11,S Glebe Rd & Poto...,31010,1590835639782,38.8426,-77.0502,True,False,0,,False,True,5,6,1590825786515


In [5]:
stations = stations.select('id', 'name', 'terminalName', 'lat', 'long', 'nbBikes', 'nbEmptyDocks')

In [6]:
stations.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- terminalName: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- long: string (nullable = true)
 |-- nbBikes: string (nullable = true)
 |-- nbEmptyDocks: string (nullable = true)



In [7]:
stations = stations.withColumn('lat', stations.lat.cast(DoubleType()))
stations = stations.withColumn('long', stations.long.cast(DoubleType()))
stations = stations.withColumn('nbBikes', stations.nbBikes.cast(IntegerType()))
stations = stations.withColumn('nbEmptyDocks', stations.nbEmptyDocks.cast(IntegerType()))

In [8]:
from ipyleaflet import Map, Marker, MarkerCluster

station_rows = stations.collect()
markers = [Marker(location=(x.lat, x.long)) for x in station_rows]

m = Map(center=(38.900497, -77.007507), zoom=10)

marker_cluster = MarkerCluster(
    markers=(markers)
)

m.add_layer(marker_cluster);

m

Map(center=[38.900497, -77.007507], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title'…

In [9]:
#COLLECT BIKE USAGE DATA FOR ALL MONTHS
bike_data_raw = spark.read.option("header", True).csv("hdfs://localhost:9000/Data/D.C/New/bikedata/*-capitalbikeshare-tripdata/*-capitalbikeshare-tripdata.csv")
bike_data = bike_data_raw

In [11]:
bike_data.limit(10)

Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
357,2019-08-01 00:00:34,2019-08-01 00:06:31,31117,15th & Euclid St NW,31115,Columbia Rd & Geo...,W21052,Member
1100,2019-08-01 00:01:48,2019-08-01 00:20:09,31407,14th St & Colorad...,31115,Columbia Rd & Geo...,W01101,Member
1406,2019-08-01 00:04:04,2019-08-01 00:27:31,31407,14th St & Colorad...,31234,20th & O St NW / ...,W20870,Casual
3667,2019-08-01 00:05:09,2019-08-01 01:06:16,31907,Franklin St & S W...,31907,Franklin St & S W...,W00966,Member
282,2019-08-01 00:05:13,2019-08-01 00:09:56,31201,15th & P St NW,31234,20th & O St NW / ...,W20443,Member
3596,2019-08-01 00:05:14,2019-08-01 01:05:10,31907,Franklin St & S W...,31907,Franklin St & S W...,W21108,Member
1095,2019-08-01 00:05:41,2019-08-01 00:23:57,31246,M St & Pennsylvan...,31295,Potomac & M St NW,W01369,Casual
527,2019-08-01 00:05:59,2019-08-01 00:14:47,31272,Washington & Inde...,31269,3rd St & Pennsylv...,W20165,Casual
619,2019-08-01 00:06:14,2019-08-01 00:16:33,31218,L'Enfant Plaza / ...,31650,1st & M St SE,W01248,Member
1000,2019-08-01 00:06:40,2019-08-01 00:23:21,31246,M St & Pennsylvan...,31295,Potomac & M St NW,W00222,Member


In [12]:
bike_data.printSchema()

root
 |-- Duration: string (nullable = true)
 |-- Start date: string (nullable = true)
 |-- End date: string (nullable = true)
 |-- Start station number: string (nullable = true)
 |-- Start station: string (nullable = true)
 |-- End station number: string (nullable = true)
 |-- End station: string (nullable = true)
 |-- Bike number: string (nullable = true)
 |-- Member type: string (nullable = true)



In [16]:
from pyspark.sql.functions import col, size

sample = bike_data_raw.where((bike_data_raw['Start station number'] =='31114') & (bike_data_raw['End station number'] =='31282') & (bike_data_raw['Duration'] == '299'))
sample = sample.withColumn('start_date', F.to_timestamp(lit(sample['Start date'], "yyyy-MM-dd HH:mm:ss"))
sample = sample.withColumn('end_date', F.to_timestamp(sample['End date'], "yyyy-MM-dd HH:mm:ss"))

SyntaxError: invalid syntax (<ipython-input-16-5b9784f11700>, line 5)

In [17]:
sample = bike_data_raw.where(bike_data_raw['Start date'].rlike('2019-03-31 02'))
sample.count()

89

In [49]:
bad_dates = sample.select('Start date')

In [50]:
bad_dates.show()

+-------------------+
|         Start date|
+-------------------+
|2019-03-31 02:00:49|
|2019-03-31 02:01:21|
|2019-03-31 02:02:01|
|2019-03-31 02:02:51|
|2019-03-31 02:03:01|
|2019-03-31 02:03:38|
|2019-03-31 02:04:11|
|2019-03-31 02:04:14|
|2019-03-31 02:04:52|
|2019-03-31 02:05:39|
|2019-03-31 02:05:50|
|2019-03-31 02:06:43|
|2019-03-31 02:07:07|
|2019-03-31 02:07:20|
|2019-03-31 02:07:28|
|2019-03-31 02:08:43|
|2019-03-31 02:09:28|
|2019-03-31 02:09:58|
|2019-03-31 02:10:10|
|2019-03-31 02:10:26|
+-------------------+
only showing top 20 rows



In [53]:
sample = sample.withColumn('start_date', F.to_timestamp(F.lit(sample['Start date']), "yyyy-MM-dd HH:mm:ss"))
sample = sample.withColumn('end_date', F.to_timestamp(F.lit(sample['End date']), "yyyy-MM-dd HH:mm:ss"))
sample

Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type,start_date,end_date
492,2019-03-31 02:00:49,2019-03-31 02:09:01,31614,11th & H St NE,31630,15th & East Capit...,W01424,Member,,
861,2019-03-31 02:01:21,2019-03-31 02:15:43,31202,14th & R St NW,31103,16th & Harvard St NW,W01301,Member,,
338,2019-03-31 02:02:01,2019-03-31 02:07:40,31104,Adams Mill & Colu...,31105,14th & Harvard St NW,W20793,Member,,
1639,2019-03-31 02:02:51,2019-03-31 02:30:10,31508,Gallaudet / 8th S...,31119,14th & Belmont St NW,78240,Member,,
267,2019-03-31 02:03:01,2019-03-31 02:07:29,31102,11th & Kenyon St NW,31120,10th & Florida Av...,75651,Member,,
763,2019-03-31 02:03:38,2019-03-31 02:16:22,31281,8th & O St NW,31611,13th & H St NE,W23790,Member,,
481,2019-03-31 02:04:11,2019-03-31 02:12:13,31114,18th St & Wyoming...,31202,14th & R St NW,W01194,Member,,
260,2019-03-31 02:04:14,2019-03-31 02:08:34,31020,Wilson Blvd & N F...,31029,N Veitch St & 20t...,W00996,Member,,
1131,2019-03-31 02:04:52,2019-03-31 02:23:44,31114,18th St & Wyoming...,31265,5th St & Massachu...,W00504,Member,,
1338,2019-03-31 02:05:39,2019-03-31 02:27:57,31225,C & O Canal & Wis...,31225,C & O Canal & Wis...,W21930,Casual,,


In [30]:
sample

Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type,start_date,end_date
299,2019-03-31 02:09:28,2019-03-31 02:14:28,31114,18th St & Wyoming...,31282,16th & R St NW,W22313,Member,,


In [36]:
bike_data = bike_data.withColumn('duration', bike_data['Duration'].cast(IntegerType()))
bike_data = bike_data.withColumn('start_date', F.to_timestamp(bike_data['Start date'], "yyyy-MM-dd HH:mm:ss"))
bike_data = bike_data.withColumn('end_date', F.to_timestamp(bike_data['End date'], "yyyy-MM-dd HH:mm:ss"))
bike_data = bike_data.withColumn('start_station_id', bike_data['Start station number'])
bike_data = bike_data.withColumn('start_station_name', bike_data['Start station'])
bike_data = bike_data.withColumn('end_station_id', bike_data['End station number'])
bike_data = bike_data.withColumn('end_station_name', bike_data['End station'])
bike_data = bike_data.withColumn('bike_number', bike_data['Bike number'])
bike_data = bike_data.withColumn('member_type', bike_data['Member type'])

bike_data = bike_data.select('duration', 'start_date', 'end_date', 'start_station_id', 'start_station_name', 'end_station_id', 'end_station_name', 'bike_number', 'member_type')

Next thing we want to do is to extract all stations that are mentioned in bike usage data start and end locations.

In [37]:
#HELP function
#here is the mistake

sorted_bikes = bike_data.sort('start_date')
sorted_bikes.show(100)

+--------+-------------------+-------------------+----------------+--------------------+--------------+--------------------+-----------+-----------+
|duration|         start_date|           end_date|start_station_id|  start_station_name|end_station_id|    end_station_name|bike_number|member_type|
+--------+-------------------+-------------------+----------------+--------------------+--------------+--------------------+-----------+-----------+
|     677|               null|               null|           31202|      14th & R St NW|         31519|       1st & O St NW|     W00390|     Member|
|    1830|               null|               null|           31401|14th St & Spring ...|         31408|        Takoma Metro|     W20879|     Member|
|     578|               null|               null|           31045|Commerce St & Fay...|         31042|Market Square / K...|     W23700|     Member|
|     925|               null|2019-03-31 03:04:50|           31125|      15th & W St NW|         31280|   

In [13]:
start_stations = bike_data.select('start_station_id').distinct()
start_stations = start_stations.withColumn('station_id', start_stations['start_station_id']).select('station_id')

In [14]:
end_stations = bike_data.select('end_station_id').distinct()
end_stations = end_stations.withColumn('station_id', end_stations['end_station_id']).select('station_id')

In [15]:
end_stations.count()

581

We have one different station, so we create a union and and collect all station information with relevant ids.

In [16]:
#all mentioned stations difference 1 station
start_stations.subtract(end_stations)

station_id
31718


In [17]:
all_used_stations = start_stations.union(end_stations).distinct()
used_stations = all_used_stations.join(stations, all_used_stations.station_id == stations.terminalName)

We can see that we have a different number when joining used data with the originally listed ones. The difference is in 4 stations.

In [18]:
print(all_used_stations.count())
print(used_stations.count())

582
578


In [19]:
all_used_stations.select('station_id').subtract(stations.select('terminalName'))

station_id
31086
31008
32031
0


In [20]:
undefined = bike_data.where(bike_data['start_station_id'].isin("31086", "00000", "31008", "32031")).select('start_station_id', 'start_station_name')

In [21]:
undefined.where(undefined['start_station_id'].like("00000")).count()

498

In [22]:
undefined_distinct = undefined.select('start_station_name', 'start_station_id').distinct()

In [23]:
undefined_list = undefined_distinct.collect()

In [24]:
undefined_list[1]['start_station_name']

'22nd & H St NW'

In [25]:
stations.where(stations['name'].like('22nd & H St NW'))

id,name,terminalName,lat,long,nbBikes,nbEmptyDocks
609,22nd & H St NW,31127,38.89892546716842,-77.04885238395946,0,17


In [26]:
undefined_end = bike_data.where(bike_data['end_station_id'].isin("31086", "00000", "31008", "32031")).select('end_station_id', 'end_station_name')

In [27]:
undefined_end.where(undefined_end['end_station_id'].like("00000")).count()

487

In [28]:
stations = stations.select('id', 'name', 'terminalName', 'lat', 'long', 'nbBikes', 'nbEmptyDocks')

In [29]:
stations.count()

596

In [30]:
station_columns = ['id', 'name', 'terminalName', 'lat', 'long', 'nbBikes', 'nbEmptyDocks']
new_station = spark.createDataFrame([(597, 'Mount Vernon Ave & E Del Ray Ave', '31086', 38.826213, -77.058640, 5, 10)], station_columns)
new_station

id,name,terminalName,lat,long,nbBikes,nbEmptyDocks
597,Mount Vernon Ave ...,31086,38.826213,-77.05864,5,10


In [31]:
stations = stations.union(new_station)

In [32]:
stations.where(stations['terminalName'].like('31086'))

id,name,terminalName,lat,long,nbBikes,nbEmptyDocks
597,Mount Vernon Ave ...,31086,38.826213,-77.05864,5,10


In [33]:
import pyspark.sql.functions as F

bike_data = bike_data.withColumn('start_station_id', F.when(bike_data['start_station_id'].like('00000'), '31127').otherwise(bike_data['start_station_id']))
bike_data = bike_data.withColumn('end_station_id', F.when(bike_data['end_station_id'].like('00000'), '31127').otherwise(bike_data['end_station_id']))

In [34]:
bike_data.count()

2681728

In [35]:
bike_data = bike_data.withColumn('start_station_id', F.when(bike_data['start_station_id'].isin('31008', '32031'), None).otherwise(bike_data['start_station_id']))
bike_data = bike_data.withColumn('end_station_id', F.when(bike_data['end_station_id'].isin('31008', '32031'), None).otherwise(bike_data['end_station_id']))
bike_data = bike_data.dropna(subset=['start_station_id', 'end_station_id'])

In [36]:
bike_data.count()

2681369

In [37]:
all_used_stations = bike_data.select('start_station_id').distinct()
used_stations = all_used_stations.join(stations, all_used_stations.start_station_id == stations.terminalName)

In [38]:
from ipyleaflet import Map, Marker, MarkerCluster

used_stations = used_stations.collect()
used_markers = [Marker(location=(x.lat, x.long)) for x in used_stations]

m = Map(center=(38.900497, -77.007507), zoom=10)

marker_cluster = MarkerCluster(
    markers=(used_markers)
)

m.add_layer(marker_cluster);

m

Map(center=[38.900497, -77.007507], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title'…

Now, let's calculate distance between stations.

In [39]:
stations.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- terminalName: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- nbBikes: long (nullable = true)
 |-- nbEmptyDocks: long (nullable = true)



In [40]:
from pyspark.sql.types import *
from pyspark.sql import functions as F
from math import radians, cos, sin, asin, sqrt

#Haversine distance
def get_distance(longit_a, latit_a, longit_b, latit_b):
    # Transform to radians
    longit_a, latit_a, longit_b, latit_b = map(radians, [longit_a,  latit_a, longit_b, latit_b])
    dist_longit = longit_b - longit_a
    dist_latit = latit_b - latit_a
    # Calculate area
    area = sin(dist_latit/2)**2 + cos(latit_a) * cos(latit_b) * sin(dist_longit/2)**2
    # Calculate the central angle
    central_angle = 2 * asin(sqrt(area))
    radius = 6371
    # Calculate Distance
    distance = central_angle * radius * 1000
    return abs(round(distance))

In [41]:
#calculate distance between stations - first 2 for now
udf_get_distance = F.udf(get_distance)
stations_partial = stations.select('id', 'lat', 'long')

In [42]:
station_pairs = (stations_partial.crossJoin(stations_partial).toDF(
"id_A","lat_A", "long_A", "id_B",  
 "lat_B", "long_B"))

In [43]:
station_pairs = (station_pairs.filter(
station_pairs.id_A != station_pairs.id_B))

In [44]:
pairs_distance = station_pairs.withColumn("abs_distance", udf_get_distance(
station_pairs.long_A, station_pairs.lat_A,
station_pairs.long_B, station_pairs.lat_B))

In [45]:
summary = pairs_distance.select(F.max("abs_distance"))

In [46]:
pairs_distance

id_A,lat_A,long_A,id_B,lat_B,long_B,abs_distance
1,38.858971,-77.05323,2,38.85725,-77.05332,192
1,38.858971,-77.05323,3,38.856425,-77.049232,447
1,38.858971,-77.05323,4,38.86017,-77.049593,342
1,38.858971,-77.05323,5,38.857866,-77.05949,556
1,38.858971,-77.05323,6,38.862303,-77.059936,689
1,38.858971,-77.05323,7,38.8637,-77.0633,1018
1,38.858971,-77.05323,8,38.85740496261504,-77.05113172531128,252
1,38.858971,-77.05323,10,38.848466,-77.051514,1178
1,38.858971,-77.05323,11,38.8426,-77.0502,1839
1,38.858971,-77.05323,12,38.8533,-77.0498,697


In [47]:
bike_data.printSchema()

root
 |-- duration: integer (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- start_station_id: string (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- end_station_id: string (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- bike_number: string (nullable = true)
 |-- member_type: string (nullable = true)



In [48]:
bike_data.groupBy('member_type').count()

member_type,count
Member,2346033
Casual,335336


In [49]:
bike_data

duration,start_date,end_date,start_station_id,start_station_name,end_station_id,end_station_name,bike_number,member_type
207,2019-09-01 00:00:27,2019-09-01 00:03:54,31246,M St & Pennsylvan...,31128,23rd & M St NW,W21128,Member
726,2019-09-01 00:00:46,2019-09-01 00:12:52,31124,14th & Irving St NW,31116,California St & F...,W01208,Member
4401,2019-09-01 00:00:48,2019-09-01 01:14:10,31129,15th St & Pennsyl...,31277,17th & G St NW,W23690,Casual
388,2019-09-01 00:01:01,2019-09-01 00:07:30,31648,Potomac Ave & Hal...,31609,Maine Ave & 7th S...,W20449,Member
537,2019-09-01 00:01:09,2019-09-01 00:10:07,31200,Massachusetts Ave...,31229,New Hampshire Ave...,W20206,Member
89,2019-09-01 00:01:39,2019-09-01 00:03:08,31232,7th & F St NW / N...,31620,5th & F St NW,W00942,Member
770,2019-09-01 00:02:05,2019-09-01 00:14:56,31129,15th St & Pennsyl...,31235,19th St & Constit...,W23948,Casual
422,2019-09-01 00:02:14,2019-09-01 00:09:16,31237,25th St & Pennsyl...,31293,31st & Water St NW,W00183,Casual
774,2019-09-01 00:02:14,2019-09-01 00:15:08,31129,15th St & Pennsyl...,31235,19th St & Constit...,W01337,Casual
405,2019-09-01 00:03:02,2019-09-01 00:09:47,31237,25th St & Pennsyl...,31289,Henry Bacon Dr & ...,W21345,Member


In [50]:
bike_data.groupBy('bike_number').count()

bike_number,count
W00474,745
W00826,529
W00254,555
W20296,532
W20676,678
W23518,456
W22623,552
W22547,731
W21144,594
W22173,799


In [51]:
bikes_partial = bike_data.select('start_station_id', 'end_station_id')
stations_partial = stations.select('terminalName','lat', 'long')
bikes_joined = bikes_partial.join(stations_partial.withColumnRenamed('lat','lat_start'), bike_data.start_station_id == stations.terminalName)

In [52]:
bikes_joined

start_station_id,end_station_id,terminalName,lat_start,long
31039,31092,31039,38.880012,-77.107854
31039,31038,31039,38.880012,-77.107854
31039,31089,31039,38.880012,-77.107854
31039,31215,31039,38.880012,-77.107854
31039,31920,31039,38.880012,-77.107854
31039,31901,31039,38.880012,-77.107854
31039,31089,31039,38.880012,-77.107854
31039,31075,31039,38.880012,-77.107854
31039,31003,31039,38.880012,-77.107854
31039,31905,31039,38.880012,-77.107854


In [53]:
bike_data.count()


2681369

In [54]:
stations.count()**2

356409

In [None]:
pairs_distance = station_pairs.withColumn("abs_distance", udf_get_distance(
station_pairs.long_A, station_pairs.lat_A,
station_pairs.long_B, station_pairs.lat_B))

In [57]:
 bikes = bike_data
 bikes_joined = bikes.join(stations_partial.withColumnRenamed('lat','lat_start').withColumnRenamed('long','long_start'), bikes.start_station_id == stations.terminalName)
 bikes_joined = bikes_joined.join(stations_partial.withColumnRenamed('lat','lat_end').withColumnRenamed('long','long_end'), bikes.end_station_id == stations.terminalName)

In [58]:
bikes_joined

duration,start_date,end_date,start_station_id,start_station_name,end_station_id,end_station_name,bike_number,member_type,terminalName,lat_start,long_start,terminalName.1,lat_end,long_end
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31000,38.858971,-77.05323
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31001,38.85725,-77.05332
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31002,38.856425,-77.049232
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31003,38.86017,-77.049593
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31004,38.857866,-77.05949
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31005,38.862303,-77.059936
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31006,38.8637,-77.0633
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31007,38.85740496261504,-77.05113172531128
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31009,38.848466,-77.051514
1315,2019-09-03 18:50:05,2019-09-03 19:12:00,31109,7th & T St NW,31109,7th & T St NW,W21554,Member,31109,38.9155,-77.0222,31010,38.8426,-77.0502


In [None]:
bikes = bikes_joined.withColumn("abs_distance", udf_get_distance(
bikes_joined.long_start, station_pairs.lat_start,
station_pairs.long_end, station_pairs.lat_end))