In [7]:
import os

# Interpreter the Spark Python workers are launched with. It must be set
# before the SparkContext is created.
# NOTE(review): this is an absolute, user-specific path; consider moving it
# to configuration so the notebook runs on other machines.
os.environ['PYSPARK_PYTHON'] = '/nfshome/lj1230/.conda/envs/myEnv/bin/python3.5'

from pyspark import SparkConf, SparkContext

# getOrCreate keeps this cell re-runnable: constructing SparkContext(...) a
# second time on a live kernel raises ValueError, because only one context
# may be active per process.
sc = SparkContext.getOrCreate(SparkConf().setMaster('local').setAppName('pyspark'))

from pyspark.sql.session import SparkSession
spark = SparkSession(sc)

# Data loading

In [16]:
def createIndex(shapefile):
    """Load a zone file and build an R-tree over the zones' bounding boxes.

    Parameters
    ----------
    shapefile : str
        Path handed to ``geopandas.read_file``.

    Returns
    -------
    tuple
        ``(index, zones)``: ``zones`` is the loaded frame reprojected to
        EPSG:2263, and ``index`` is an ``rtree.Rtree`` in which each entry id
        is the zone's position in ``zones.geometry`` and each entry box is
        that geometry's ``bounds``.
    """
    # Imported locally so the function carries its own dependencies when it
    # is called from inside a Spark task.
    import rtree
    import fiona.crs
    import geopandas as gpd

    loaded = gpd.read_file(shapefile)
    zones = loaded.to_crs(fiona.crs.from_epsg(2263))

    spatial_index = rtree.Rtree()
    position = 0
    for shape in zones.geometry:
        spatial_index.insert(position, shape.bounds)
        position += 1

    return (spatial_index, zones)

def findZone(p, index, zones):
    """Return the id of the first zone whose geometry contains point ``p``.

    Parameters
    ----------
    p : point-like
        Object exposing ``x`` and ``y``; it is also passed to each candidate
        geometry's ``contains`` method.
    index : spatial index
        Object whose ``intersection((minx, miny, maxx, maxy))`` yields
        candidate zone ids for the degenerate box at ``p``.
    zones : frame-like
        Object whose ``geometry[idx]`` gives the shape for a candidate id.

    Returns
    -------
    The first candidate id (in the order the index yields them) whose
    geometry contains ``p``, or ``None`` when no candidate does.
    """
    point_box = (p.x, p.y, p.x, p.y)
    # The index only pre-filters by bounding box; `contains` is the exact test.
    hits = (
        candidate
        for candidate in index.intersection(point_box)
        if zones.geometry[candidate].contains(p)
    )
    return next(hits, None)

def processTrips(pid, records):
    """Count trips per (drop-off borough, pick-up neighborhood) in one partition.

    Written to be passed to ``RDD.mapPartitionsWithIndex``.

    Parameters
    ----------
    pid : int
        Partition index. Partition 0 is assumed to begin with the CSV header
        line, which is discarded.
    records : iterator of str
        Raw CSV lines. A row is used only when it has exactly six fields and
        fields 2-5 are non-empty. Fields 3/2 are projected as the pick-up
        x/y and fields 5/4 as the drop-off x/y.
        NOTE(review): presumably longitude/latitude in degrees; confirm
        against the input file's schema.

    Returns
    -------
    dict items view
        ``(borough, {neighborhood: trip_count})`` pairs for this partition
        only; callers must merge the results of different partitions.

    Notes
    -----
    Rows that fail to parse, project, or match a zone are skipped silently.
    """
    import csv
    import pyproj
    import shapely.geometry as geom

    proj = pyproj.Proj(init="epsg:2263", preserve_units=True)
    index, zones = createIndex("neighborhoods.geojson")
    # Take the attribute columns from the frame createIndex already loaded.
    # This avoids reading the file a second time and avoids depending on a
    # module-level `gpd` name, which this function never imported itself.
    area = zones[["neighborhood", "borough"]].to_dict()

    if pid == 0:
        # Default of None: an empty first partition must not raise StopIteration.
        next(records, None)
    reader = csv.reader(records)

    counts = {}
    for row in reader:
        try:
            if not (len(row) == 6 and row[2] and row[3] and row[4] and row[5]):
                continue
            dropoff = geom.Point(proj(float(row[5]), float(row[4])))  # matched to a borough
            pickup = geom.Point(proj(float(row[3]), float(row[2])))   # matched to a neighborhood
            dropoff_zone = findZone(dropoff, index, zones)
            pickup_zone = findZone(pickup, index, zones)
            # Compare with None explicitly: zone id 0 is valid but falsy.
            if dropoff_zone is None or pickup_zone is None:
                continue
            borough = area["borough"][dropoff_zone]
            neighborhood = area["neighborhood"][pickup_zone]
            per_borough = counts.setdefault(borough, {})
            per_borough[neighborhood] = per_borough.get(neighborhood, 0) + 1
        except Exception:
            # Best effort: skip malformed rows, but let KeyboardInterrupt and
            # SystemExit propagate (a bare `except:` would swallow them).
            continue
    return counts.items()

if __name__ == "__main__":
    from collections import Counter

    # One (borough, {neighborhood: n}) pair per borough *per partition*.
    count = sc.textFile('yellow.csv.gz').mapPartitionsWithIndex(processTrips).collect()

    # Merge the partial counts on the driver. Without this, a borough that
    # appears in several partitions is printed several times, each with a
    # top three computed from partial counts only.
    merged = {}
    for (key, values) in count:
        merged.setdefault(key, Counter()).update(values)

    # Print the three most frequent pick-up neighborhoods per drop-off borough.
    for (key, values) in merged.items():
        for top in sorted(values.items(), key=lambda x: -x[1])[:3]:
            print(key, top[0], top[1])

Queens Upper East Side 5
Queens Long Island City 4
Queens East Village 3
Brooklyn Williamsburg 6
Brooklyn Midtown 2
Brooklyn East Village 2
Manhattan Upper East Side 50
Manhattan Chelsea 49
Manhattan Upper West Side 39
Bronx East Harlem 1
Bronx North Riverdale 1
Bronx Hell's Kitchen 1
