## Scenario/Stakeholder-Based Analysis of NYC Taxi Ride Data
##### Authors: Panini Mokrala, Dmitrii Danilov

In [8]:
!pip install PyShp
!pip install sodapy

import datetime as dt
import io
import os
import zipfile

import pandas as pd
import requests
import shapefile
from shapely.geometry import Polygon
from sodapy import Socrata

Collecting sodapy
  Downloading https://files.pythonhosted.org/packages/9e/74/95fb7d45bbe7f1de43caac45d7dd4807ef1e15881564a00eef489a3bb5c6/sodapy-2.1.0-py2.py3-none-any.whl
Installing collected packages: sodapy
Successfully installed sodapy-2.1.0


In [10]:
def _shape_to_wkt(shp):
    """Return the WKT string for one pyshp shape, honouring multi-part geometry."""
    # Imported here so the helper does not depend on a change to the import cell.
    from shapely.geometry import shape

    if len(shp.parts) > 1:
        # `points` is a flat list covering every ring of the shape; feeding it to
        # Polygon() would stitch separate rings into one bogus outline. The
        # GeoJSON-style interface keeps the rings apart.
        return shape(shp.__geo_interface__).wkt
    # Single-ring shapes are built exactly as before.
    return Polygon(shp.points).wkt


def import_taxi_zones(url='https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip', timeout=60):
    """Download the zipped taxi-zone shapefile and return it as a DataFrame.

    Args:
        url: Location of the zip archive. It must contain ``taxi_zones.shp``,
            ``taxi_zones.shx`` and ``taxi_zones.dbf``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        pandas.DataFrame with one row per shapefile record. Columns are the
        lower-cased attribute field names plus ``coords``, which holds the
        WKT string of the row's geometry.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a BadZipFile on an error page.
    response.raise_for_status()

    # Read everything while the archive and its members are still open, and
    # close the member handles deterministically afterwards.
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as myzip, \
            myzip.open('taxi_zones.shp') as shp, \
            myzip.open('taxi_zones.shx') as shx, \
            myzip.open('taxi_zones.dbf') as dbf:
        sf = shapefile.Reader(shp=shp, shx=shx, dbf=dbf)
        # Skip pyshp's leading DeletionFlag pseudo-field; it has no
        # counterpart in the record values.
        fields = [x[0].lower() for x in sf.fields][1:]
        records = sf.records()
        shapes = sf.shapes()

    shps = [_shape_to_wkt(s) for s in shapes]
    df = pd.DataFrame(columns=fields, data=records)
    df = df.assign(coords=shps)

    return df

In [11]:
# Build the taxi-zone table (downloads the shapefile) and preview its first rows.
taxi_zones = import_taxi_zones()
taxi_zones.head()

Unnamed: 0,objectid,shape_leng,shape_area,zone,locationid,borough,coords
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.9183527103 192536.0856972019,..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"POLYGON ((1033269.243591294 172126.0078125, 10..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.769506663 256767.6975403726,..."
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,"POLYGON ((992073.4667968601 203714.0759887695,..."
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,"POLYGON ((935843.3104932606 144283.335850656, ..."


In [41]:
# The Socrata app token is a credential, so it is read from the SOCRATA_APP_TOKEN
# environment variable rather than being committed with the notebook. If the
# variable is unset, None is passed and sodapy makes unauthenticated requests.
# NOTE(review): the token that used to be hard-coded here should be treated as
# leaked and rotated.
client = Socrata("data.cityofnewyork.us", os.environ.get('SOCRATA_APP_TOKEN'), timeout=500)

# Count trips per pickup month on the server; only the small aggregate is
# downloaded. The call is timed because it is slow.
start_time = dt.datetime.now()
results = client.get("t29m-gskq", query="select date_trunc_ym(tpep_pickup_datetime) as month, count(*) group by month")
end_time = dt.datetime.now()
# total_seconds() covers the whole interval; `.seconds` silently drops full days.
print(f'Duration: {int((end_time - start_time).total_seconds())} sec')

results_df = pd.DataFrame.from_records(results)
print(results_df)

Duration: 69 sec
                      month     count
0   2001-01-01T00:00:00.000        14
1   2002-12-01T00:00:00.000        28
2   2003-01-01T00:00:00.000        17
3   2003-12-01T00:00:00.000         1
4   2008-12-01T00:00:00.000       406
5   2009-01-01T00:00:00.000       585
6   2017-01-01T00:00:00.000         2
7   2017-09-01T00:00:00.000         1
8   2017-12-01T00:00:00.000       224
9   2018-01-01T00:00:00.000   8760090
10  2018-02-01T00:00:00.000   8493469
11  2018-03-01T00:00:00.000  18858487
12  2018-04-01T00:00:00.000   9305358
13  2018-05-01T00:00:00.000   9224100
14  2018-06-01T00:00:00.000   8713711
15  2018-07-01T00:00:00.000   7849588
16  2018-08-01T00:00:00.000   7849042
17  2018-09-01T00:00:00.000   8039936
18  2018-10-01T00:00:00.000   8821141
19  2018-11-01T00:00:00.000   8145740
20  2018-12-01T00:00:00.000   8172459
21  2019-01-01T00:00:00.000        83
22  2019-02-01T00:00:00.000        31
23  2019-03-01T00:00:00.000        20
24  2019-04-01T00:00:00.000      