In [1]:
import pandas as pd
import numpy as np
from sodapy import Socrata

# Socrata specific 
socrata_domain = "data.cityofnewyork.us"
socrata_token = "F52GVJzdJf2mpjl7n17oXvzqF" # should be an environment variable
dataset_id = "fhrw-4uyv"

# set up socrate client endpoint
client = Socrata(socrata_domain, socrata_token)

# Data retrieval and cleaning

In [2]:
# get the total number of complaints made in 2017
num_complaints_2017 = client.get(dataset_id,
                    select="count(*)",
                    where="created_date between '2017-01-01T00:00:00.001' and '2017-12-31T23:59:59'")[0]['count']

print("Number of complaints in 2017: {}".format(num_complaints_2017))

Number of complaints in 2017: 2445387


In [3]:
# get only the necessary fields and the exact amount of complaints made in 2017
# starting from the 1st second into 2017 because there are about 50 complaints at time 00:00:00 and must be bogus
results = client.get(dataset_id,
                    select="unique_key, created_date, complaint_type, descriptor, incident_zip, borough, city",
                    where="created_date between '2017-01-01T00:00:00.001' and '2017-12-31T23:59:59'",
                    order="created_date ASC",
                    limit=num_complaints_2017)

# Convert to pandas DataFrame
complaints_df = pd.DataFrame.from_records(results)

In [4]:
# simple cleaning of data
# should be calculated if possible

# remove all rows with 'Unspecified' boroughs 
complaints_df = complaints_df.query("borough != 'Unspecified'")

# remove rows with invalid zip codes
complaints_df = complaints_df[complaints_df['incident_zip'].str.len() == 5]

# Top 10 complaint types by borough

In [5]:
# get the 10 most common overall complaint types
top_10_complaint_types = complaints_df.complaint_type.value_counts()[:10]

print("Number of unique complaint types in all boroughs:\n{}\n".format(complaints_df.complaint_type.nunique()))
print("Top 10 complaint types overall in all boroughs:\n{}".format(top_10_complaint_types))

Number of unique complaint types in all boroughs:
200

Top 10 complaint types overall in all boroughs:
Noise - Residential        229458
HEAT/HOT WATER             213477
Illegal Parking            144827
Blocked Driveway           135056
Street Condition            90341
UNSANITARY CONDITION        79260
Noise - Street/Sidewalk     72867
Water System                63691
Noise                       59466
PAINT/PLASTER               57069
Name: complaint_type, dtype: int64


In [6]:
# create pivot table using the boroughs as the index and a count of each unique complaint type
complaints_by_borough = complaints_df.pivot_table(index='borough', columns='complaint_type', aggfunc=len)

# get a cross section of a column without a mutlilevel index (shouldn't matter which column is chosen)
complaints_by_borough = complaints_by_borough.xs('city', axis=1, drop_level=True)

In [7]:
# print just the top 10 complaint types 
complaints_by_borough[top_10_complaint_types.index]

complaint_type,Noise - Residential,HEAT/HOT WATER,Illegal Parking,Blocked Driveway,Street Condition,UNSANITARY CONDITION,Noise - Street/Sidewalk,Water System,Noise,PAINT/PLASTER
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
BRONX,57663.0,68713.0,16122.0,24574.0,11155.0,24557.0,14025.0,9965.0,3056.0,19493.0
BROOKLYN,67629.0,66977.0,55380.0,49301.0,24875.0,26654.0,21313.0,19508.0,15227.0,19398.0
MANHATTAN,51026.0,46528.0,19686.0,3428.0,14496.0,14631.0,29146.0,10666.0,28754.0,11486.0
QUEENS,46396.0,29186.0,46065.0,54288.0,29495.0,11467.0,7530.0,18279.0,10523.0,5682.0
STATEN ISLAND,6744.0,2073.0,7574.0,3465.0,10320.0,1951.0,853.0,5273.0,1906.0,1010.0


In [8]:
# Verify previous pivot table results
# get the number of 10 most common overall complaint types PER BOROUGH
"""
# I only ran this to verify the output of the pivot table but it takes a while so...

for borough in complaints_df.borough.unique():
    for complaint in top_10_complaint_types.index:
        count = len(complaints_df[(complaints_df['borough'] == borough) & (complaints_df['complaint_type'] == complaint)])
        print("Borough: {}\tType: {}\tCount: {}".format(borough, complaint, count))
    print("")
"""

'\n# I only ran this to verify the output of the pivot table but it takes a while so...\n\nfor borough in complaints_df.borough.unique():\n    for complaint in top_10_complaint_types.index:\n        count = len(complaints_df[(complaints_df[\'borough\'] == borough) & (complaints_df[\'complaint_type\'] == complaint)])\n        print("Borough: {}\tType: {}\tCount: {}".format(borough, complaint, count))\n    print("")\n'

# Top 10 complaint types for top 10 most populous zipcodes

In [9]:
# Get 2010 census population by zipcode 
zipcode_df = pd.read_csv("https://s3.amazonaws.com/SplitwiseBlogJB/2010+Census+Population+By+Zipcode+(ZCTA).csv")

# renaming the columns for easier access
zipcode_df.rename(columns = {'Zip Code ZCTA':'ZCTA', '2010 Census Population':'Population'}, inplace = True)

# Filter the zipcode dataframe to include only NYC zipcodes (the ones in our 311 Service Request dataframe)
zipcode_df = zipcode_df[zipcode_df['ZCTA'].isin(complaints_df.incident_zip.unique())]

In [10]:
# sort the zipcode dataframe by population in descending order
zipcode_df.sort_values(by='Population', ascending=False, inplace=True)

# print top 10 most populous zip codes
zipcode_df[:10]

Unnamed: 0,ZCTA,Population
2748,11368,109931
2720,11226,101572
2753,11373,100820
2714,11220,99598
2759,11385,98592
2527,10467,97060
2451,10025,94600
2702,11208,94469
2729,11236,93877
2701,11207,93386


In [11]:
# top 10 most populous zip codes (converted to string for indexing)
top_10_most_populous_zips = [str(x) for x in zipcode_df['ZCTA'][:10].tolist()]

In [12]:
# create pivot table using the incident_zip as the index and a count of each unique complaint type
complaints_by_zip = complaints_df.pivot_table(index='incident_zip', columns='complaint_type', aggfunc=len)

# get a cross section of a column without a mutlilevel index (shouldn't matter which column is chosen)
complaints_by_zip = complaints_by_zip.xs('borough', axis=1, drop_level=True)

In [13]:
# print the top 10 complaints in the top 10 most populous zip codes
complaints_by_zip.loc[top_10_most_populous_zips][top_10_complaint_types.index]

complaint_type,Noise - Residential,HEAT/HOT WATER,Illegal Parking,Blocked Driveway,Street Condition,UNSANITARY CONDITION,Noise - Street/Sidewalk,Water System,Noise,PAINT/PLASTER
incident_zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11368,2460.0,1620.0,1250.0,4380.0,561.0,639.0,684.0,617.0,158.0,361.0
11226,4852.0,7569.0,1073.0,2200.0,491.0,3155.0,1831.0,406.0,440.0,2639.0
11373,1841.0,3408.0,1270.0,2623.0,691.0,756.0,304.0,372.0,270.0,265.0
11220,1522.0,1634.0,2012.0,1558.0,690.0,719.0,498.0,674.0,442.0,506.0
11385,2609.0,1526.0,4129.0,3039.0,1232.0,647.0,607.0,1240.0,507.0,352.0
10467,5803.0,6041.0,982.0,2067.0,612.0,2192.0,712.0,560.0,285.0,1955.0
10025,2085.0,2397.0,735.0,125.0,628.0,714.0,1224.0,481.0,1398.0,604.0
11208,2792.0,2052.0,2148.0,2751.0,816.0,1341.0,825.0,710.0,188.0,800.0
11236,1929.0,1145.0,1430.0,3038.0,1021.0,562.0,201.0,623.0,108.0,347.0
11207,3060.0,2461.0,1495.0,2061.0,1142.0,1621.0,558.0,757.0,199.0,1055.0


# Which boroughs are the biggest "complainers" relative to the size of the population in 2017?
# Calculate a complaint-index that adjusts for population of the borough.

In [14]:
# data structures to hold calculated data
population_by_borough = []
num_complaints_by_borough = []

# number of complaints by zip code
num_complaints_by_zip = complaints_df['incident_zip'].value_counts().reset_index()
num_complaints_by_zip.rename(columns = {'index':'ZCTA', 'incident_zip':'num_complaints'}, inplace = True)

# iterate over boroughs
for borough in complaints_df.borough.unique():
    # get zips for borough
    zips_of_borough = complaints_df.query('borough == @borough')['incident_zip']
    
    # sum population of each zipcode in the borough
    population = np.sum(zipcode_df[zipcode_df['ZCTA'].isin(zips_of_borough)]['Population'])
    population_by_borough.append( (borough, population) )
    
    # sum number of complaints by zipcode in the borough
    num_complaints = np.sum(num_complaints_by_zip[num_complaints_by_zip['ZCTA'].isin(zips_of_borough)]['num_complaints'])
    num_complaints_by_borough.append( (borough, num_complaints) )

In [15]:
# the population of each borough
print("Population of each borough:\n{}".format(population_by_borough))
print("Number of complaints for each borough:\n{}".format(num_complaints_by_borough))

Population of each borough:
[('BRONX', 1592084), ('QUEENS', 2755424), ('BROOKLYN', 2732303), ('MANHATTAN', 2073659), ('STATEN ISLAND', 468730)]
Number of complaints for each borough:
[('BRONX', 467765), ('QUEENS', 685309), ('BROOKLYN', 820071), ('MANHATTAN', 606223), ('STATEN ISLAND', 125497)]


In [21]:
print("There is a discrepancy in the total number of complaints and the total number of complaints by borough.")
print("Im not sure why it is but maybe because some zipcodes are in 2 boroughs and are therefore counted twice\n")

print("Total Number of complaints: {}".format(sum(num_complaints_by_zip.num_complaints)))
print("Total Number of complaints after separating by borough: {}".format(sum(p[1] for p in num_complaints_by_borough)))

There is a discrepancy in the total number of complaints and the total number of complaints by borough.
Im not sure why it is but maybe because some zipcodes are in 2 boroughs and are therefore counted twice

Total Number of complaints: 2338742
Total Number of complaints after separating by borough: 2704865


In [29]:
# calculate population-adjusted complaint-index for each borough
# The complaint-index measures the number of complaints in a borough compared to other boroughs in NYC

# calcualate ratio of complaints to population size for each borough
# get the population size of each borough relative to the total population of NYC
# adjusted complaint-index is the ratio of complaints in the borough relative to borough population share in NYC

total_nyc_population = np.sum(int(p[1]) for p in population_by_borough)

complaint_index = []
for i in range(len(population_by_borough)):
    pct_complaints_for_borough = num_complaints_by_borough[i][1]/population_by_borough[i][1]
    relative_population = population_by_borough[i][1]/total_nyc_population
    
    adjusted_complaint_index = pct_complaints_for_borough/relative_population
    complaint_index.append((population_by_borough[i][0], adjusted_complaint_index))

In [36]:
# sort the list by largest complaint index and print
complaint_index.sort(key=lambda x: x[1], reverse=True)
print("The biggest complainers in descending order using complaint-index:\n{}".format(complaint_index))

The biggest complainers in descending order using complaint-index:
[('STATEN ISLAND', 5.4961961409611755), ('BRONX', 1.775702258247686), ('MANHATTAN', 1.3565384470550297), ('BROOKLYN', 1.056983276414725), ('QUEENS', 0.8685282345258203)]
