In [None]:
def read_311_data(table):
    """Load a 311 service-request CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. read the CSV with 'Unique Key' as the index;
      2. normalise 'Incident Zip' to a 5-digit string (invalid values become NaN);
      3. drop every row that contains at least one missing value;
      4. drop rows whose 'Borough' is 'Unspecified';
      5. parse 'Created Date' and 'Closed Date' as datetimes;
      6. add 'processing_time' (closed minus created) and drop negative values.

    Parameters
    ----------
    table : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the cleaned data, safe to add columns to.

    Raises
    ------
    ValueError
        If a date string does not match '%m/%d/%Y %I:%M:%S %p'.
    """
    import pandas as pd
    import numpy as np

    date_format = '%m/%d/%Y %I:%M:%S %p'

    def fix_zip(input_zip):
        """Return the zip as a string, or NaN if unparseable or outside 10000-19999."""
        try:
            input_zip = int(float(input_zip))
        except (TypeError, ValueError, OverflowError):
            # Not a plain number: try the ZIP+4 form '12345-6789' and keep the first part.
            try:
                input_zip = int(input_zip.split('-')[0])
            except (AttributeError, TypeError, ValueError):
                # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
                return np.nan
        if input_zip < 10000 or input_zip > 19999:
            return np.nan
        return str(input_zip)

    # Read the file
    df = pd.read_csv(table, index_col='Unique Key')

    # Fix the zip
    df['Incident Zip'] = df['Incident Zip'].apply(fix_zip)

    # Drop all rows that have any NaNs in them
    df = df.dropna(how='any')

    # Get rid of unspecified boroughs; copy so the assignments below write to an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    df = df[df['Borough'] != 'Unspecified'].copy()

    # Convert times to datetime and create a processing time column
    df['Created Date'] = pd.to_datetime(df['Created Date'], format=date_format)
    df['Closed Date'] = pd.to_datetime(df['Closed Date'], format=date_format)
    df['processing_time'] = df['Closed Date'] - df['Created Date']

    # Finally, get rid of negative processing times and return the final data frame.
    # Returning a copy lets callers add columns without chained-assignment warnings.
    return df[df['processing_time'] >= pd.Timedelta(0)].copy()
# Load the cleaned 311 data and print a summary of the resulting frame.
# NOTE(review): the next cell reads the same file again into `data`; confirm
# whether both loads are needed.
df = read_311_data('input_data')
df.info()

In [None]:
# Path of the 311 CSV file handed to read_311_data
table = 'input_data'
# Cleaned frame used by the analysis cells below
data = read_311_data(table)
data

In [None]:
pip install gmplot --upgrade

In [None]:
# Draw a heatmap that shows the relative concentration of complaints, using
# their latitudes and longitudes. First set up the base map, which can be created
# in two ways:
#   GoogleMapPlotter(center_lat, center_lon, zoom)
#   GoogleMapPlotter.from_geocode(location_string, zoom)

import gmplot

# Option 1: centre the map on explicit coordinates

gmap = gmplot.GoogleMapPlotter(40.7128, -74.0059, 8)

# Option 2: centre the map on a geocoded place name (left disabled)

#gmap = gmplot.GoogleMapPlotter.from_geocode("New York", 10)

In [None]:
# Then we generate the heatmap passing the two data series (latitude and longitude) to the function
%matplotlib inlin
gmap.heatmap(data['Latitude'], data['Longitude'])

In [None]:
# Save the heatmap to an HTML file that can be viewed, printed or included in another HTML page
%matplotlib inline
gmap.draw('incidents.html')

In [None]:
# Groupby example: number of incidents handled by each agency, as a bar chart
agency_group = data.groupby('Agency')
agency_group.size().plot.bar()

In [None]:
# Group on two keys at once so every (agency, borough) pair gets its own bar in a single graph
agency_borough = data.groupby(['Agency', 'Borough'])
agency_borough.size().plot.bar()

In [None]:
# Unstack the group sizes so that each agency gets one bar per borough.
# The grouping is rebuilt from `data` here because a later cell rebinds
# `agency_borough` to an unstacked DataFrame; relying on that name would make
# this cell fail when it is re-run after that cell.
data.groupby(['Agency', 'Borough']).size().unstack().plot(kind = 'bar')

In [None]:
# Same chart as before, drawn on a larger canvas and with a title
agency_borough = data.groupby(['Agency', 'Borough'])
agency_borough.size().unstack().plot.bar(
    title = 'Incidents in each Agency by Borough',
    figsize = (15, 15),
)

In [None]:
# SEASONAL ANALYSIS
# Incidents by time: since we know the creation date of each incident, we can
# build a bar graph of the number of incidents by month.
# First create a new date field formatted as yyyymm.
import datetime  # no longer needed by this cell; kept in case other cells rely on it
# The vectorised .dt accessor formats the whole datetime column at once,
# replacing the per-row apply(lambda ...: strftime(...)).
data['yyyymm'] = data['Created Date'].dt.strftime('%Y%m')
data['yyyymm']

In [None]:
import numpy as np
# Count incidents per (month, agency) pair and chart them with one bar per agency within each month
data_agency = data.groupby(['yyyymm', 'Agency'])
data_agency.size().unstack().plot.bar(figsize = (12, 12))

In [None]:
# Examining agencies: incident counts per agency, most frequent first.
# The full ranking is shown; append .head(5) to report only the top 5 values.
data.groupby('Agency').size().sort_values(ascending=False)

In [None]:
# Bar chart of incident counts per agency, largest first
data.groupby('Agency').size().sort_values(ascending=False).plot.bar(figsize = (20, 4))

In [None]:
# Drill down into complaints by agency and borough: one row per agency, one column per borough.
# NOTE(review): this rebinds `agency_borough`, which earlier cells use for the
# GroupBy object, to the unstacked table of counts.
agency_borough = data.groupby(['Agency', 'Borough']).size().unstack()
agency_borough

In [None]:
# 'Top 5 agency' subplots: one horizontal bar chart per borough.
# The subplots are arranged in three rows and two columns. Since there are
# 5 boroughs, one plot will be blank.

COL_NUM = 2
ROW_NUM = 3
import matplotlib.pyplot as plt
fig, axes = plt.subplots(ROW_NUM, COL_NUM, figsize = (12, 12))

# DataFrame.items() yields (column label, column Series) pairs; it replaces
# iteritems(), which was removed in pandas 2.0.
for i, (label, col) in enumerate(agency_borough.items()):
    # Fill the grid row by row: the i-th borough goes to (i // COL_NUM, i % COL_NUM)
    row, column = divmod(i, COL_NUM)
    ax = axes[row, column]
    top_agencies = col.sort_values(ascending = False).head(5)
    top_agencies.plot(kind = 'barh', ax = ax)
    ax.set_title(label)
plt.tight_layout()

In [None]:
# Compute simple descriptive statistics of processing time for each borough

# Keep only the two columns needed, then group the rows by borough
grouped = data[['processing_time', 'Borough']].groupby('Borough')
grouped.describe()

In [None]:
# Convert the timedelta processing_time into a float number of days for calculation purposes.
# Dividing the whole column by a one-day timedelta is vectorised, so no per-row apply is needed.
data['float_time'] = data['processing_time'] / np.timedelta64(1, 'D')
data

In [None]:
# With processing time available as a float (in days), statistics are easy to compute:
# mean processing time per agency, slowest agency first
grouped = data[['float_time', 'Agency']].groupby('Agency')
grouped.mean().sort_values('float_time', ascending=False)

In [None]:
# Histogram of processing time in days, using 50 bins
data['float_time'].hist(bins = 50)