## Review class

In this review we will download and process a large **.csv** file: the **Victoria Police Department Crime Database**.

In [None]:
%%bash
# Download the crime database export and save it as vicpd.csv.
# The URL is quoted so the shell never treats the '?' in the query string as a glob character.
wget "https://moto.data.socrata.com/api/views/f42u-v6f3/rows.csv?accessType=DOWNLOAD" -O vicpd.csv

In [None]:
import csv

fname = "vicpd.csv"

## Read the file with the csv module instead of str.split(","): a quoted field
## may itself contain commas, and a naive split would shift every later column.
## newline='' is what the csv documentation asks for when opening the file.
with open(fname, newline='') as f:
    reader = csv.reader(f)
    # the keys are the first line of the file; an empty file yields no keys.
    keys = next(reader, [])
    ## collect the records into a list of dicts, one per non-blank row.
    ## zip() stops at the shorter of keys/row, so a row with extra fields
    ## can no longer raise IndexError.
    mList = [dict(zip(keys, row)) for row in reader if row]

print(len(mList), "records")

In [None]:
## next, let's make the records more useful.
import datetime as dt

## Turn the three timestamp strings of every record into python datetime objects.
DATE_FORMAT = "%m/%d/%Y %I:%M:%S %p"
DATE_FIELDS = ('incident_datetime', 'updated_at', 'created_at')

for x in mList:
    for field in DATE_FIELDS:
        # Only strings are parsed, so re-running this cell on records that were
        # already converted is a no-op instead of a TypeError from strptime().
        if isinstance(x[field], str):
            x[field] = dt.datetime.strptime(x[field], DATE_FORMAT)

print(mList[1])

In [None]:
##  print every record whose address_2 field is not empty
for rec in mList:
    if rec['address_2'] != '':
        print(rec)

In [None]:
##  remove the fields we will not use from every record:
##    zip, country, day_of_week, state, clearance_type,
##    incident_description, address_2, hour_of_day.
DROPPED_FIELDS = ('zip', 'country', 'day_of_week', 'state', 'clearance_type',
                  'incident_description', 'address_2', 'hour_of_day')

for x in mList:
    for field in DROPPED_FIELDS:
        # pop() with a default tolerates a field that is already gone, so this
        # cell can be run again without raising KeyError.
        x.pop(field, None)

In [None]:
## peek at the first record currently held in mList
print(mList[0])

In [None]:
## let's make latitude and longitude into floats, and make the location a
## (latitude, longitude) pair. then erase latitude and longitude.
## records that do not have a location are purged.

newList = []
for x in mList:
    if 'location' not in x:
        # First pass over this record. A blank latitude is checked as well as a
        # blank longitude, because float('') raises ValueError for either one.
        if not x['longitude'].strip() or not x['latitude'].strip():
            continue
        lon = float(x.pop('longitude'))
        lat = float(x.pop('latitude'))
        x['location'] = (lat, lon)
    # Records that already carry 'location' are kept untouched, so running the
    # cell a second time does not fail on the removed keys.
    newList.append(x)
mList = newList

print(mList[0])

In [None]:
## group the primary incident types under their parent type, as a dict of sets.
import collections as pyco

cTypes = pyco.defaultdict(set)
for rec in mList:
    cTypes[rec['parent_incident_type']].add(rec['incident_type_primary'])

## one output line per parent type: the parent, a colon, then each primary type
## followed by ', '.
for parent, children in cTypes.items():
    print(f"{parent}: " + "".join(f"{child}, " for child in children))


In [None]:
## tally the records per parent type and per (parent type, primary type) pair;
## the pie charts are built from these counts.
## string keys hold the parent totals, tuple keys hold the pair totals.

cCount = pyco.defaultdict(int)
for rec in mList:
    parent = rec['parent_incident_type']
    cCount[parent] += 1
    cCount[(parent, rec['incident_type_primary'])] += 1

## the parent totals first
parent_totals = [(key, n) for key, n in cCount.items() if isinstance(key, str)]
for name, total in parent_totals:
    print(name, total)
print("\n")

## and then all the Traffic types
traffic_totals = [(key, n) for key, n in cCount.items()
                  if isinstance(key, tuple) and key[0] == 'Traffic']
for pair, total in traffic_totals:
    print(pair, total)
        

In [None]:
## pie chart of the parent crime types, each as a percentage of all records.
import matplotlib.pyplot as plt

# the string keys of cCount are the parent types
parent_counts = [(key, n) for key, n in cCount.items() if isinstance(key, str)]
lab = [name for name, _ in parent_counts]
pct = [100*n/len(mList) for _, n in parent_counts]
print(len(pct))

with plt.xkcd(): ## draw in the xkcd style.
    plt.pie(pct, labels=lab, autopct='%1.1f%%')

In [None]:
## second pie chart: the primary types inside 'Traffic', each as a percentage
## of the Traffic total.
import matplotlib.pyplot as plt

# tuple keys of cCount are (parent type, primary type) pairs
traffic_counts = [(key[1], n) for key, n in cCount.items()
                  if isinstance(key, tuple) and key[0] == 'Traffic']
lab = [name for name, _ in traffic_counts]
pct = [100*n/cCount['Traffic'] for _, n in traffic_counts]
print(len(pct))

with plt.xkcd(): ## draw in the xkcd style.
    plt.pie(pct, labels=lab, autopct='%1.1f%%')

In [None]:
import folium as fo
from folium import plugins as fpl

## heat plot of all driving complaints.
## each row is [location[0], location[1], weight], with a fixed weight of 0.02.

hdat = [[x['location'][0], x['location'][1], 0.02] for x in mList
        if x['incident_type_primary'] == 'DRIVING COMPLAINTS']

## NOTE(review): the 'Stamen Terrain' tileset has been retired and recent folium
## releases appear to no longer accept it as a built-in name, so the map uses
## the OpenStreetMap tiles instead. confirm against the installed folium version.
mapa = fo.Map([48.4323, -123.3720], tiles='OpenStreetMap', zoom_start=13)
mapa.add_child(fpl.HeatMap(hdat))
mapa
