# ALY2100 Assignment 2 - Boston - CRIME INCIDENT REPORTS - 2021

In [1]:
from collections import Counter

import pandas as pd

In [2]:
# Public download URLs (data.boston.gov) for the two inputs of this notebook:
# the offense-code lookup workbook and the crime incident report CSV
offence_codes = (
    'https://data.boston.gov/dataset/6220d948-eae2-4e4b-8723-2dc8e67722a3'
    '/resource/3aeccf51-a231-4555-ba21-74572b4c33d6'
    '/download/rmsoffensecodes.xlsx'
)
crime_reports = (
    'https://data.boston.gov/dataset/6220d948-eae2-4e4b-8723-2dc8e67722a3'
    '/resource/3d818157-6e9b-4fa5-86de-436ca663d88e'
    '/download/tmp2u8p7cki.csv'
)

In [3]:
# Load the offense-code workbook (reading .xlsx needs the openpyxl engine) and
# collapse it into a lookup dictionary of the form {CODE: NAME}
offence_code_map = (
    pd.read_excel(offence_codes, engine='openpyxl')
    .set_index('CODE')['NAME']
    .to_dict()
)

In [4]:
# Load the crime incident reports.
# INCIDENT_NUMBER contains both strings and integers, so its dtype is given
# explicitly as str to suppress the warning raised on load.
df = pd.read_csv(crime_reports, dtype={'INCIDENT_NUMBER': str})
# Add the readable offense name by looking each OFFENSE_CODE up in the code map
df = df.assign(OFFENSE=df['OFFENSE_CODE'].map(offence_code_map))
# One dictionary per incident row, keyed by column name
data = df.to_dict(orient='records')

In [5]:
def concat_day_and_offense(d):
    """
    Build a "DAY_OF_WEEK - OFFENSE" label for one incident record
    :param d: Dictionary expected to contain DAY_OF_WEEK & OFFENSE keys
    :return: The two values joined as "<day> - <offense>";
        'N/A' when either key is absent from the dictionary
    """
    # Guard clause: both keys must be present, otherwise fall back to the placeholder
    if 'DAY_OF_WEEK' not in d.keys() or 'OFFENSE' not in d.keys():
        return 'N/A'
    day = d.get('DAY_OF_WEEK')
    offense = d.get('OFFENSE')
    return f"{day} - {offense}"

In [6]:
# Leverage the built-in map function to apply concat_day_and_offense to every
# record dictionary in data; map() returns a lazy iterator, so it is wrapped in
# list() to materialize one "DAY - OFFENSE" string per incident
res = list(map(concat_day_and_offense, data))

In [7]:
def compute_summary(data, ntop=5):
    """
    Count how often each value occurs and rank the values by frequency.

    Counting is done in a single pass with collections.Counter rather than
    calling list.count() once per unique value. Values with equal counts keep
    the order in which they first appear in ``data``, so the result is
    reproducible across runs instead of depending on set iteration order.

    :param data: Iterable of hashable values (e.g. "DAY - OFFENSE" strings);
        each element represents one occurrence
    :param ntop: Number of top values to return based on descending number of
        occurrences; ``None`` returns every value
    :return: Tuple of (ntop_summary, summary) where both values are dictionaries
        in the form of {DAY_OFFENSE: COUNT}, ordered by descending count
    """
    # most_common() with no argument returns every (value, count) pair sorted by
    # count, descending; ties stay in first-encountered order.
    ranked = Counter(data).most_common()
    # Slice the ranked list (instead of passing ntop to most_common) so ntop
    # keeps ordinary list-slicing semantics, e.g. ntop=0 yields an empty dict.
    ntop_summary = dict(ranked[:ntop])
    summary = dict(ranked)
    return ntop_summary, summary

In [8]:
# Count every day/offense combination in res and keep the five most frequent;
# the bare name on the last line displays the top-5 dictionary as the cell output
ntop_summary, summary = compute_summary(data=res, ntop=5)
ntop_summary

{'Friday - VERBAL DISPUTE': 1680,
 'Saturday - INVESTIGATE PERSON': 1523,
 'Friday - INVESTIGATE PERSON': 1523,
 'Sunday - VERBAL DISPUTE': 1249,
 'Monday - INVESTIGATE PERSON': 1240}

### Below is the output of the equivalent operation performed directly on the dataframe

In [9]:
# Vectorized counterpart of concat_day_and_offense: build the "DAY - OFFENSE"
# label for every row at once and attach it as a new column
df = df.assign(DAY_AND_OFFENSE=df['DAY_OF_WEEK'] + ' - ' + df['OFFENSE'])

In [10]:
# Top five day/offense combinations computed directly on the dataframe
(
    df.groupby('DAY_AND_OFFENSE')['INCIDENT_NUMBER']
    .count()                       # incidents per day/offense label
    .sort_values(ascending=False)  # most frequent first
    .head(5)
    .to_dict()
)

{'Friday - VERBAL DISPUTE': 1680,
 'Friday - INVESTIGATE PERSON': 1523,
 'Saturday - INVESTIGATE PERSON': 1523,
 'Sunday - VERBAL DISPUTE': 1249,
 'Monday - INVESTIGATE PERSON': 1240}