# What Is This?

City-level crime data for 2010–2016, covering cities throughout the United States, as curated by the FBI and scraped from its "Crime in the U.S." pages.

[source](https://ucr.fbi.gov/crime-in-the-u.s/)

# Imports and Definitions

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def to_val(cell):
    """Convert the text of a table cell to a float.

    The cell's text is stripped of surrounding whitespace and of commas
    (thousands separators) before conversion.

    Parameters
    ----------
    cell : object exposing ``get_text()`` (e.g. a BeautifulSoup tag)

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the cell has no text left
        after cleaning.
    """
    text = cell.get_text().strip().replace(',', '')
    if text == '':
        # Blank cell: the value was not reported.
        return np.nan
    return float(text)

In [4]:
# Misspelled state name (as scraped, lower-cased) -> correct spelling.
# good job, FBI
typos = dict(massachuetts="massachusetts")

In [5]:
# Report years to scrape: 2010 through 2016 inclusive.
years = [*range(2010, 2017)]

# Scraping

In [6]:
# Scrape the per-state city tables for every year in `years`, build one
# DataFrame per (year, state) page, and combine them into `df`, indexed by
# (state, city, year).

# Seconds to wait on a stalled connect/read before giving up, so a hung
# connection cannot block the cell forever.
REQUEST_TIMEOUT = 60

# Years whose tables carry a single rape column. The FBI revised their
# definition of rape in 2013 and added an extra column to the table, which
# shifts every later column's cell class by one.
LEGACY_TABLE_YEARS = (2010, 2011, 2012)

# Output column -> CSS class of the table cells that hold it.
# Classes not listed here are not collected by this notebook.
LEGACY_COLUMN_CLASSES = {
    'population': 'group1',
    'murder': 'group3',
    'forcible rape': 'group4',
    'robbery': 'group5',
    'aggravated assault': 'group6',
    'burglary': 'group8',
    'larceny or theft': 'group9',
    'motor vehicle theft': 'group10',
    'arson': 'group11',
}
REVISED_COLUMN_CLASSES = {
    'population': 'group1',
    'murder': 'group3',
    'rape (revised)': 'group4',
    'rape (legacy)': 'group5',
    'robbery': 'group6',
    'aggravated assault': 'group7',
    'burglary': 'group9',
    'larceny or theft': 'group10',
    'motor vehicle theft': 'group11',
    'arson': 'group12',
}


def _get_soup(session, url):
    """Fetch `url` through `session` and return the parsed HTML.

    Raises ``requests.HTTPError`` on a non-success status so a failed
    download is reported at the request, not as a parse error further on.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _state_frame(table_body, state, year):
    """Build the DataFrame for one state's table in one year.

    `state` and `year` are scalars broadcast over every row; the remaining
    columns are read from the cells of `table_body` by CSS class. The result
    always has a single 'forcible rape' column, whichever table layout the
    year uses.
    """
    is_legacy = year in LEGACY_TABLE_YEARS
    column_classes = LEGACY_COLUMN_CLASSES if is_legacy else REVISED_COLUMN_CLASSES

    columns = {
        'state': state,
        # City names may carry trailing footnote digits; strip them.
        'city': [cell.get_text().strip().rstrip("0123456789").lower()
                 for cell in table_body.find_all(class_='group0')],
        'year': year,
    }
    for column, css_class in column_classes.items():
        columns[column] = [to_val(cell)
                           for cell in table_body.find_all(class_=css_class)]
    frame = pd.DataFrame(columns)

    if not is_legacy:
        # Prefer the revised-definition count and fall back to the legacy
        # count where the revised one is missing.
        frame['forcible rape'] = frame['rape (revised)'].fillna(frame['rape (legacy)'])
        frame = frame.drop(['rape (revised)', 'rape (legacy)'], axis=1)
    return frame


data = []
with requests.Session() as session:
    for year in years:
        print("scraping {year} data for".format(year=year), end=' ')
        base_url = "https://ucr.fbi.gov/crime-in-the-u.s/{year}/crime-in-the-u.s.-{year}".format(year=year)

        # The first link in the div after the 'quick_finds' element leads to
        # the page listing every state's table.
        states_list_url = (
            _get_soup(session, base_url)
            .find(id='quick_finds')
            .find_next('div')
            .find_next('a')
            .get('href')
        )
        # 'secardarywrapper' (sic) is kept exactly as the class name the
        # scraper matches on; presumably it mirrors the site's markup.
        state_anchors = (
            _get_soup(session, states_list_url)
            .find_all(id='page_content')[-1]
            .find(class_='secardarywrapper')
            .find_all('a')
        )

        for state_anchor in state_anchors:
            state = state_anchor.get_text().strip().lower()
            state = typos.get(state, state)
            print(state, end=', ')

            # The last 'data' table on the state page holds the city rows.
            table_body = (
                _get_soup(session, state_anchor.get('href'))
                .find_all('table', class_='data')[-1]
                .find('tbody')
            )
            data.append(_state_frame(table_body, state, year))
        print('\n')

df = pd.concat(data, ignore_index=True).set_index(['state', 'city', 'year']).sort_index()

scraping 2010 data for alabama, alaska, arizona, arkansas, california, colorado, connecticut, delaware, district of columbia, florida, georgia, hawaii, idaho, illinois, indiana, iowa, kansas, kentucky, louisiana, maine, maryland, massachusetts, michigan, minnesota, mississippi, missouri, montana, nebraska, nevada, new hampshire, new jersey, new mexico, new york, north carolina, north dakota, ohio, oklahoma, oregon, pennsylvania, rhode island, south carolina, south dakota, tennessee, texas, utah, vermont, virginia, washington, west virginia, wisconsin, wyoming, 

scraping 2011 data for alabama, alaska, arizona, arkansas, california, colorado, connecticut, delaware, district of columbia, florida, georgia, idaho, illinois, indiana, iowa, kansas, kentucky, louisiana, maine, maryland, massachusetts, michigan, minnesota, mississippi, missouri, montana, nebraska, nevada, new hampshire, new jersey, new mexico, new york, north carolina, north dakota, ohio, oklahoma, oregon, pennsylvania, rhode 

In [7]:
# Preview the first rows only rather than rendering the whole frame,
# which is indexed by (state, city, year).
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,aggravated assault,arson,burglary,forcible rape,larceny or theft,motor vehicle theft,murder,population,robbery
state,city,year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
alabama,abbeville,2010,20.0,,19.0,0.0,45.0,3.0,1.0,2950.0,0.0
alabama,abbeville,2011,10.0,1.0,12.0,1.0,51.0,2.0,0.0,2701.0,2.0
alabama,abbeville,2012,18.0,,16.0,1.0,38.0,3.0,0.0,2715.0,3.0
alabama,abbeville,2013,7.0,,21.0,1.0,39.0,3.0,1.0,2645.0,2.0
alabama,abbeville,2014,7.0,,22.0,1.0,52.0,11.0,0.0,2643.0,0.0
alabama,abbeville,2015,6.0,,24.0,3.0,50.0,2.0,0.0,2610.0,0.0
alabama,abbeville,2016,10.0,,12.0,1.0,34.0,5.0,0.0,2608.0,0.0
alabama,adamsville,2010,5.0,,21.0,3.0,250.0,23.0,0.0,4796.0,2.0
alabama,adamsville,2011,26.0,2.0,61.0,2.0,226.0,13.0,2.0,4544.0,7.0
alabama,adamsville,2012,20.0,,49.0,1.0,223.0,20.0,0.0,4543.0,7.0


In [8]:
# Persist the scraped frame so later work can load it without re-scraping.
df.to_pickle(path="fbi_crime_data.pkl")