# What Dis

2010–2016 crime data varaious cities throughout the United States as curated by some random website.

[source](http://www.city-data.com/crime/)

# Imports and Definitions

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
def to_val(cell):
    val = cell.get_text().split('(')[0].strip().replace(',', '')
    return float(val) if val != 'N/A' else np.nan

In [4]:
abbreviations = {
    "Alabama"       : "AL",
    "Alaska"        : "AK",
    "Arizona"       : "AZ",
    "Arkansas"      : "AR",
    "California"    : "CA",
    "Colorado"      : "CO",
    "Connecticut"   : "CT",
    "Delaware"      : "DE",
    "Florida"       : "FL",
    "Georgia"       : "GA",
    "Hawaii"        : "HI",
    "Idaho"         : "ID",
    "Illinois"      : "IL",
    "Indiana"       : "IN",
    "Iowa"          : "IA",
    "Kansas"        : "KS",
    "Kentucky"      : "KY",
    "Louisiana"     : "LA",
    "Maine"         : "ME",
    "Maryland"      : "MD",
    "Massachusetts" : "MA",
    "Michigan"      : "MI",
    "Minnesota"     : "MN",
    "Mississippi"   : "MS",
    "Missouri"      : "MO",
    "Montana"       : "MT",
    "Nebraska"      : "NE",
    "Nevada"        : "NV",
    "New Hampshire" : "NH",
    "New Jersey"    : "NJ",
    "New Mexico"    : "NM",
    "New York"      : "NY",
    "North Carolina": "NC",
    "North Dakota"  : "ND",
    "Ohio"          : "OH",
    "Oklahoma"      : "OK",
    "Oregon"        : "OR",
    "Pennsylvania"  : "PA",
    "Rhode Island"  : "RI",
    "South Carolina": "SC",
    "South Dakota"  : "SD",
    "Tennessee"     : "TN",
    "Texas"         : "TX",
    "Utah"          : "UT",
    "Vermont"       : "VT",
    "Virginia"      : "VA",
    "Washington"    : "WA",
    "West Virginia" : "WV",
    "Wisconsin"     : "WI",
    "Wyoming"       : "WY",
}

In [5]:
places = {
    "Alabama": [
        "Birmingham",
        "Montgomery",],
    "Alaska": [
        "Anchorage",
        "Juneau",],
    "Arizona": [
        "Phoenix",
        "Tucson",],
    "Arkansas": [
        "Little Rock",
        "Fort Smith",],
    "California": [
        "San Diego",
        "Los Angeles",
        "Sacramento",],
    "Colorado": [
        "Denver",
        "Colorado Springs",
        "Aurora",],
    "Connecticut": [
        "Bridgeport",
        "Hartford",],
    "Delaware": [
        "Dover",
        "Wilmington",],
    "Florida": [
        "Jacksonville",
        "Miami",
        "Tallahassee",],
    "Georgia": [
        "Atlanta",
        "Savannah",],
    "Hawaii": [
        "Honolulu",
        "Kuai",
        "Maui",],
    "Idaho": [
        "Boise",
        "Meridian",],
    "Illinois": [
        "Chicago",
        "Springfield",],
    "Indiana": [
        "Fort Wayne",
        "Indianapolis",],
    "Iowa": [
        "Cedar Rapids",
        "Des Moines",],
    "Kansas": [
        "Topeka",
        "Wichita",],
    "Kentucky": [
        "Frankfort",
        "Louisville",],
    "Louisiana": [
        "Baton Rouge",
        "New Orleans",],
    "Maine": [
        "Augusta",
        "Portland",],
    "Maryland": [
        "Baltimore",
        "Annapolis",],
    "Massachusetts": [
        "Boston",
        "Worcester",],
    "Michigan": [
        "Detroit",
        "Lansing",],
    "Minnesota": [
        "Minneapolis",
        "Saint Paul",],
    "Mississippi": [
        "Jackson",
        "Gulfport",],
    "Missouri": [
        "Kansas City",
        "Jefferson City",],
    "Montana": [
        "Billings",
        "Helena",],
    "Nebraska": [
        "Lincoln",
        "Omaha",],
    "Nevada": [
        "Carson City",
        "Las Vegas",
        "Reno",],
    "New Hampshire": [
        "Concord",
        "Manchester",],
    "New Jersey": [
        "Newark",
        "Trenton",],
    "New Mexico": [
        "Albuquerque",
        "Santa Fe",],
    "New York": [
        "Albany",
        "New York City",],
    "North Carolina": [
        "Charlotte",
        "Raleigh",],
    "North Dakota": [
        "Bismarck",
        "Fargo",],
    "Ohio": [
        "Columbus",
        "Cleveland",],
    "Oklahoma": [
        "Oklahoma City",
        "Tulsa",],
    "Oregon": [
        "Portland",
        "Salem",],
    "Pennsylvania": [
        "Harrisburg",
        "Philadelphia",],
    "Rhode Island": [
        "Providence",
        "Warwick",],
    "South Carolina": [
        "Charleston",
        "Columbia",],
    "South Dakota": [
        "Pierre",
        "Sioux Falls",],
    "Tennessee": [
        "Nashville",
        "Memphis",],
    "Texas": [
        "Austin",
        "Houston",],
    "Utah": [
        "Salt Lake City",
        "West Valley City",],
    "Vermont": [
        "Burlington",
        "Montpelier",],
    "Virginia": [
        "Richmond",
        "Virginia Beach",],
    "Washington": [
        "Olympia",
        "Seattle",],
    "West Virginia": [
        "Charleston",
        "Huntington",],
    "Wisconsin": [
        "Madison",
        "Milwaukee",],
    "Wyoming": [
        "Casper",
        "Cheyenne",],
}

In [6]:
corrections = { # little quirks
    "Boise": "Boise City",
    "Saint Paul": "St. Paul",
    "New York City": "New York",
}

# Scraping

In [7]:
data = []
with requests.Session() as session:
    for state, cities in places.items():
        print("scraping data for {state}:".format(state=state), end=' ')
        for city in cities:
            print(city, end=', ')
            if city in corrections:
                city = corrections[city]
            state_url = "http://www.city-data.com/crime/crime-{city}-{state}.html" \
                        .format(city=city.replace(' ', '-'),
                                state=state.replace(' ', '-'))
            city_soup = BeautifulSoup(session.get(state_url).content, 'html.parser')
            if city_soup.find(id="errormsg"):
                print("unable to find {city}, {state_abbr}; skipping"
                      .format(city=city,
                              state_abbr=abbreviations[state]))
                continue
            table = city_soup.find(id="crimeTab")
            city_data = []
            city_data.append([cell.find('h4').get_text()
                              for cell
                              in table.find('thead').find_all('tr')[-1].find_all('th')][1:])
            for row in table.find('tbody').find_all('tr')[:-1]: # ignore footer
                city_data.append([to_val(cell)
                                  for cell
                                  in row.find_all('td')[1:]])
            data.append(pd.DataFrame({
                'state': state,
                'city': city,
                'year': city_data[0],
                'murder': city_data[1],
                'rape': city_data[2],
                'robbery': city_data[3],
                'assault': city_data[t4],
                'burglary': city_data[5],
                'theft': city_data[6],
                'vehicle theft': city_data[7],
                'arson': city_data[8],
            }))
        print()
        
df = pd.concat(data, ignore_index=True).set_index(['state', 'city', 'year']).sort_index()

scraping data for Alabama: Birmingham, 

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
df

In [None]:
df.to_pickle("citydata_crime_data.pkl")