# CrimeInUSbyMetropolitanArea Prep

# Setup

## Imports

In [2]:
import pandas as pd
import numpy as np
import re

## Parameters

In [3]:
# Excel workbook read by the loading cell (input).
LOCAL_MSA_CRIME_DATASET = (
    "../../../data/RQ3/raw/CrimeInUSbyMetropolitanArea.xls"
)
# CSV file written by the final cell of this notebook (output).
PROCESSED_MSA_CRIME_DATASET = (
    "../../../data/RQ3/processed/msa_crime_df.csv"
)

## Configuration

# Loading the Crime in the US by MSA Data

In [4]:
# header=3 takes the column names from the fourth row of the sheet.
# skipfooter=39 drops the trailing rows that hold the footnotes and the
# Puerto Rico territories, neither of which is wanted here.
df = pd.read_excel(
    LOCAL_MSA_CRIME_DATASET,
    header=3,
    skipfooter=39,
)
df.head()

Unnamed: 0,Metropolitan Statistical Area,Counties/principal cities,Population,Violent\ncrime,Murder and\nnonnegligent\nmanslaughter,Rape1,Robbery,Aggravated\nassault,Property\ncrime,Burglary,Larceny-\ntheft,Motor\nvehicle\ntheft
0,"Abilene, TX M.S.A.2",,170417.0,,,,,,,,,
1,,"Includes Callahan, Jones, and Taylor Counties",,,,,,,,,,
2,,City of Abilene,122480.0,591.0,8.0,97.0,104.0,382.0,3528.0,734.0,2561.0,233.0
3,,Total area actually reporting,1.0,670.0,9.0,109.0,112.0,440.0,4025.0,910.0,2846.0,269.0
4,,"Rate per 100,000 inhabitants",,393.2,5.3,64.0,65.7,258.2,2361.9,534.0,1670.0,157.8


## Initial Processing

In [5]:
# Restrict the frame to the four columns this notebook keeps.
# The murder column's header contains embedded newlines, exactly as it
# appears in the loaded workbook.
key_cols = [
    'Metropolitan Statistical Area',
    'Counties/principal cities',
    'Population',
    'Murder and\nnonnegligent\nmanslaughter',
]

df = df.loc[:, key_cols]
df.columns

Index(['Metropolitan Statistical Area', 'Counties/principal cities',
       'Population', 'Murder and\nnonnegligent\nmanslaughter'],
      dtype='object')

In [6]:
# Give the retained columns short, single-line names.
df = df.rename(
    columns={
        "Metropolitan Statistical Area": "MSA",
        "Counties/principal cities": "County/City",
        # The violent-crime column is not among the columns selected earlier,
        # so this entry has no effect here (rename skips missing labels by default).
        "Violent\ncrime": "Violent Crime",
        "Murder and\nnonnegligent\nmanslaughter": "Murder and nonnegligent manslaughter",
    }
)
df.columns

Index(['MSA', 'County/City', 'Population',
       'Murder and nonnegligent manslaughter'],
      dtype='object')

## Remove leftover superscript footnote digits from the MSA column

In [7]:
# Strip the footnote digits that the spreadsheet's superscripts left behind in
# the MSA labels (e.g. "Abilene, TX M.S.A.2" -> "Abilene, TX M.S.A.").
# The pattern is a raw string so the backslash-d reaches the regex engine as
# written instead of relying on Python tolerating an invalid string escape.
# NOTE(review): this removes every run of digits in a label, not only a
# trailing one, and some labels end in "M.S.A" without the final period --
# confirm whether either matters for downstream joins.
df['MSA'] = df['MSA'].str.replace(r'\d+', '', regex=True)
# Show the cleaned labels; rows without an MSA label (NaN) are left out.
df['MSA'].dropna().tolist()

['Abilene, TX M.S.A.',
 'Akron, OH M.S.A.',
 'Albany, GA M.S.A.',
 'Albany-Schenectady-Troy, NY M.S.A.',
 'Albuquerque, NM M.S.A.',
 'Alexandria, LA M.S.A.',
 'Allentown-Bethlehem-Easton, PA-NJ M.S.A.',
 'Altoona, PA M.S.A.',
 'Amarillo, TX M.S.A.',
 'Anchorage, AK M.S.A.',
 'Ann Arbor, MI M.S.A.',
 'Anniston-Oxford, AL M.S.A.',
 'Appleton, WI M.S.A.',
 'Asheville, NC M.S.A.',
 'Atlanta-Sandy Springs-Alpharetta, GA M.S.A.',
 'Atlantic City-Hammonton, NJ M.S.A.',
 'Auburn-Opelika, AL M.S.A',
 'Augusta-Richmond County, GA-SC M.S.A.',
 'Austin-Round Rock-Georgetown, TX M.S.A.',
 'Bakersfield, CA M.S.A.',
 'Baltimore-Columbia-Towson, MD M.S.A.',
 'Bangor, ME M.S.A.',
 'Barnstable Town, MA M.S.A.',
 'Baton Rouge, LA M.S.A.',
 'Battle Creek, MI M.S.A',
 'Bay City, MI M.S.A.',
 'Beaumont-Port Arthur, TX M.S.A.',
 'Beckley, WV M.S.A',
 'Bellingham, WA M.S.A',
 'Bend, OR M.S.A.',
 'Billings, MT M.S.A.',
 'Binghamton, NY M.S.A.',
 'Bismarck, ND M.S.A.',
 'Blacksburg-Christiansburg, VA M.S.A.',
 

# Saving the Dataframe

In [8]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
df.to_csv(path_or_buf=PROCESSED_MSA_CRIME_DATASET, index=False)