### This code parse crime data for all 50 states from disaster center crime (https://www.disastercenter.com/crime/). The process includes parsing, raw data cleaning, effective data extractin, data cleaning (remove syntax, filter out Nans, data type convert), and merge to master sheet 

### The Key for the data set is State (In upper 2 letters state code, e.g. 'AL', 'CA')

### Powered by requestsm BeautifulSoup for parsing

In [161]:
import requests
from bs4 import BeautifulSoup
import string
import pandas as pd
import re

In [162]:
def fetch_and_parse_content(url):
    # fetch content from url
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        print("Error: Unable to fetch content")
        return None

def parse_html_to_dataframe(html_content): # used for 1960 - 2009
    # Parse HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    rows = soup.find_all('tr')

    # Extract data from rows
    data = []
    for row in rows:
        cols = row.find_all('td')
        cols = [clean_value(col.text.strip()) for col in cols]
        #prin(cols)
        data.append(cols)

    # Remove any empty rows
    data = [row for row in data if len(row) > 0]
    #print(data)

    # Column names
    column_names = ['Year', 'Population', 'Index', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary', 'Larceny-Theft', 'Vehicle Theft']

    # Create DataFrame
    try:
        df = pd.DataFrame(data, columns=column_names)
    except:
        print(data)

    # Remove the first row, which contains the column names
    df = df.iloc[1:]

    # Drop rows with missing values
    df = df.dropna()

    return df

def clean_value(value):
    return re.sub(r'\s*,\s*', ',', value)

def parse_html_to_dataframe_v2(html_content): # this is used for 2010 -2019 because of the different format
    # Parse HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    rows = soup.find_all('tr')

    # Extract data from rows
    """
    data = []
    for row in rows:
        cols = row.find_all(['td', 'small'])
        cols = [clean_value(col.text.strip()) for col in cols]
        # Remove duplicates in each row
        cols = cols[::2]
        data.append(cols)

    # Remove any empty rows
    data = [row for row in data if len(row) > 0]

    # Column names
    column_names = ['Year', 'Population', 'Index', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary', 'Larceny-Theft', 'Vehicle Theft']

    # Create DataFrame
    try:
        df = pd.DataFrame(data, columns=column_names)
    except:
        for d in data:
            print(len(d))
        print(data)
    """

    data = []
    for row in rows:
        cols = row.find_all(['td'])
        cleaned_cols = []
        for col in cols:
            innermost = col
            while innermost.find(['small', 'big']):
                innermost = innermost.find(['small', 'big'])
            cleaned_cols.append(clean_value(innermost.text.strip()))
        data.append(cleaned_cols)

    df = pd.DataFrame(data)

    # Remove the first row, which contains the column names
    df = df.iloc[1:]

    # Drop rows with missing values
    df = df.dropna()

    return df

def extract_effect_parts(html_content):
    all_content = str(html_content)
    effect_parts = all_content.split("""</tr>
              </tbody>
            </table>
            </center>
            </td>
          </tr>
          <tr>
            <td style="text-align: center;">
            <table style="text-align: left; width: 100%;" border="0"
 cellpadding="0" cellspacing="0">
              <tbody>
                <tr>""")[0]
    #print(effect_parts)
    try:
        effect_parts = effect_parts.split("""Number of Crimes""")[1]
    except:
        try: 
            effect_parts = effect_parts.split("""Population and Number""")[1]
        except:
            try:
                effect_parts = effect_parts.split("""and Number of""")[1]
            except:
                try: 
                    effect_parts = effect_parts.split("""Number of""")[1]
                except:
                    print("Error: Unable to parse content")
    effect_parts = effect_parts.split("""Forcible""")
    return effect_parts

def create_crime_dataframe(effect_parts):
    crime_master = pd.DataFrame(columns=['Year', 'Population', 'Index', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary', 'Larceny-Theft', 'Vehicle Theft'])
    
    for i in range(1, 6):
        temp_df = parse_html_to_dataframe(effect_parts[i])
        # Add temp_df to master sheet
        crime_master = pd.concat([crime_master, temp_df])

    # Process 2010-2019 because of different format
    temp_df = parse_html_to_dataframe_v2(effect_parts[6].split('</tbody>')[0])
    temp_df.columns = ['Year', 'Population', 'Index', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary', 'Larceny-Theft', 'Vehicle Theft']
    crime_master = pd.concat([crime_master,temp_df])
   
    return crime_master

def remove_punctuation(s):
    return s.translate(str.maketrans('', '', string.punctuation))


### Define All State Lowercase Abbreviations For Scraping All Data

In [142]:

state_abbs = [
    'al', 'ak', 'az', 'ar', 'ca', 'co', 'ct', 'de', 'fl', 'ga',
    'hi', 'id', 'il', 'in', 'ia', 'kn', 'ky', 'la', 'me', 'md',
    'ma', 'mi', 'mn', 'ms', 'mo', 'mt', 'ne', 'nv', 'nh', 'nj',
    'nm', 'ny', 'nc', 'nd', 'oh', 'ok', 'or', 'pa', 'ri', 'sc',
    'sd', 'tn', 'tx', 'ut', 'vt', 'va', 'wa', 'wv', 'wi', 'wy'
]



### Main part

In [169]:
master = pd.DataFrame(columns=['Year', 'State', 'Population', 'Index', 'Violent', 'Property', 'Murder', 'Rape', 'Robbery', 'Aggravated Assault', 'Burglary', 'Larceny-Theft', 'Vehicle Theft'])
for s in state_abbs:
    print(s)
    if s in ["ms","mo","mt","ne","nj","nm","nc","nd","ok"]:
        url = "https://www.disastercenter.com/crime/{state}crimn.htm".format(state = s)
    else:
        url = "https://www.disastercenter.com/crime/{state}crime.htm".format(state = s)
    content = fetch_and_parse_content(url)
    effect_content = extract_effect_parts(content)
    to_add = create_crime_dataframe(effect_content)
    
    to_add["State"] = s.upper()
    #print(to_add.columns)
    master = pd.concat([master,to_add],ignore_index=True)

for col in master.columns:
    if col == "State":
        continue
    master[col] = master[col].apply(remove_punctuation).astype(int)
master.sample(30)

al
ak
az
ar
ca
co
ct
de
fl
ga
hi
id
il
in
ia
kn
ky
la
me
md
ma
mi
mn
ms
mo
mt
ne
nv
nh
nj
nm
ny
nc
nd
oh
ok
or
pa
ri
sc
sd
tn
tx
ut
vt
va
wa
wv
wi
wy


Unnamed: 0,Year,State,Population,Index,Violent,Property,Murder,Rape,Robbery,Aggravated Assault,Burglary,Larceny-Theft,Vehicle Theft
1978,1962,ND,642000,6127,128,5999,8,27,40,53,1217,4246,536
37,1997,AL,4319000,211188,24379,186809,426,1396,6931,15626,43786,127616,15407
2311,1995,RI,990000,42021,3643,38378,33,267,914,2429,9234,24780,4364
2087,2011,OH,11541007,415790,35218,380572,500,3679,15991,15048,112901,246744,20927
1333,1972,MN,3896000,130674,6798,123876,95,571,3290,2842,36124,74612,13140
499,1978,FL,8594000,607552,65792,541760,949,3960,17701,43182,170061,338299,33400
936,1995,KN,2565000,125350,10792,114558,159,938,2775,6920,27404,78855,8299
2608,1992,UT,1813000,102589,5267,97322,54,823,1014,3376,16045,76964,4313
185,1964,AR,1933000,26274,2641,23633,147,157,565,1772,7135,14622,1876
1277,1976,MI,9104000,589779,58814,530965,1014,3287,30284,24229,151901,323243,55821


In [170]:
master.to_csv("./crime_master.csv")