In [1]:
import pandas as pd
import numpy as np

In [2]:
crimeDF = pd.read_csv("FBI_2013_Hate_Crime_Data.csv")

In [3]:
crimeDF.sample(n=5, random_state=1)

Unnamed: 0,State,Agency type,Agency name,Race,Religion,Sexual orientation,Ethnicity,Disability,Gender,Gender Identity,1st quarter,2nd quarter,3rd quarter,4th quarter,Population
258,California,Metropolitan Counties,Santa Cruz,1,0,1,1,0,0,0.0,2.0,0.0,1.0,0.0,
984,New_Jersey,Cities,Brick Township,2,1,0,1,0,0,0.0,1.0,2.0,1.0,0.0,75371.0
922,Missouri,Cities,Raytown,1,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,29501.0
351,Connecticut,Cities,Plainville,0,0,1,0,0,0,0.0,,,,1.0,17850.0
231,California,Universities and Colleges,California State Polytechnic University: San L...,0,0,1,0,0,0,0.0,1.0,0.0,0.0,0.0,18679.0


We can see Metropolitan Counties and Nonmetropolitan Counties has NaN in their Population column.

In [4]:
crimeDF.loc[crimeDF['Agency type']==('Metropolitan Counties')].sample(n=5, random_state=1)

Unnamed: 0,State,Agency type,Agency name,Race,Religion,Sexual orientation,Ethnicity,Disability,Gender,Gender Identity,1st quarter,2nd quarter,3rd quarter,4th quarter,Population
616,Louisiana,Metropolitan Counties,Calcasieu,0,4,3,0,0,0,0.0,4.0,1.0,1.0,1.0,
315,Colorado,Metropolitan Counties,Pueblo,1,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,
316,Colorado,Metropolitan Counties,Weld,1,0,1,0,0,0,0.0,1.0,0.0,1.0,0.0,
503,Iowa,Metropolitan Counties,Mills,1,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,
860,Michigan,Metropolitan Counties,Oakland,7,2,5,3,0,0,0.0,5.0,4.0,5.0,3.0,


In [5]:
crimeDF.loc[crimeDF['Agency type']==('Nonmetropolitan Counties')].sample(n=5, random_state=1).sample(n=5, random_state=1)

Unnamed: 0,State,Agency type,Agency name,Race,Religion,Sexual orientation,Ethnicity,Disability,Gender,Gender Identity,1st quarter,2nd quarter,3rd quarter,4th quarter,Population
534,Kansas,Nonmetropolitan Counties,Greenwood,1,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,
1636,Utah,Nonmetropolitan Counties,Carbon,3,0,0,0,0,0,0.0,1.0,0.0,2.0,0.0,
913,Minnesota,Nonmetropolitan Counties,Rice,1,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,
1802,West_Virginia,Nonmetropolitan Counties,Harrison,1,1,2,0,0,0,0.0,0.0,0.0,2.0,2.0,
1824,Wisconsin,Nonmetropolitan Counties,Oneida,1,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0,


We will fill the NaN value in the crimeDF with information we got from another file contains population for counties.

In [6]:
countyDF = pd.read_csv("county_population.csv")

we can see the 'link' column is not cleaned, we only want to keep the county name.

In [7]:
countyDF.sample(n=5, random_state=1)

Unnamed: 0,link,State,pop2016,pop2010,GrowthRate
80,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Massachusetts,785205,745422,0.05337
84,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Georgia,755754,689595,0.095939
33,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Ohio,1248514,1278200,-0.023225
81,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",California,771410,719899,0.071553
93,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Missouri,698895,674804,0.035701


Clean the link column and store the result in a listed called link_clean.

In [8]:
whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')

In [9]:
clean_link= []
for element in countyDF['link']:
    element = element.split(",")[4].split(':')[1]
    element = ''.join(filter(whitelist.__contains__, element))
    clean_link.append(element)

In [10]:
countyDF['clean_link'] = clean_link

In [11]:
countyDF.sample(n=5, random_state=1)

Unnamed: 0,link,State,pop2016,pop2010,GrowthRate,clean_link
80,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Massachusetts,785205,745422,0.05337,Essex County
84,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Georgia,755754,689595,0.095939,Cobb County
33,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Ohio,1248514,1278200,-0.023225,Cuyahoga County
81,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",California,771410,719899,0.071553,San Mateo County
93,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Missouri,698895,674804,0.035701,Jackson County


Because our crime records is in 2013, so we will generate the population of each county in 2013 with their population in 2010 and the growth rate of their population.

In [12]:
pop2013 = []
for index, row in countyDF.iterrows():
    result = row['pop2010']*(1 + float(row['GrowthRate']))**3
    pop2013.append(int(result))

In [13]:
countyDF['pop2013'] = pop2013

In [14]:
countyDF.sample(n=5, random_state=1)

Unnamed: 0,link,State,pop2016,pop2010,GrowthRate,clean_link,pop2013
80,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Massachusetts,785205,745422,0.05337,Essex County,871253
84,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Georgia,755754,689595,0.095939,Cobb County,907722
33,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Ohio,1248514,1278200,-0.023225,Cuyahoga County,1191194
81,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",California,771410,719899,0.071553,San Mateo County,885753
93,"{""type"":""a"",""key"":null,""ref"":null,""props"":{""hr...",Missouri,698895,674804,0.035701,Jackson County,749687


We only need to keep the columns clean_link and pop2013.

In [15]:
countyDF = countyDF[['clean_link','pop2013']]

In [16]:
countyDF.sample(n=5, random_state=1)

Unnamed: 0,clean_link,pop2013
80,Essex County,871253
84,Cobb County,907722
33,Cuyahoga County,1191194
81,San Mateo County,885753
93,Jackson County,749687


Fill the NaN in population column in crimeDf with countyDF.

In [17]:
d = {}
for index, row in countyDF.iterrows():
    d[row['clean_link'].rsplit(' ', 1)[0]] = row['pop2013']

In [18]:
for index, row in crimeDF.iterrows():
    if type(row['Population']) is str:
        crimeDF.at[index, 'Population'] = row['Population'].replace(',', '')
    if row['Agency name'] in d and type(row['Population']) is float:
        crimeDF.at[index, 'Population'] = d[row['Agency name']]

There are still some nan values in the poplation column for thoes counties that are not in our county file. For this project we will only keep the rows that doesn't have NaN in the population column.

In [19]:
for element in crimeDF:
    if type(element) is str:
        element= element.replace(',', '')
crimeDF['Population'] = crimeDF['Population'].astype(float)
crimeDF = crimeDF[np.isfinite(crimeDF['Population'])]

In [20]:
crimeDF['Population'].isna().sum()

0

There are also NaN values in other columns in the crimeDF. But from observe the dataframe, we find out these NaN values is just when the number of crime is 0(I guess sometimes they decide to leave the cell empty if no crime is reported). So we can just fill these NaNs with 0.

In [21]:
crimeDF.isna().sum()

State                  0
Agency type            0
Agency name            0
Race                   0
Religion               0
Sexual orientation     0
Ethnicity              0
Disability             0
Gender                 1
Gender Identity        1
1st quarter           25
2nd quarter           28
3rd quarter           17
4th quarter           36
Population             0
dtype: int64

In [22]:
crimeDF = crimeDF.fillna(0)

In [23]:
crimeDF.isna().sum()

State                 0
Agency type           0
Agency name           0
Race                  0
Religion              0
Sexual orientation    0
Ethnicity             0
Disability            0
Gender                0
Gender Identity       0
1st quarter           0
2nd quarter           0
3rd quarter           0
4th quarter           0
Population            0
dtype: int64

Create a new column to calculate the total number of crimes in each area.

In [24]:
total_crime = []

In [25]:
for index, row in crimeDF.iterrows():
    total = int(row[10]) + int(row[11]) + int(row[12]) + int(row[13])
    total_crime.append(total)

In [26]:
crimeDF['total_crime'] = total_crime

Create a new column called population_in_thounds to convert population to population with unit of 1000 people.

In [27]:
population_in_thounsands = []
for element in crimeDF['Population']:
    population_in_thounsands.append(element / 1000)

In [28]:
crimeDF['Population_in_thounsands'] = population_in_thounsands

In [29]:
crimeDF.head()

Unnamed: 0,State,Agency type,Agency name,Race,Religion,Sexual orientation,Ethnicity,Disability,Gender,Gender Identity,1st quarter,2nd quarter,3rd quarter,4th quarter,Population,total_crime,Population_in_thounsands
0,Alabama,Cities,Florence,2,0,0,0,0,0,0.0,0.0,1.0,0.0,1.0,39481.0,2,39.481
1,Alabama,Cities,Hoover,0,0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,84139.0,1,84.139
2,Alabama,Cities,Prattville,2,0,0,0,0,0,0.0,1.0,0.0,1.0,0.0,35154.0,2,35.154
3,Alabama,Cities,Tuscaloosa,1,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,94126.0,1,94.126
4,Alaska,Cities,Anchorage,8,0,0,0,0,0,0.0,2.0,3.0,3.0,0.0,299455.0,8,299.455


Create a new column to calculate the hate crime rate of each area. Crime rate is calculated by: (number of crimes / population) * 100000. It means the number of crimes likely will happen in every hundred thousand people.

In [30]:
crime_rate = []

In [31]:
for index, row in crimeDF.iterrows():
    cr = (row['total_crime'] / row['Population']) * 100000
    crime_rate.append(cr)
crimeDF['hate_crime_rate'] = crime_rate

In [32]:
crimeDF.head()

Unnamed: 0,State,Agency type,Agency name,Race,Religion,Sexual orientation,Ethnicity,Disability,Gender,Gender Identity,1st quarter,2nd quarter,3rd quarter,4th quarter,Population,total_crime,Population_in_thounsands,hate_crime_rate
0,Alabama,Cities,Florence,2,0,0,0,0,0,0.0,0.0,1.0,0.0,1.0,39481.0,2,39.481,5.065728
1,Alabama,Cities,Hoover,0,0,1,0,0,0,0.0,0.0,1.0,0.0,0.0,84139.0,1,84.139,1.188509
2,Alabama,Cities,Prattville,2,0,0,0,0,0,0.0,1.0,0.0,1.0,0.0,35154.0,2,35.154,5.689253
3,Alabama,Cities,Tuscaloosa,1,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0,94126.0,1,94.126,1.062406
4,Alaska,Cities,Anchorage,8,0,0,0,0,0,0.0,2.0,3.0,3.0,0.0,299455.0,8,299.455,2.67152


In [33]:
crimeDF.drop(crimeDF.columns[[3,4,5,6,7,8,9,10,11,12,13]], axis = 1, inplace = True)

In [34]:
crimeDF['hate_crime_rate'].median()

6.285750229122293

The median of hate crime rate is around 6.3. Create a new column to indicate each area has high hate crime rate(above median) or low hate crime rate(below median). 

In [35]:
high_hate_crime_rate = []
for element in crimeDF['hate_crime_rate']:
    if element > 6.3:
        high_hate_crime_rate.append(1)
    else:
        high_hate_crime_rate.append(0)
crimeDF['high_hate_crime_rate'] = high_hate_crime_rate

Rearrange columns of crimeDF so the Y variable is at the begining of the dataframe.

In [36]:
crimeDF = crimeDF[['high_hate_crime_rate', 'State', 'Agency type', 'Agency name', 'Population', 'total_crime', 'Population_in_thounsands', 'hate_crime_rate']]

In [37]:
agency_type_dummy = pd.get_dummies(crimeDF['Agency type'])

In [38]:
state_dummy = pd.get_dummies(crimeDF['State'])

In [39]:
crimeDF = pd.concat([crimeDF, agency_type_dummy, state_dummy], axis=1,sort = True)

In [40]:
crimeDF.drop(['State', 'Agency type', 'Agency name'],axis = 1, inplace = True)

In [41]:
crimeDF.drop(['Population', 'total_crime', 'hate_crime_rate'],axis = 1, inplace = True)

In [42]:
crimeDF.head()

Unnamed: 0,high_hate_crime_rate,Population_in_thounsands,Cities,Metropolitan Counties,Nonmetropolitan Counties,Universities and Colleges,Alabama,Alaska,Arizona,Arkansas,...,South_Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West_Virginia,Wisconsin,Wyoming
0,0,39.481,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,84.139,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,35.154,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,94.126,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,299.455,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
X = crimeDF.drop('high_hate_crime_rate', axis = 1)
y = crimeDF['high_hate_crime_rate']

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 1)

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
lm = LogisticRegression()

In [53]:
lm.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [55]:
predictions = lm.predict(X_test)

In [57]:
from sklearn.metrics import classification_report

In [59]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.79      0.67      0.72       249
           1       0.70      0.81      0.76       243

   micro avg       0.74      0.74      0.74       492
   macro avg       0.75      0.74      0.74       492
weighted avg       0.75      0.74      0.74       492



In [60]:
from sklearn.metrics import confusion_matrix

In [61]:
confusion_matrix(y_test, predictions)

array([[166,  83],
       [ 45, 198]])

In [62]:
from sklearn.metrics import accuracy_score

In [63]:
accuracy_score(y_test, predictions)

0.7398373983739838