# Machine Learning Project
# Kansas City Crime Data Deep Dive

### Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
import pandas as pd
from pandas import datetime
from dateutil import parser
from scipy import stats
import os

### Obtain KCMO crime data (Raw data came from data.kcmo.org)

In [2]:
# Load the 2017 KCPD crime export. The path is relative to the notebook's
# working directory.
file_name = os.path.join('Resources', 'KCPD_Crime_Data_2017.csv')
# read_csv is called without index_col, so the frame already has a default
# 0..n-1 index; a bare reset_index() whose result is thrown away adds nothing.
kc_crime = pd.read_csv(file_name)
kc_crime.head()

Unnamed: 0,Report_No,Reported_Date,Reported_Time,From_Date,From_Time,To_Date,To_Time,Offense,IBRS,Description,...,Rep_Dist,Area,DVFlag,Invl_No,Involvement,Race,Sex,Age,Firearm Used Flag,Location
0,100080848,5/28/2017,3:44,5/28/2017,2:00,,,1352,280,Stolen Property OFFE,...,PJ3255,CPD,U,1,VIC,W,F,29.0,N,"4000 MILL ST\nKANSAS CITY 64111\n(39.053635, -..."
1,120046817,11/21/2017,13:30,11/20/2017,9:00,,,101,09A,HOMICIDE/Non Neglige,...,PJ1029,CPD,U,1,SUS,B,M,,Y,"1100 LOCUST ST\nKANSAS CITY 64105\n(39.10091, ..."
2,120046817,11/21/2017,13:30,11/20/2017,9:00,,,101,09A,HOMICIDE/Non Neglige,...,PJ1029,CPD,N,1,VIC,B,F,69.0,Y,"1100 LOCUST ST\nKANSAS CITY 64105\n(39.10091, ..."
3,120085080,4/27/2017,11:12,,,,,201,11A,Rape,...,,,U,1,VIC,B,F,21.0,N,99999\n
4,120085080,4/27/2017,11:12,,,,,201,11A,Rape,...,,,U,1,SUS,B,M,52.0,N,99999\n


### Features in dataset

* Age

### Change time to datetime and extract hour

In [3]:
# Parse the report date and reduce the report time to its hour of day.
# Bracket assignment is used so the target is always the column itself.
# NOTE: both columns are overwritten in place; re-run the load cell before
# re-executing this one, otherwise the hour integers get parsed again.
kc_crime['Reported_Date'] = pd.to_datetime(kc_crime['Reported_Date'])
# .dt.hour already discards minutes, so no separate floor to the hour is needed.
kc_crime['Reported_Time'] = pd.to_datetime(kc_crime['Reported_Time']).dt.hour

In [4]:
# List every column's dtype to check the result of the date/time conversion.
kc_crime.dtypes

Report_No                     int64
Reported_Date        datetime64[ns]
Reported_Time                 int64
From_Date                    object
From_Time                    object
To_Date                      object
To_Time                      object
Offense                       int64
IBRS                         object
Description                  object
Beat                        float64
Address                      object
City                         object
Zip Code                      int64
Rep_Dist                     object
Area                         object
DVFlag                       object
Invl_No                       int64
Involvement                  object
Race                         object
Sex                          object
Age                         float64
Firearm Used Flag            object
Location                     object
dtype: object

In [5]:
# Preview the first rows of kc_crime.
kc_crime.head()

Unnamed: 0,Report_No,Reported_Date,Reported_Time,From_Date,From_Time,To_Date,To_Time,Offense,IBRS,Description,...,Rep_Dist,Area,DVFlag,Invl_No,Involvement,Race,Sex,Age,Firearm Used Flag,Location
0,100080848,2017-05-28,3,5/28/2017,2:00,,,1352,280,Stolen Property OFFE,...,PJ3255,CPD,U,1,VIC,W,F,29.0,N,"4000 MILL ST\nKANSAS CITY 64111\n(39.053635, -..."
1,120046817,2017-11-21,13,11/20/2017,9:00,,,101,09A,HOMICIDE/Non Neglige,...,PJ1029,CPD,U,1,SUS,B,M,,Y,"1100 LOCUST ST\nKANSAS CITY 64105\n(39.10091, ..."
2,120046817,2017-11-21,13,11/20/2017,9:00,,,101,09A,HOMICIDE/Non Neglige,...,PJ1029,CPD,N,1,VIC,B,F,69.0,Y,"1100 LOCUST ST\nKANSAS CITY 64105\n(39.10091, ..."
3,120085080,2017-04-27,11,,,,,201,11A,Rape,...,,,U,1,VIC,B,F,21.0,N,99999\n
4,120085080,2017-04-27,11,,,,,201,11A,Rape,...,,,U,1,SUS,B,M,52.0,N,99999\n


### Rename & drop columns

In [6]:
# Give the zip column a name without a space, then build a narrower frame
# that leaves out the columns listed below (kc_crime itself keeps them).
kc_crime.rename(columns={'Zip Code': 'zip_code'}, inplace=True)
kc_crime_dropped_columns = kc_crime.drop(
    labels=[
        'Report_No',
        'From_Date', 'From_Time',
        'To_Date', 'To_Time',
        'Offense', 'IBRS',
        'Beat', 'Rep_Dist', 'Area',
        'DVFlag', 'Address',
    ],
    axis='columns',
)

### Filter for male and female victims age 90 and under in KCMO

In [7]:
# Drop records with zip code 99999 (appears to mark records without a real
# location - confirm against the data dictionary).
kc_crime_clean_zips = kc_crime_dropped_columns[kc_crime_dropped_columns['zip_code'] != 99999]

# Keep rows whose City contains "KANSAS CITY"; na=False drops rows with no City.
# NOTE(review): a substring match would also admit a value such as
# "NORTH KANSAS CITY" if one is present - confirm that is intended.
only_kc_crime = kc_crime_clean_zips[kc_crime_clean_zips['City'].str.contains("KANSAS CITY", na=False)]

# Keep only rows whose Involvement contains "VIC".
victims = only_kc_crime[only_kc_crime['Involvement'].str.contains("VIC", na=False)]

# Missing values are NaN, not the text "NaN", so test for them directly
# instead of searching the strings for those three letters.
victims_no_nans = victims[victims['Sex'].notna()]
# Drop Sex codes containing "U" (presumably "unknown" - confirm); na=True
# makes any non-string leftover count as a match so it is dropped as well.
male_female_victims_kcmo = victims_no_nans[~victims_no_nans['Sex'].str.contains("U", na=True)]

# Age < 91 keeps ages up to 90; rows with a missing Age fail the comparison
# and are dropped too.
kc_crime_real_ages = male_female_victims_kcmo[male_female_victims_kcmo['Age'] < 91]

### Copy kc_crime_real_ages and separate "Location" into 3 columns

In [8]:
# Take an independent (deep) copy to work on, and show its first row.
kc_crime_real_ages_copy = kc_crime_real_ages.copy(deep=True)
kc_crime_real_ages_copy.iloc[:1]

Unnamed: 0,Reported_Date,Reported_Time,Description,City,zip_code,Invl_No,Involvement,Race,Sex,Age,Firearm Used Flag,Location
0,2017-05-28,3,Stolen Property OFFE,KANSAS CITY,64111,1,VIC,W,F,29.0,N,"4000 MILL ST\nKANSAS CITY 64111\n(39.053635, -..."


### Split out geo data

In [9]:
# Trim the final character of each Location string, then break what is left
# into one column per newline-separated part.
location_only = (
    kc_crime_real_ages_copy['Location']
    .str.slice(0, -1)
    .str.split('\n', expand=True)
)
location_only.columns = ["address", "city_zip", "geo"]
location_only.iloc[:1]

Unnamed: 0,address,city_zip,geo
0,4000 MILL ST,KANSAS CITY 64111,"(39.053635, -94.595998"


### Parsing out latitude and longitude

In [10]:
# Remove the opening parenthesis from the coordinate text. lstrip('(') is
# used instead of a positional slice so that re-running this cell cannot eat
# into the latitude digits: the column is overwritten in place, and a second
# positional slice would strip one more character each time.
location_only['geo'] = location_only['geo'].str.lstrip('(')
# Split "lat, lon" into two columns. The values remain strings here, and rows
# without coordinates stay missing.
geo_split = location_only['geo'].str.split(', ', expand=True)
geo_split.columns = ("Latitude", "Longitude")
geo_split.head(1)

Unnamed: 0,Latitude,Longitude
0,39.053635,-94.595998


In [11]:
# Non-null count per column of the filtered frame.
kc_crime_real_ages_copy.count()

Reported_Date        43032
Reported_Time        43032
Description          43032
City                 43032
zip_code             43032
Invl_No              43032
Involvement          43032
Race                 43032
Sex                  43032
Age                  43032
Firearm Used Flag    43032
Location             43032
dtype: int64

In [12]:
# Non-null count of the parsed coordinates, for comparison with the frame's row count.
geo_split.count()

Latitude     32283
Longitude    32283
dtype: int64

In [13]:
# Place the parsed coordinate columns alongside the filtered records,
# aligned on the shared row index.
kcmo_crime_with_nans = pd.concat(
    objs=[kc_crime_real_ages_copy, geo_split],
    axis='columns',
)
kcmo_crime_with_nans.iloc[:1]

Unnamed: 0,Reported_Date,Reported_Time,Description,City,zip_code,Invl_No,Involvement,Race,Sex,Age,Firearm Used Flag,Location,Latitude,Longitude
0,2017-05-28,3,Stolen Property OFFE,KANSAS CITY,64111,1,VIC,W,F,29.0,N,"4000 MILL ST\nKANSAS CITY 64111\n(39.053635, -...",39.053635,-94.595998


In [14]:
# Non-null count per column after attaching Latitude/Longitude.
kcmo_crime_with_nans.count()

Reported_Date        43032
Reported_Time        43032
Description          43032
City                 43032
zip_code             43032
Invl_No              43032
Involvement          43032
Race                 43032
Sex                  43032
Age                  43032
Firearm Used Flag    43032
Location             43032
Latitude             32283
Longitude            32283
dtype: int64

In [15]:
# Drop rows with no coordinates. Missing values are NaN/None, not the text
# "NaN", so they are tested for directly instead of searching the strings.
kcmo_crime_no_lat_nans = kcmo_crime_with_nans[kcmo_crime_with_nans['Latitude'].notna()]
kcmo_crime_no_nans = kcmo_crime_no_lat_nans[kcmo_crime_no_lat_nans['Longitude'].notna()]
# Remove the columns that were only needed for filtering and parsing.
kc_crime_close = kcmo_crime_no_nans.drop(['City', 'Invl_No', 'Involvement', 'Location'], axis=1)
kc_crime_close.head(1)

Unnamed: 0,Reported_Date,Reported_Time,Description,zip_code,Race,Sex,Age,Firearm Used Flag,Latitude,Longitude
0,2017-05-28,3,Stolen Property OFFE,64111,W,F,29.0,N,39.053635,-94.595998


In [16]:
# Relabel by name rather than by position, so a change in upstream column
# order cannot silently attach a label to the wrong data. Race, Sex, Age,
# Latitude and Longitude keep their existing names.
kc_crime_close = kc_crime_close.rename(columns={
    'Reported_Date': 'Date',
    'Reported_Time': 'Hour',
    'Description': 'Crime',
    'zip_code': 'Zip',
    'Firearm Used Flag': 'Firearm',
})
# Renumber rows 0..n-1; the old index still has gaps from the earlier filters.
kc_crime_close = kc_crime_close.reset_index(drop=True)
kc_crime_close.head()

Unnamed: 0,Date,Hour,Crime,Zip,Race,Sex,Age,Firearm,Latitude,Longitude
0,2017-05-28,3,Stolen Property OFFE,64111,W,F,29.0,N,39.053635,-94.595998
1,2017-11-21,13,HOMICIDE/Non Neglige,64105,B,F,69.0,Y,39.10091,-94.577328
2,2017-08-02,13,Auto Theft,64119,W,F,31.0,N,39.17744,-94.572069
3,2017-08-24,16,Intimidation,64130,W,F,19.0,N,39.033505,-94.547812
4,2017-03-15,14,Auto Theft,64157,W,F,62.0,N,39.235881,-94.466171


In [17]:
# Non-null count per column; equal counts mean no column has missing values.
kc_crime_close.count()

Date         32283
Hour         32283
Crime        32283
Zip          32283
Race         32283
Sex          32283
Age          32283
Firearm      32283
Latitude     32283
Longitude    32283
dtype: int64

### Confirming Clean Data in Excel

In [58]:
# Write kc_crime_close to kc_crime_close.csv in the working directory,
# without the index column, so it can be inspected outside the notebook.
# NOTE(review): this cell's execution count (58) is out of sequence with its
# neighbours - confirm the file on disk holds the frame as it is at this point.
kc_crime_close.to_csv('kc_crime_close.csv', index=False)

**Confirm the data above is good before proceeding.**

### Apply get_dummies function

In [24]:
# One-hot encode every feature column in one pass instead of eight
# copy-pasted stanzas. Each column name doubles as the prefix of its
# indicator columns, and the list order is the order in which the indicator
# groups are appended to the frame.
# NOTE: this cell rebinds kc_crime_close without the source columns, so the
# preceding cells must be re-run before this one can be executed again.
categorical_columns = ['Crime', 'Zip', 'Race', 'Sex', 'Age', 'Date', 'Hour', 'Firearm']

# Build all indicator frames from the un-encoded frame first.
dummy_frames = [
    pd.get_dummies(kc_crime_close[column], prefix=column)
    for column in categorical_columns
]

# Keep the per-feature indicator frames available under their usual names.
(crime_columns, zip_columns, race_columns, sex_columns,
 age_columns, date_columns, hour_columns, firearm_columns) = dummy_frames

# Replace the source columns with their indicators in a single concat; the
# columns that are not encoded (Latitude, Longitude) stay at the front.
kc_crime_close = pd.concat(
    [kc_crime_close.drop(categorical_columns, axis=1)] + dummy_frames,
    axis=1,
)

In [25]:
# Preview the first rows of the encoded frame.
kc_crime_close.head()

Unnamed: 0,Latitude,Longitude,Crime_Agg Assault - Domest,Crime_Agg Assault - Drive-,Crime_Aggravated Assault,Crime_Aggravated Assault (,Crime_Armed Robbery,Crime_Arson,Crime_Arson with Fire Bomb,Crime_Attempt Suicide by C,...,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Firearm_N,Firearm_Y
0,39.053635,-94.595998,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,39.10091,-94.577328,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,39.17744,-94.572069,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,39.033505,-94.547812,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
4,39.235881,-94.466171,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
