# Machine Learning Project
# Kansas City Crime Data Deep Dive

### Import Dependencies

In [1]:
import warnings
warnings.simplefilter('ignore')

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(1)
import pandas as pd
from pandas import datetime
from dateutil import parser
from scipy import stats
import glob
import os

### Obtain KCMO crime data (Raw data came from data.kcmo.org)

In [2]:
# Paths to the yearly KCPD exports inside the Resources folder, newest first.
(kcmo_crime_2017, kcmo_crime_2016, kcmo_crime_2015,
 kcmo_crime_2014, kcmo_crime_2013, kcmo_crime_2012,
 kcmo_crime_2011, kcmo_crime_2010, kcmo_crime_2009) = [
    os.path.join('Resources', csv_name)
    for csv_name in (
        'KCPD_Crime_Data_2017.csv',
        'KCPD_Crime_Data_2016.csv',
        'KCPD_Crime_Data_2015.csv',
        'KCPD_Crime_Data_2014.csv',
        'KCPD_Crime_Data_2013.csv',
        'KCPD_Crime_Data_2012.csv',
        'KCPD_Crime_Data_2011.csv',
        'KCPD_Crime_Data_2010.csv',
        'KCPD_Crime_Data_2009.csv',
    )
]

# One DataFrame per year, read in the same 2017 -> 2009 order as the paths.
(crime_2017, crime_2016, crime_2015,
 crime_2014, crime_2013, crime_2012,
 crime_2011, crime_2010, crime_2009) = map(
    pd.read_csv,
    [
        kcmo_crime_2017, kcmo_crime_2016, kcmo_crime_2015,
        kcmo_crime_2014, kcmo_crime_2013, kcmo_crime_2012,
        kcmo_crime_2011, kcmo_crime_2010, kcmo_crime_2009,
    ],
)

In [3]:
# Read every CSV in the Resources folder and stack them into a single frame.
# The folder is addressed relative to the notebook's working directory (the same
# 'Resources' location the per-year reads use) instead of a machine-specific
# absolute path, so the notebook runs on any checkout.
path = 'Resources'

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of `combined` reproducible between runs and machines.
all_files = sorted(glob.glob(os.path.join(path, '*.csv')))
if not all_files:
    # Fail with a clear message rather than pandas' "No objects to concatenate".
    raise FileNotFoundError('No CSV files found in {!r}'.format(os.path.abspath(path)))

# Lazily read each file; pd.concat consumes the generator one frame at a time.
next_df = (pd.read_csv(f) for f in all_files)

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
combined = pd.concat(next_df, ignore_index=True)
combined.head()

Unnamed: 0,Address,Age,Area,Beat,City,DVFlag,Description,Firearm Used Flag,Firearm Used Flag.1,From Time,...,Rep_Dist,Report_No,Reported Time,Reported_Date,Reported_Time,Sex,To Time,To_Date,To_Time,Zip Code
0,PROSPECT AV/VICTOR ST,28.0,EPD,333,KANSAS CITY,U,Possession of Drug E,,N,22:56,...,PJ2327,70059279,3:24,10/06/2009,,F,,10/05/2009 12:00:00 AM,23:10,64128.0
1,1100 LOCUST ST,27.0,CPD,112,KANSAS CITY,U,Justifiable Homicide,,Y,12:00,...,PJ1029,80005443,11:45,02/05/2009,,M,,,,64106.0
2,100 WHEELING AV,22.0,EPD,315,KANSAS CITY,U,Possession/Sale/Dist,,N,21:15,...,PJ0371,80019629,22:50,06/18/2009,,M,,,,64129.0
3,E 44 ST/INDIANA AV,,EPD,334,KANSAS CITY,U,Stolen Property OFFE,,N,18:44,...,PJ3525,70060962,18:44,01/28/2009,,U,,,,64130.0
4,1100 LOCUST ST,28.0,CPD,112,KANSAS CITY,U,Justifiable Homicide,,Y,12:00,...,PJ1029,80005443,11:45,02/05/2009,,M,,,,64106.0


In [4]:
# Show the (rows, columns) size of the concatenated frame.
combined.shape

(1160590, 31)

In [5]:
# Relabel 'Zip Code' as 'zip_code' in place; later cells read it as the attribute `.zip_code`.
combined.rename({'Zip Code': 'zip_code'}, axis='columns', inplace=True)

### Combine CSV Files

In [6]:
# New frame without the twelve listed columns; `combined` itself is left unchanged.
combined_a = combined.drop(
    columns=[
        'Report_No', 'To_Date', 'From_Date', 'To_Time', 'From_Time', 'Offense',
        'IBRS', 'Rep_Dist', 'Area', 'Beat', 'DVFlag', 'Address',
    ]
)

In [7]:
# Show the (rows, columns) size after the column drop.
combined_a.shape

(1160590, 19)

In [12]:
# Filter combined_a down to male/female victims in Kansas City aged 90 or under.
# Every intermediate frame is built in this cell so it runs on a fresh kernel;
# previously `male_female_victims_kcmo` only existed as leftover kernel state.

# Drop rows whose ZIP is the 99999 placeholder value.
kc_crime_clean_zips = combined_a[combined_a.zip_code != 99999]

# Keep rows whose City contains "KANSAS CITY"; missing cities count as no match.
only_kc_crime = kc_crime_clean_zips[kc_crime_clean_zips.City.str.contains("KANSAS CITY", na=False)]

# Keep rows whose Involvement contains "VIC"; missing values count as no match.
victims = only_kc_crime[only_kc_crime.Involvement.str.contains("VIC", na=False)]

# Remove rows where Sex is missing or contains "NaN" (na=True marks missing as a hit to exclude).
victims_no_nans = victims[~victims.Sex.str.contains("NaN", na=True)]

# Remove rows where Sex contains "U".
male_female_victims_kcmo = victims_no_nans[~victims_no_nans.Sex.str.contains("U", na=True)]

# Keep ages strictly below 91; rows with a missing Age fail the comparison and drop out.
kc_crime_real_ages_V2 = male_female_victims_kcmo[male_female_victims_kcmo['Age'] < 91]

In [13]:
# Show the (rows, columns) size of the filtered victim frame.
kc_crime_real_ages_V2.shape

(355200, 19)

In [14]:
# Write the filtered victim frame to a CSV in the working directory, without the index column.
kc_crime_real_ages_V2.to_csv('kc_crime_for_visualizations_V2.csv', index=False)

In [None]:
# Read every path in `filenames` and stack the resulting frames.
# NOTE(review): `filenames` is not defined in any visible cell and this cell has
# no execution count — it looks like a leftover draft. Confirm before running.
combined_csv = pd.concat( [ pd.read_csv(f) for f in filenames ] )

In [None]:
# Read every path in `filenames` and stack the resulting frames.
# NOTE(review): `filenames` is not defined in any visible cell and this cell has
# no execution count — it looks like a leftover draft. Confirm before running.
combined_csv = pd.concat( [ pd.read_csv(c) for c in filenames ] )

### Change time to datetime and extract hour

In [None]:
# Parse the Reported_Time values into datetimes and store them in a new Reported_Hour column.
# NOTE(review): `kc_crime` is not created in any visible cell — confirm where it comes from.
kc_crime['Reported_Hour'] = pd.to_datetime(kc_crime['Reported_Time'])

In [None]:
# Round each Reported_Hour timestamp down to the start of its hour.
kc_crime["Reported_Hour"] = kc_crime["Reported_Hour"].dt.floor('h')

In [None]:
# Store the hour component of Reported_Hour in a column named 'Test'.
# NOTE(review): presumably this is the column that ends up labelled "Hour" after the
# positional rename later in the notebook — verify the column order there.
kc_crime['Test'] = kc_crime['Reported_Hour'].dt.hour

In [None]:
# Preview the first rows to check the new time columns.
kc_crime.head()

### Rename & drop columns

In [None]:
# Relabel 'Zip Code' as 'zip_code' directly on kc_crime.
kc_crime.rename({'Zip Code': 'zip_code'}, axis='columns', inplace=True)

# New frame without the twelve listed columns, including both time helper columns.
kc_crime_dropped_columns = kc_crime.drop(
    columns=[
        'Reported_Hour', 'Reported_Time', 'To_Date', 'From_Date', 'To_Time', 'From_Time',
        'Offense', 'IBRS', 'Rep_Dist', 'Area', 'Beat', 'Address',
    ]
)

### Filter for male and female victims age 90 and under in KCMO

In [None]:
# Discard rows carrying the 99999 ZIP value.
kc_crime_clean_zips = kc_crime_dropped_columns.loc[
    kc_crime_dropped_columns.zip_code.ne(99999)
]

# Keep only rows whose City text contains "KANSAS CITY".
only_kc_crime = kc_crime_clean_zips.loc[
    kc_crime_clean_zips.City.str.contains("KANSAS CITY").eq(True)
]

# Keep only rows whose Involvement text contains "VIC".
victims = only_kc_crime.loc[
    only_kc_crime.Involvement.str.contains("VIC").eq(True)
]

# Comparing against False also removes rows where Sex is missing, since a
# missing match result is not equal to False.
victims_no_nans = victims.loc[
    victims.Sex.str.contains("NaN").eq(False)
]

# Remove rows whose Sex text contains "U".
male_female_victims_kcmo = victims_no_nans.loc[
    victims_no_nans.Sex.str.contains("U").eq(False)
]

# Ages strictly below 91, i.e. 90 and under.
kc_crime_real_ages = male_female_victims_kcmo.loc[
    male_female_victims_kcmo['Age'].lt(91)
]

### Copy kc_crime_real_ages and separate "Location" into 3 columns

In [None]:
# Take a deep copy of the filtered frame and preview its first row.
kc_crime_real_ages_copy = kc_crime_real_ages.copy(deep=True)
kc_crime_real_ages_copy.head(n=1)

# Split out geo data

In [None]:
# Drop the last character of each Location value, then split on newlines
# into three columns.
location_only = (
    kc_crime_real_ages_copy['Location']
    .str.slice(stop=-1)
    .str.split('\n', expand=True)
)
location_only.columns = ["address", "city_zip", "geo"]
location_only.head(n=1)

### Parsing out latitude and longitude

In [None]:
# Remove the first character of the geo text, then split it on ", " into two columns.
# Note: the first statement overwrites location_only['geo'], so re-running this
# cell strips one more leading character each time.
location_only['geo'] = location_only['geo'].str.slice(start=1)
geo_split = location_only['geo'].str.split(', ', expand=True)
geo_split.columns = ["Latitude", "Longitude"]
geo_split.head(n=1)

In [None]:
# Non-null value count for each column of the victim frame.
kc_crime_real_ages_copy.count()

In [None]:
# Non-null value count for the Latitude and Longitude columns.
geo_split.count()

In [None]:
# Place the Latitude/Longitude columns next to the victim records (aligned on the index).
kcmo_crime_with_nans = pd.concat(
    [kc_crime_real_ages_copy, geo_split],
    axis='columns',
)
kcmo_crime_with_nans.head(n=1)

In [None]:
# Non-null value count per column after attaching the coordinates.
kcmo_crime_with_nans.count()

In [None]:
# Keep rows whose Latitude text does not contain "NaN"; comparing with False
# also removes rows where Latitude is missing.
kcmo_crime_no_lat_nans = kcmo_crime_with_nans.loc[
    kcmo_crime_with_nans.Latitude.str.contains("NaN").eq(False)
]

# Same filter on Longitude.
kcmo_crime_no_nans = kcmo_crime_no_lat_nans.loc[
    kcmo_crime_no_lat_nans.Longitude.str.contains("NaN").eq(False)
]

# Drop the six columns not carried into the final frame.
kc_crime_close = kcmo_crime_no_nans.drop(
    columns=['City', 'DVFlag', 'Invl_No', 'Involvement', 'Firearm Used Flag', 'Location']
)
kc_crime_close.head(n=1)

In [None]:
# Relabel the ten remaining columns by position, then renumber the rows 0..n-1.
# NOTE(review): a positional rename depends on the existing column order — verify
# each label lands on the intended column.
kc_crime_close.columns = [
    "Report", "Date", "Crime", "Zip", "Race",
    "Sex", "Age", "Hour", "Latitude", "Longitude",
]
kc_crime_close = kc_crime_close.reset_index(drop=True)
kc_crime_close.head(n=5)

In [None]:
# Non-null value count per column of the cleaned frame.
kc_crime_close.count()

In [None]:
# plt.scatter(kc_crime_close.Age, kc_crime_close.Zip)
# plt.show()

In [None]:
# plt.scatter(kc_crime_close.Longitude, kc_crime_close.Latitude, c=kc_crime_close.Age, 
#            cmap='viridis_r', alpha = 0.8, s=2)
# plt.colorbar()
# plt.show()

In [None]:
# Average victim age per crime type, drawn as one bar per crime.
#
# The whole figure is built in a single cell: with the inline backend each cell
# renders its own figure, so limits, title and labels set in later cells would
# land on a new, empty figure instead of this chart.

# Aggregate first so each crime type contributes exactly one bar (its mean age)
# instead of one overlapping bar per record.
avg_age_by_crime = kc_crime_close.groupby('Crime')['Age'].mean()

# One x position per crime type; ticks sit at the centre of each bar
# (bars are edge-aligned with the default width of 0.8).
x_axis = np.arange(len(avg_age_by_crime))
tick_locations = [value+0.4 for value in x_axis]

plt.figure(figsize=(20,3))
plt.bar(x_axis, avg_age_by_crime.values, color='r', alpha=0.5, align="edge")
plt.xticks(tick_locations, avg_age_by_crime.index, rotation="vertical")

# Set x and y limits
plt.xlim(-0.25, len(x_axis))
plt.ylim(0, avg_age_by_crime.max()+5)

# Set a Title and labels
plt.title("Average Age Crime of Victims")
plt.xlabel("Crime")
plt.ylabel("Age")

# plt.tight_layout()
# plt.savefig("avg_state_rain.png")
plt.show()

### Confirming Clean Data in Excel

In [None]:
# kc_crime_close.to_csv('kc_crime_close.csv', index=False)

# Confirm above data is good before proceeding

### Apply get_dummies function

In [None]:
def _one_hot_encode(frame, column):
    """Replace ``column`` in ``frame`` with its one-hot indicator columns.

    Returns an ``(encoded_frame, dummies)`` pair. ``dummies`` holds the
    indicator columns produced by ``pd.get_dummies`` with the column name as
    prefix; ``encoded_frame`` is ``frame`` with those columns appended on the
    right and the original ``column`` removed. ``frame`` is not modified.
    """
    dummies = pd.get_dummies(frame[column], prefix=column)
    encoded_frame = pd.concat([frame, dummies], axis=1).drop(column, axis=1)
    return encoded_frame, dummies


# Encode each categorical field in turn. The order determines the order of the
# indicator column groups in the final frame.
# Note: the encoded columns are removed from kc_crime_close, so this cell cannot
# be re-run without first rebuilding kc_crime_close.
kc_crime_close, crime_columns = _one_hot_encode(kc_crime_close, 'Crime')
kc_crime_close, zip_columns = _one_hot_encode(kc_crime_close, 'Zip')
kc_crime_close, race_columns = _one_hot_encode(kc_crime_close, 'Race')
kc_crime_close, sex_columns = _one_hot_encode(kc_crime_close, 'Sex')
kc_crime_close, age_columns = _one_hot_encode(kc_crime_close, 'Age')
kc_crime_close, date_columns = _one_hot_encode(kc_crime_close, 'Date')
kc_crime_close, hour_columns = _one_hot_encode(kc_crime_close, 'Hour')

In [None]:
# Preview the first rows of the one-hot encoded frame.
kc_crime_close.head()