In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

Obtain data from file "Crime_data_from_2010_to_present.csv"

In [3]:
# Raw crime records; the relative path is resolved against the notebook's
# working directory.
CRIME_CSV_PATH = "Crime_data_from_2010_to_present.csv"
crime = pd.read_csv(CRIME_CSV_PATH)

In [4]:
# Preview the first five rows (five is head()'s default row count).
crime.head()

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Weapon Description,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location
0,809,01/17/2015,01/16/2015,2130,9,Van Nuys,938,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,6200 FULTON AV,,"(34.1814, -118.4263)"
1,141801696,01/01/2015,01/01/2015,230,18,Southeast,1823,626,INTIMATE PARTNER - SIMPLE ASSAULT,2000 0444 0429 0416 1276,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,626.0,,,,200 E 97TH ST,,"(33.9483, -118.2717)"
2,141816266,11/05/2015,10/01/2015,800,12,77th Street,1249,805,PIMPING,1402 0908 0913,...,,AA,Adult Arrest,805.0,998.0,,,FIGUEROA ST,65TH ST,"(33.9807, -118.2827)"
3,150100503,01/01/2015,01/01/2015,40,1,Central,111,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,...,,IC,Invest Cont,745.0,998.0,,,HILL ST,CESAR E CHAVEZ,"(34.0591, -118.2412)"
4,150100506,01/01/2015,01/01/2015,240,1,Central,162,626,INTIMATE PARTNER - SIMPLE ASSAULT,2000 1243 0416 1251 0400 0444,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,626.0,,,,500 W 7TH ST,,"(34.0467, -118.2556)"


Drop the columns that we do not need for our analysis

In [5]:

# Columns that are not used as the target or as features further down.
# NOTE(review): 'Location ' is written with a trailing space; presumably this
# matches the CSV header exactly -- confirm before "correcting" it.
unused_columns = [
    'DR Number', 'Date Reported', 'Area ID', 'Reporting District',
    'Crime Code', 'Premise Code', 'Premise Description', 'Weapon Used Code',
    'MO Codes', 'Weapon Description', 'Status Code', 'Status Description',
    'Crime Code 1', 'Crime Code 2', 'Crime Code 3', 'Crime Code 4',
    'Cross Street', 'Address', 'Location ',
]
crime = crime.drop(columns=unused_columns)
# shape recorded by the author after this step: (1910587, 7)

Move target column to the front

In [6]:

# Put the prediction target first, followed by the six feature columns.
# reindex keeps only the columns named here, in this order.
ordered_columns = [
    'Crime Code Description',  # target
    'Date Occurred',
    'Time Occurred',
    'Area Name',
    'Victim Age',
    'Victim Sex',
    'Victim Descent',
]
crime = crime.reindex(columns=ordered_columns)

Define a helper that collapses a date string to its four-digit year (2010-2019)

In [9]:

def combine_year(string):
    """Collapse a date string to the year it mentions.

    The years 2010 through 2019 are tried in ascending order and the first
    one found as a substring of ``string`` is returned as a four-character
    string. If none of them occurs, ``string`` is returned unchanged.
    """
    for year in range(2010, 2020):
        year_text = str(year)
        if year_text in string:
            return year_text
    return string

apply the function we created above to Date Occurred

In [3]:
# Replace every 'Date Occurred' entry with the value combine_year returns for it.
# Needs both `crime` and `combine_year` to exist already: the output recorded
# for this cell is a NameError because `crime` was not defined when it ran.
crime['Date Occurred'] = crime['Date Occurred'].map(combine_year)

NameError: name 'crime' is not defined

Extract the unique values of each column

In [11]:

# Distinct values per column. `types` is used again later, where its length
# becomes `crime_types`; `ages` is collected but not printed.
types, dates, areas, ages, genders, descents = (
    crime[column].unique()
    for column in (
        'Crime Code Description',
        'Date Occurred',
        'Area Name',
        'Victim Age',
        'Victim Sex',
        'Victim Descent',
    )
)
print(
    'Crimes types:\n', types,
    '\nDate Occurred:\n', dates,
    '\nAreas:\n', areas,
    '\nGenders:\n', genders,
    '\nDescents:\n', descents,
)

Crimes types:
 ['VEHICLE - STOLEN' 'INTIMATE PARTNER - SIMPLE ASSAULT' 'PIMPING'
 'VANDALISM - MISDEAMEANOR ($399 OR UNDER)'
 'INTIMATE PARTNER - AGGRAVATED ASSAULT'
 'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)'
 'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD'
 'SHOPLIFTING - PETTY THEFT ($950 & UNDER)' 'BATTERY WITH SEXUAL CONTACT'
 'RAPE, FORCIBLE' 'ORAL COPULATION'
 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT' 'ROBBERY'
 'BATTERY - SIMPLE ASSAULT' 'KIDNAPPING' 'CHILD NEGLECT (SEE 300 W.I.C.)'
 'CRIMINAL THREATS - NO WEAPON DISPLAYED' 'BRANDISH WEAPON'
 'PURSE SNATCHING' 'BURGLARY' 'BURGLARY FROM VEHICLE, ATTEMPTED'
 'OTHER ASSAULT' 'SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH'
 'BURGLARY FROM VEHICLE' 'BURGLARY, ATTEMPTED' 'TRESPASSING'
 'ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER' 'CONSPIRACY'
 'BATTERY POLICE (SIMPLE)' 'INDECENT EXPOSURE' 'OTHER MISCELLANEOUS CRIME'
 'BIKE - STOLEN' 'SHOPLIFTING-GRAND THEFT ($950.01 & OVER)'
 'THEFT PLAIN 

Drop the rows with missing or unnecessary information

In [12]:

# 1) Remove every row that has a missing value in any remaining column.
crime.dropna(inplace=True)

# 2) Remove rows whose 'Victim Sex' is one of the codes X, H, - or N.
excluded_sex = crime['Victim Sex'].isin(['X', 'H', '-', 'N'])
crime.drop(index=crime[excluded_sex].index, inplace=True)

# 3) Remove rows whose 'Victim Descent' is the '-' placeholder.
excluded_descent = crime['Victim Descent'] == '-'
crime.drop(index=crime[excluded_descent].index, inplace=True)
# shape recorded by the author after this step: (1340175, 7)

In [13]:
# Show the first remaining record as a sanity check after cleaning.
first_record = crime.iloc[0]
print(first_record)

Crime Code Description    INTIMATE PARTNER - SIMPLE ASSAULT
Date Occurred                                          2015
Time Occurred                                           230
Area Name                                         Southeast
Victim Age                                               40
Victim Sex                                                F
Victim Descent                                            B
Name: 1, dtype: object


convert each string into a unique integer by using the sklearn library

In [14]:

# Encode every string column as integer codes.
#
# Two changes compared with reusing one encoder and assigning through iloc:
#  * Each column gets its own LabelEncoder, kept in `label_encoders`, so the
#    code -> original string mapping stays available afterwards
#    (e.g. label_encoders['Area Name'].inverse_transform(...)).
#  * Columns are assigned by name. `crime.iloc[:, i] = values` writes into the
#    existing column in place on current pandas versions, which can leave the
#    column with object dtype; assigning by name replaces the column so it
#    takes the encoder's integer dtype.
# The columns are the same four as before: position 0 is
# 'Crime Code Description' and position -1 is 'Victim Descent'.
label_encoders = {}
for column in ['Victim Sex', 'Area Name', 'Crime Code Description', 'Victim Descent']:
    encoder = preprocessing.LabelEncoder()
    crime[column] = encoder.fit_transform(crime[column])
    label_encoders[column] = encoder

# Backward-compatible alias: `le` used to end up fitted on the last encoded
# column ('Victim Descent').
le = label_encoders['Victim Descent']
print(crime.head(5))

   Crime Code Description Date Occurred  Time Occurred  Area Name  Victim Age  \
1                      73          2015            230         14        40.0   
2                      90          2015            800          0        24.0   
3                     130          2015             40          1         0.0   
4                      73          2015            240          1        28.0   
5                      72          2015           1100          1        53.0   

   Victim Sex  Victim Descent  
1           0               1  
2           0               1  
3           1               6  
4           0              16  
5           0               1  


Keep only the records from the years 2015-2017 and convert the columns to arrays.
Store the second through last columns of the table in X (`crime_X`) and the first column in y (`crime_y`).

In [15]:
# Training set
# 'Date Occurred' is compared against year strings, so these are string
# (lexicographic) comparisons: '2015' <= year < '2018'.
year_occurred = crime['Date Occurred']
in_training_years = (year_occurred >= '2015') & (year_occurred < '2018')
crime_before_2018 = crime.loc[in_training_years]  # author-recorded shape: (429720, 7)

# print(crime_before_2018.shape)
# Column 0 is the target; every other column is a feature. np.array makes a
# fresh array from the frame.
crime_X = np.array(crime_before_2018.iloc[:, 1:])  # author-recorded shape: (429720, 6)
crime_y = np.array(crime_before_2018.iloc[:, 0])  # author-recorded shape: (429720,)

Extract the data for 2018 and convert it to arrays. Store the second through last columns of the table in `crime_2018_X`, and the first column in `crime_2018_y`.

In [16]:
# Testing set

# Rows whose 'Date Occurred' is exactly the string '2018'.
is_2018 = crime['Date Occurred'] == '2018'
crime_2018 = crime.loc[is_2018]  # author-recorded shape: (3461, 7)

# print(crime_2018.shape)
# Same layout as the training arrays: column 0 is the target, the rest are features.
crime_2018_X = np.array(crime_2018.iloc[:, 1:])  # author-recorded shape: (3461, 6)
crime_2018_y = np.array(crime_2018.iloc[:, 0])  # author-recorded shape: (3461,)

split arrays to train and test subsets

In [17]:
# Number of distinct crime descriptions; used below as n_neighbors / n_clusters.
# NOTE(review): `types` was collected in the cell that precedes the row
# clean-up, so this count may include descriptions that no longer occur in
# the cleaned data -- confirm this is intended.
crime_types = len(types)

# Hold out 20% of the 2015-2017 rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    crime_X,
    crime_y,
    test_size=0.2,
    random_state=42,
)

Create a KNeighborsClassifier, stored in `KNN_clf`, that uses as many neighbors as there are crime types.
Fit the model using X as training data and y as target values

In [None]:
#KNN

# The neighbor count is set to the number of distinct crime descriptions.
KNN_clf = KNeighborsClassifier(n_neighbors=crime_types)
KNN_clf.fit(X=X_train, y=y_train)

Calculate the prediction and accuracy 

In [17]:
# Predict on the training split, the held-out split and the 2018 rows.
train_pred_knn, test_pred_knn, knn_pred_2018 = (
    KNN_clf.predict(features)
    for features in (X_train, X_test, crime_2018_X)
)

# Accuracies are expressed as percentages.
knn_train_acc = 100 * accuracy_score(y_train, train_pred_knn)
knn_test_acc = 100 * accuracy_score(y_test, test_pred_knn)

# Share of 2018 rows whose predicted label equals the recorded one.
knn_hits_2018 = knn_pred_2018 == crime_2018_y
knn_acc = knn_hits_2018.sum() / len(crime_2018_y) * 100

print('Train Accuracy = %.2f' % knn_train_acc)
print('Test Accuracy = %.2f' % knn_test_acc)

print('Prediction Accurary for 2018 = %.2f' % knn_acc)

Train Accuracy = 19.14
Test Accuracy = 17.55
Prediction Accurary for 2018 = 15.02


Use the K-means algorithm and fit the model on the training data. K-means is an unsupervised method, so the `y` values passed to `fit` are not used for training.

In [None]:
#K-Means

# One cluster per distinct crime description; random_state fixes the
# centroid initialisation.
# NOTE(review): KMeans is unsupervised -- scikit-learn accepts the `y`
# argument of fit() only for API consistency and does not use it.
kmeans_clf = KMeans(n_clusters=crime_types, random_state=0)
kmeans_clf.fit(X=X_train, y=y_train)

Calculate the prediction and accuracy 

In [None]:
# KMeans.predict returns cluster indices, which are arbitrary integers and not
# crime-type labels, so they cannot be scored against y directly. Each cluster
# is therefore translated to the most frequent training label among its
# members, and the translated labels are what gets scored.
train_clusters = kmeans_clf.predict(X_train)
test_clusters = kmeans_clf.predict(X_test)
# Fix: this line used `Km_clf`, a name that is never defined (NameError).
clusters_2018 = kmeans_clf.predict(crime_2018_X)

# Lookup table cluster index -> label. It starts out filled with the overall
# majority training label, which stays in place for any cluster that received
# no training rows.
all_labels, all_counts = np.unique(y_train, return_counts=True)
cluster_to_label = np.full(
    kmeans_clf.n_clusters,
    all_labels[np.argmax(all_counts)],
    dtype=all_labels.dtype,
)
for cluster_id in np.unique(train_clusters):
    member_labels, member_counts = np.unique(
        y_train[train_clusters == cluster_id], return_counts=True
    )
    cluster_to_label[cluster_id] = member_labels[np.argmax(member_counts)]

train_pred_km = cluster_to_label[train_clusters]
test_pred_km = cluster_to_label[test_clusters]
km_pred_2018 = cluster_to_label[clusters_2018]

# Accuracies are expressed as percentages.
km_train_acc = accuracy_score(y_train, train_pred_km) * 100
km_test_acc = accuracy_score(y_test, test_pred_km) * 100

km_acc = (km_pred_2018 == crime_2018_y).sum() / crime_2018_y.shape[0] * 100

print('Train Accuracy = %.2f' %km_train_acc)
print('Test Accuracy = %.2f' %km_test_acc)

print('Prediction Accurary for 2018 = %.2f' %km_acc)

Use Linear Discriminant method to analyze our data
Fit the model using X as training data and y as target values

In [18]:
# Linear Discriminant Analysis with the library's default settings, fitted on
# the training split.
lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X=X_train, y=y_train)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

Calculate the prediction and accuracy 

In [19]:
# Predict on the training split, the held-out split and the 2018 rows.
train_pred_lda, test_pred_lda, lda_pred_2018 = (
    lda_clf.predict(features)
    for features in (X_train, X_test, crime_2018_X)
)

# Accuracies are expressed as percentages.
lda_train_acc = 100 * accuracy_score(y_train, train_pred_lda)
lda_test_acc = 100 * accuracy_score(y_test, test_pred_lda)

# Share of 2018 rows whose predicted label equals the recorded one.
lda_hits_2018 = lda_pred_2018 == crime_2018_y
lda_acc = lda_hits_2018.sum() / len(crime_2018_y) * 100

print('Train Accuracy = %.2f' % lda_train_acc)
print('Test Accuracy = %.2f' % lda_test_acc)

print('Prediction Accurary for 2018 = %.2f' % lda_acc)

Train Accuracy = 13.36
Test Accuracy = 13.36
Prediction Accurary for 2018 = 10.52
