In [16]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split

Obtain data from file "Crime_data_from_2010_to_present.csv"

In [17]:
crime = pd.read_csv("Crime_data_from_2010_to_present.csv")
# crime.columns.values

In [18]:
crime.head(5)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Weapon Description,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location
0,809,01/17/2015,01/16/2015,2130,9,Van Nuys,938,510,VEHICLE - STOLEN,,...,,IC,Invest Cont,510.0,,,,6200 FULTON AV,,"(34.1814, -118.4263)"
1,141801696,01/01/2015,01/01/2015,230,18,Southeast,1823,626,INTIMATE PARTNER - SIMPLE ASSAULT,2000 0444 0429 0416 1276,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,626.0,,,,200 E 97TH ST,,"(33.9483, -118.2717)"
2,141816266,11/05/2015,10/01/2015,800,12,77th Street,1249,805,PIMPING,1402 0908 0913,...,,AA,Adult Arrest,805.0,998.0,,,FIGUEROA ST,65TH ST,"(33.9807, -118.2827)"
3,150100503,01/01/2015,01/01/2015,40,1,Central,111,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,...,,IC,Invest Cont,745.0,998.0,,,HILL ST,CESAR E CHAVEZ,"(34.0591, -118.2412)"
4,150100506,01/01/2015,01/01/2015,240,1,Central,162,626,INTIMATE PARTNER - SIMPLE ASSAULT,2000 1243 0416 1251 0400 0444,...,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,626.0,,,,500 W 7TH ST,,"(34.0467, -118.2556)"


Drop out the columns that we do not need for our analysis

In [19]:
crime = crime.drop(columns = ['DR Number', 'Date Reported', 'Area Name', 'Reporting District', 'Crime Code Description', 'Premise Code', 'Premise Description', 'Weapon Used Code', 'MO Codes', 'Weapon Description', 'Status Code', 'Status Description', 'Crime Code 1', 'Crime Code 2', 'Crime Code 3', 'Crime Code 4', 'Cross Street', 'Address', 'Location '])
# shape = (1910587, 7)

Move target column to the front

In [20]:
crime = crime.reindex(['Crime Code', 'Date Occurred', 'Time Occurred', 'Area ID', 'Victim Age', 'Victim Sex', 'Victim Descent'], axis = 'columns')

Standardize 'Date Occurred' and 'Time Occurred' columns

In [21]:
def combine_year(string):
    if '2010' in string:
        string = '2010'
    elif '2011' in string:
        string = '2011'
    elif '2012' in string:
        string = '2012'
    elif '2013' in string:
        string = '2013'
    elif '2014' in string:
        string = '2014'
    elif '2015' in string:
        string = '2015'
    elif '2016' in string:
        string = '2016'
    elif '2017' in string:
        string = '2017'
    elif '2018' in string:
        string = '2018'
    elif '2019' in string:
        string = '2019'
    return int(string)

In [22]:
def hour(time):
    if time < 100:
        return 0
    elif time >= 100:
        return time // 100

extract each individual type out of each category

In [23]:
types = crime['Crime Code'].unique()
times = crime['Time Occurred'].unique()
dates = crime['Date Occurred'].unique()
areas = crime['Area ID'].unique()
ages = crime['Victim Age'].unique()
genders = crime['Victim Sex'].unique()
descents = crime['Victim Descent'].unique()
print('Crimes types:\n', types, '\nTimes:\n', times, '\nDate Occurred:\n', dates,'\nAreas:\n', areas, '\nAges:\n', ages, '\nGenders:\n', genders, '\nDescents:\n', descents)

Crimes types:
 [510 626 805 745 236 740 341 442 860 121 820 230 210 624 910 237 930 761
 351 310 410 625 821 330 320 888 231 944 623 850 946 480 343 440 420 627
 755 220 660 668 763 648 350 421 886 352 813 235 949 951 250 437 753 354
 762 331 943 664 110 649 901 662 956 433 900 647 903 950 434 922 666 932
 810 251 474 122 441 654 438 520 450 670 353 940 812 661 443 815 451 902
 622 928 756 485 439 653 920 924 452 884 444 652 865 890 933 651 870 345
 470 840 880 487 446 471 445 830 954 806 760 948 931 473 942 347 822 882
 435 814 436 113 349 475 921 432 926 952 472 906 453] 
Times:
 [2130  230  800 ...  604 2231  431] 
Date Occurred:
 ['01/16/2015' '01/01/2015' '10/01/2015' ... '10/30/2014' '11/22/2014'
 '12/08/2014'] 
Areas:
 [ 9 18 12  1 11 13 14 15 17 10 16 19 20 21  2  3  4  5  6  7  8] 
Ages:
 [114.  40.  24.   0.  28.  53.  60.  44.  39.  70.   1.  34.  25.  31.
  21.  22.  46.  41.  51.  38.  20.  33.  48.  29.  45.  49.   3.  19.
  18.  30.  59.  43.  63.  42.  50.  27.  17.  47

In [24]:
crime['Date Occurred'] = crime['Date Occurred'].apply(combine_year)
crime['Time Occurred'] = crime['Time Occurred'].apply(hour)

drop out the missing and unnessary information

In [25]:
crime.dropna(inplace = True)
crime.drop(crime[(crime['Victim Sex'] == 'X') | (crime['Victim Sex'] == 'H') | (crime['Victim Sex'] == '-') | (crime['Victim Sex'] == 'N')].index, inplace = True)
crime.drop(crime[(crime['Victim Descent'] == '-')].index, inplace = True)
crime.drop(crime[(crime['Victim Age'] < 0)].index, inplace = True)

print(crime.shape)
# shape = (1339982, 7)

(1339982, 7)


In [26]:
times = crime['Time Occurred'].unique()
dates = crime['Date Occurred'].unique()
areas = crime['Area ID'].unique()
ages = crime['Victim Age'].unique()
genders = crime['Victim Sex'].unique()
descents = crime['Victim Descent'].unique()
print('Times:\n', times, '\nDate Occurred:\n', dates,'\nAreas:\n', areas, '\nAges:\n', ages, '\nGenders:\n', genders, '\nDescents:\n', descents)

Times:
 [ 2  8  0 11 21 17 16 14 18  1  9 23 12 20 15 22  3 19  4  6 13 10  5  7] 
Date Occurred:
 [2015 2010 2013 2014 2016 2017 2018 2019 2012 2011] 
Areas:
 [18 12  1 11 13 14 15 17 10 16 19 20 21  9  2  3  4  5  6  7  8] 
Ages:
 [ 40.  24.   0.  28.  53.  60.  44.  39.  70.  34.  25.  31.  21.  22.
  46.  41.  51.  38.  20.  33.  48.  29.  45.  49.   3.  19.  18.  30.
  59.  43.  63.  42.  50.  27.  17.  47.  58.  23.  54.  15.  37.  10.
  14.  61.  64.  32.   1.  55.  69.  26.  57.  36.  16.  62.  67.  56.
  65.  52.   2.  13.  35.  86.   7.  66.  72.  71.  78.   5.  68.  73.
  76.  83.  82.  75. 114.  91.  11.  87.  85.  77.  74.  80.  90. 112.
  94.   9.  12.  79.  84.  93. 115.   8.   6.  81.  88.  89.  95.   4.
  92.  99. 100.  96.  97.  98. 109. 110. 116. 113. 111.] 
Genders:
 ['F' 'M'] 
Descents:
 ['B' 'H' 'W' 'A' 'O' 'X' 'K' 'C' 'I' 'F' 'J' 'P' 'G' 'Z' 'V' 'S' 'U' 'L'
 'D']


convert each string into a unique integer by using the sklearn library

In [105]:
le = preprocessing.LabelEncoder()
crime['Victim Sex'] = le.fit_transform(crime['Victim Sex'])
# crime['Area ID'] = le.fit_transform(crime['Area ID'])
# crime['Time Occurred'] = le.fit_transform(crime['Time Occurred'])
# crime.iloc[:,0] = le.fit_transform(crime.iloc[:,0])
crime.iloc[:,-1] = le.fit_transform(crime.iloc[:,-1])
print(crime.head(5))

   Crime Code Date Occurred  Time Occurred  Area ID  Victim Age  Victim Sex  \
1         626          2015              2       18        40.0           0   
2         805          2015              8       12        24.0           0   
3         745          2015              0        1         0.0           1   
4         626          2015              2        1        28.0           0   
5         236          2015             11        1        53.0           0   

   Victim Descent  
1               1  
2               1  
3               6  
4              16  
5               1  


In [106]:
#checking columns' datatype

dtypeCount =[crime.iloc[:,i].apply(type).value_counts() for i in range(crime.shape[1])]
print(dtypeCount)

[<class 'int'>    1339982
Name: Crime Code, dtype: int64, <class 'str'>    1339982
Name: Date Occurred, dtype: int64, <class 'int'>    1339982
Name: Time Occurred, dtype: int64, <class 'int'>    1339982
Name: Area ID, dtype: int64, <class 'float'>    1339982
Name: Victim Age, dtype: int64, <class 'int'>    1339982
Name: Victim Sex, dtype: int64, <class 'int'>    1339982
Name: Victim Descent, dtype: int64]


Filter the date that only include the years from 2015-2017 and convert the column to arrays. Store the second row to last row of the table into X and store the first column to Y

In [34]:
# Training set
crime_2015_2018 = crime.loc[(crime['Date Occurred'] >= 2015) & (crime['Date Occurred'] < 2018)]  # shape = ((429720, 7))
crime_2015_2018.pop('Date Occurred')

# print(crime_before_2018.shape)
crime_X = np.array(crime_2015_2018.iloc[:,1:].copy())  # shape = (429720, 5)
crime_y = np.array(crime_2015_2018.iloc[:,0].copy())  # shape = (429720,)

Extract the data for 2018 and convert them to arrary. and second row to last row of the table into X, and store the first row to crime_2018_x

In [35]:
# Testing set

crime_2018 = crime.loc[crime['Date Occurred'] == 2018]  # shape = (3461, 7)
crime_2018.pop('Date Occurred')


# print(crime_2018.shape)
crime_2018_X = np.array(crime_2018.iloc[:,1:].copy())  # shape = (3461, 5)
crime_2018_y = np.array(crime_2018.iloc[:,0].copy())  # shape = (3461,)

split arrays to train and test subsets

In [16]:
X_train, X_test, y_train, y_test = train_test_split(crime_X, crime_y, test_size=0.2, random_state=42)
crime_types = len(types)

Apply the KNeighborsClassifier to store the nearest neighbors to KNN_clf Fit the model using X as training data and y as target values

In [17]:
#KNN

KNN_clf = KNeighborsClassifier(n_neighbors = crime_types)
KNN_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=139, p=2,
           weights='uniform')

Calculate the prediction and accuracy

In [18]:
train_pred_knn = KNN_clf.predict(X_train)
test_pred_knn = KNN_clf.predict(X_test)
knn_pred_2018 = KNN_clf.predict(crime_2018_X)

knn_train_acc = accuracy_score(y_train, train_pred_knn) * 100
knn_test_acc = accuracy_score(y_test, test_pred_knn) * 100

knn_acc = (knn_pred_2018 == crime_2018_y).sum() / crime_2018_y.shape[0] * 100

print('Train Accuracy = %.2f' %knn_train_acc)
print('Test Accuracy = %.2f' %knn_test_acc)

print('Prediction Accurary for 2018 = %.2f' %knn_acc)

Train Accuracy = 18.17
Test Accuracy = 16.54
Prediction Accurary for 2018 = 14.07


Use the K-means algorithm and Fit the model using X as training data and y as target values

In [19]:
#K-Means

kmeans_clf = KMeans(n_clusters = crime_types, random_state=0)
kmeans_clf.fit(X_train, y_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=139, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

Calculate the prediction and accuracy

In [20]:
train_pred_km = kmeans_clf.predict(X_train)
test_pred_km = kmeans_clf.predict(X_test)
km_pred_2018 = kmeans_clf.predict(crime_2018_X)

km_train_acc = accuracy_score(y_train, train_pred_km) * 100
km_test_acc = accuracy_score(y_test, test_pred_km) * 100

km_acc = (km_pred_2018 == crime_2018_y).sum() / crime_2018_y.shape[0] * 100

print('Train Accuracy = %.2f' %km_train_acc)
print('Test Accuracy = %.2f' %km_test_acc)

print('Prediction Accurary for 2018 = %.2f' %km_acc)

Train Accuracy = 0.55
Test Accuracy = 0.56
Prediction Accurary for 2018 = 0.69


Use Linear Discriminant Analysis method to analyze our data Fit the model using X as training data and y as target values

In [21]:
#Linear Discriminant Analysis

lda_clf = LinearDiscriminantAnalysis()
lda_clf.fit(X_train, y_train)



LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

Calculate the prediction and accuracy

In [22]:
train_pred_lda = lda_clf.predict(X_train)
test_pred_lda = lda_clf.predict(X_test)
lda_pred_2018 = lda_clf.predict(crime_2018_X)

lda_train_acc = accuracy_score(y_train, train_pred_lda) * 100
lda_test_acc = accuracy_score(y_test, test_pred_lda) * 100

lda_acc = (lda_pred_2018 == crime_2018_y).sum() / crime_2018_y.shape[0] * 100

print('Train Accuracy = %.2f' %lda_train_acc)
print('Test Accuracy = %.2f' %lda_test_acc)

print('Prediction Accurary for 2018 = %.2f' %lda_acc)

Train Accuracy = 13.30
Test Accuracy = 13.26
Prediction Accurary for 2018 = 13.55
