# Crime data in Chicago (January 1, 2010 to December 30, 2019)

In [108]:
import pandas as pd
# Read the crime records from the CSV export, then show the dtype pandas assigned to each column.
crime_data = pd.read_csv(filepath_or_buffer='2019_data.csv')
crime_data.dtypes

ID                        int64
Case Number              object
Date                     object
Block                    object
IUCR                     object
Primary Type             object
Description              object
Location Description     object
Arrest                     bool
Domestic                   bool
Beat                      int64
District                  int64
Ward                    float64
Community Area            int64
FBI Code                 object
X Coordinate            float64
Y Coordinate            float64
Year                      int64
Updated On               object
Latitude                float64
Longitude               float64
Location                 object
dtype: object

In [109]:
import datetime

# Snapshot of the column names present before any derived feature is added;
# a later cell drops exactly these columns.
cols_to_remove = list(crime_data)

# Separating the date column into month, day, year, and hour columns.
# The whole column is parsed in one vectorized call instead of a row-by-row
# strptime loop. The explicit format keeps parsing strict: a timestamp that
# does not match it still raises.
parsed_dates = pd.to_datetime(crime_data['Date'], format='%m/%d/%Y %I:%M:%S %p')

crime_data['Month'] = parsed_dates.dt.month
crime_data['Day'] = parsed_dates.dt.day
# 'Year' already exists in the raw data (so it is part of cols_to_remove);
# this overwrites it with the year taken from the parsed timestamp.
crime_data['Year'] = parsed_dates.dt.year
crime_data['Hour'] = parsed_dates.dt.hour  # 0-23, from the 12-hour clock plus AM/PM

crime_data[-4:]

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Month,Day,Hour
260461,11938394,JD100460,01/01/2020 12:00:00 AM,040XX W 21ST ST,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,...,1149734.0,1889784.0,2020,01/08/2020 03:49:48 PM,41.853481,-87.72591,"(41.853481339, -87.72590985)",1,1,0
260462,11939851,JD101299,01/01/2020 12:00:00 AM,0000X W Kinzie St,820,THEFT,$500 AND UNDER,BAR OR TAVERN,False,False,...,,,2020,01/08/2020 03:49:48 PM,,,,1,1,0
260463,11938225,JD100068,01/01/2020 12:00:00 AM,071XX S WOODLAWN AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,...,1185445.0,1858029.0,2020,01/08/2020 03:49:48 PM,41.765575,-87.595839,"(41.765575348, -87.595839229)",1,1,0
260464,11938144,JD100020,01/01/2020 12:00:00 AM,013XX S KILBOURN AVE,550,ASSAULT,AGGRAVATED PO: HANDGUN,VACANT LOT/LAND,False,False,...,1146598.0,1893486.0,2020,01/08/2020 03:49:48 PM,41.8637,-87.737326,"(41.863700397, -87.737325884)",1,1,0


In [110]:
# 1-hot encoding for: Arrest, Location Description, Domestic, District, Primary Type, Community Area
discrete_variables = "Arrest, Location Description, Domestic, District, Primary Type, Community Area".split(", ")
print(discrete_variables)

# Build every indicator column first and attach them with a single concat.
# Inserting a few hundred columns one at a time fragments the DataFrame
# (pandas warns about it) and is needlessly slow.
indicator_columns = {}
for variable in discrete_variables:
    # dropna() keeps a missing value from becoming a category of its own; a row
    # with a missing value simply gets 0 in every indicator of that variable.
    for level in crime_data[variable].dropna().unique():
        indicator_columns[variable + "_" + str(level)] = (crime_data[variable] == level).astype('int64')

# Keep only the derived columns: drop everything that was in the raw data,
# then append the indicators in the order their levels first appeared.
crime_data = pd.concat(
    [crime_data.drop(columns=cols_to_remove), pd.DataFrame(indicator_columns, index=crime_data.index)],
    axis=1,
)
# Arrest and Domestic are boolean, so each *_False indicator is the exact
# complement of its *_True counterpart; keep one column per pair.
crime_data = crime_data.drop(columns=['Arrest_False','Domestic_False'])
crime_data[-4:]

['Arrest', 'Location Description', 'Domestic', 'District', 'Primary Type', 'Community Area']


Unnamed: 0,Month,Day,Hour,Arrest_True,Location Description_APARTMENT,Location Description_RESIDENCE,Location Description_STREET,Location Description_RESIDENCE PORCH/HALLWAY,"Location Description_SCHOOL, PRIVATE, BUILDING",Location Description_OTHER,...,Community Area_34,Community Area_11,Community Area_52,Community Area_74,Community Area_59,Community Area_17,Community Area_10,Community Area_9,Community Area_37,Community Area_12
260461,1,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
260462,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
260463,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
260464,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
# Per-column standard deviation of the encoded table, on the original (unscaled) value ranges.
crime_data.std()

Month                             3.337527
Day                               8.804065
Hour                              6.542763
Arrest_True                       0.410754
Location Description_APARTMENT    0.340317
                                    ...   
Community Area_17                 0.071382
Community Area_10                 0.066443
Community Area_9                  0.032060
Community Area_37                 0.058908
Community Area_12                 0.042033
Length: 292, dtype: float64

In [112]:
#scale all variables to be between 0 and 1
# Min-max scaling applied to every column at once: the per-column minimum and
# range are Series indexed by column name, so the arithmetic lines up column-wise.
col_min = crime_data.min()
col_range = crime_data.max() - col_min
# A constant column has a range of 0, and dividing by it would turn the whole
# column into NaN. Dividing by 1 instead maps such a column to all zeros.
col_range = col_range.replace(0, 1)
crime_data = (crime_data - col_min) / col_range

# Spread of each column after scaling, smallest first.
crime_data.std().sort_values()

Location Description_BASEMENT             0.001959
Location Description_CHA PLAY LOT         0.001959
Location Description_GANGWAY              0.001959
Location Description_CHA GROUNDS          0.001959
Location Description_RAILROAD PROPERTY    0.001959
                                            ...   
Domestic_True                             0.371798
Primary Type_BATTERY                      0.392325
Arrest_True                               0.410754
Location Description_STREET               0.412093
Primary Type_THEFT                        0.426919
Length: 292, dtype: float64

In [113]:
import numpy as np

# Plain NumPy view of the scaled table (column labels are dropped).
# Note that X holds every column of crime_data, including Arrest_True.
X = crime_data.to_numpy()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [114]:
import sklearn
from sklearn.decomposition import PCA

# Keep the 5 leading principal components. random_state pins the result for
# the case where scikit-learn selects its randomized SVD solver, so a rerun
# reproduces the same components.
pca = PCA(n_components=5, random_state=0)
fit = pca.fit(X)
# The one-element tuple makes sure the array is formatted as a single value.
print("Explained Variance: %s" % (fit.explained_variance_ratio_,))

pc1, pc2, pc3, pc4, pc5 = pca.components_
p5 = pc5  # earlier name of the fifth component, kept so existing references still resolve

# Pair each loading of the first component with its column name, ordered from
# the most negative loading to the most positive one.
weights1 = sorted(zip(pc1, crime_data.columns))

print('Principal Component 1: \n (Negative)',weights1[:5], '\n\n (Positive)' ,weights1[-5:])

Explained Variance: [0.06774662 0.05467406 0.04183173 0.03962656 0.03309242]
Principal Component 1: 
 (Negative) [(-0.5230467855403054, 'Primary Type_THEFT'), (-0.2666622441876718, 'Location Description_STREET'), (-0.07732582031538779, 'District_1'), (-0.07499703695121993, 'District_18'), (-0.06253591972575864, 'Community Area_32')] 

 (Positive) [(0.04814195810972119, 'Primary Type_OTHER OFFENSE'), (0.23241333705458714, 'Location Description_APARTMENT'), (0.2597990657646565, 'Location Description_RESIDENCE'), (0.4843382516240165, 'Domestic_True'), (0.4977374441946279, 'Primary Type_BATTERY')]


In [116]:
from sklearn.model_selection import train_test_split

# Collect the summaries in display order: the class balance of the target,
# then for Month and for Hour the arrest rate followed by the row count.
summaries = [crime_data.groupby(['Arrest_True'])['Arrest_True'].count()]
for period in ('Month', 'Hour'):
    summaries.append(crime_data.groupby([period])['Arrest_True'].mean())
    summaries.append(crime_data.groupby([period])[period].count())

# Print them one after another, separated by the same blank lines as before.
for position, summary in enumerate(summaries):
    if position > 0:
        print("\n")
    print(summary)

Arrest_True
0.0    204491
1.0     55974
Name: Arrest_True, dtype: int64


Month
0.000000    0.228380
0.090909    0.227965
0.181818    0.233276
0.272727    0.219311
0.363636    0.216804
0.454545    0.208171
0.545455    0.211022
0.636364    0.213435
0.727273    0.208905
0.818182    0.202801
0.909091    0.213179
1.000000    0.200579
Name: Arrest_True, dtype: float64


Month
0.000000    19669
0.090909    18380
0.181818    20405
0.272727    20993
0.363636    23625
0.454545    23572
0.545455    24822
0.636364    24354
0.727273    22393
0.818182    21637
0.909091    19880
1.000000    20735
Name: Month, dtype: int64


Hour
0.000000    0.180005
0.043478    0.223159
0.086957    0.193823
0.130435    0.178669
0.173913    0.149989
0.217391    0.125261
0.260870    0.162010
0.304348    0.157147
0.347826    0.166908
0.391304    0.176747
0.434783    0.229402
0.478261    0.253354
0.521739    0.188303
0.565217    0.216763
0.608696    0.204922
0.652174    0.194444
0.695652    0.210291
0.739130    0.218560

In [121]:
#Model stuff will go here 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Target vector: the arrest indicator. Feature table: every remaining column.
y = crime_data['Arrest_True']
X1 = crime_data.drop(columns=['Arrest_True'])

# 70/30 hold-out split with a fixed seed.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X1,
    y,
    random_state=0,
    test_size=0.30,
)

def logmodel(features=None, labels=None, test_size=0.2, random_state=1, max_iter=210000):
    """Fit a logistic regression on a train split and print its test-split report.

    Parameters
    ----------
    features : DataFrame or array-like, optional
        Feature matrix. Defaults to the notebook-level ``X1``.
    labels : Series or array-like, optional
        Target vector. Defaults to the notebook-level ``y``.
    test_size : float, optional
        Share of the rows held out for evaluation (passed to ``train_test_split``).
    random_state : int, optional
        Seed for the train/test shuffle (passed to ``train_test_split``).
    max_iter : int, optional
        Iteration cap given to ``LogisticRegression``.

    Returns
    -------
    None
        The classification report is printed, not returned.
    """
    # Fall back to the notebook-level tables so a bare logmodel() call keeps working.
    X = np.asarray(X1 if features is None else features) #defines the features
    Y = np.asarray(y if labels is None else labels) #labels (or what we are predicting)

    #create a testing dataset
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    log = LogisticRegression(max_iter=max_iter) #model that we use
    log.fit(X_train,Y_train)

    Y_pred = log.predict(X_test) #evaluation and prediction

    print(classification_report(Y_test, Y_pred))

logmodel()

              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     40874
         1.0       0.84      0.46      0.60     11219

    accuracy                           0.87     52093
   macro avg       0.85      0.72      0.76     52093
weighted avg       0.86      0.87      0.85     52093



In [None]:
# more examples from class below ...

In [None]:
%matplotlib inline  

import matplotlib.pyplot as plt

# Project the rows of X onto the fitted principal components,
# then plot the first component against the second.
Y = pca.transform(X)
first_pc = Y[:, 0]
second_pc = Y[:, 1]
plt.scatter(first_pc, second_pc, alpha=0.1)
plt.ylabel('PC2')
plt.xlabel('PC1')
plt.show()

Let's try to interpret this figure a bit more. Let's see where all the "Baathist" Arab countries lie on this plot:

In [None]:
# Example carried over from class: highlight four named rows on the PC1/PC2 scatter.
# NOTE(review): `flag` is not defined anywhere in the visible notebook, and the `Y`
# computed above comes from the crime data — confirm whether this cell should be
# adapted or removed; as written it looks like it cannot run here.
arab = ['Syria', 'Jordan', 'Kuwait', 'UAE']
# Positions of the selected names within flag's index.
arab_index = [list(flag.index.values).index(a) for a in arab]
plt.scatter(Y[:,0], Y[:,1])
# Overplot the selected rows in red.
plt.scatter(Y[arab_index,0], Y[arab_index,1],color='r')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

Or we can look at all of the countries in Northern Europe:

In [None]:
# Example carried over from class: highlight four named rows on the PC1/PC2 scatter.
# NOTE(review): `flag` is not defined anywhere in the visible notebook, and the `Y`
# computed above comes from the crime data — confirm whether this cell should be
# adapted or removed; as written it looks like it cannot run here.
ne = ['Norway', 'Denmark', 'Finland', 'Iceland']
# Positions of the selected names within flag's index.
ne_index = [list(flag.index.values).index(a) for a in ne]
plt.scatter(Y[:,0], Y[:,1])
# Overplot the selected rows in red.
plt.scatter(Y[ne_index,0], Y[ne_index,1],color='r')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

## Clustering
When we were playing around with the data, we manually used clusters that we roughly knew about to validate our model. How do we do this automatically? Clustering is the task of dividing the population or data points into a number of groups such that data points in the same group are more similar to each other than to data points in other groups. In particular, we will focus on a simple form of clustering called "k-means". k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean; that mean serves as a prototype of the cluster.

Again, the mathematical details are better left for a machine learning class but you can see how to use scikit-learn to do this:

In [None]:
from sklearn.cluster import KMeans
# Break the data into 15 clusters. k-means starts from randomly chosen
# centroids, so random_state is pinned: without it the cluster numbering can
# change between runs, and later cells select clusters by number.
kmeans = KMeans(n_clusters=15, random_state=0)
kmeans.fit(X)

In [None]:
# Rows assigned to cluster 9.
# NOTE(review): `flag` is not defined in the visible notebook, while kmeans was fit on X
# (built from crime_data) — confirm which table this mask is meant to index.
flag[kmeans.labels_ == 9]

In [None]:
# Rows assigned to cluster 6.
# NOTE(review): `flag` is not defined in the visible notebook, while kmeans was fit on X
# (built from crime_data) — confirm which table this mask is meant to index.
flag[kmeans.labels_ == 6]