In [1]:
import numpy as np
import pandas as pd
import patsy

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV


import seaborn as sns

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
crime_csv = '/content/sf_crime_train.csv'

In [5]:
#read in the data using pandas
sf_crime = pd.read_csv(crime_csv)
sf_crime.drop('DayOfWeek',axis=1,inplace=True)
sf_crime.head()

Unnamed: 0,Dates,Category,Descript,PdDistrict,Resolution,Address,X,Y
0,5/13/15 23:53,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,5/13/15 23:53,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,5/13/15 23:33,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,5/13/15 23:30,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [6]:
# check the shape of your dataframe
sf_crime.shape

(18000, 8)

In [7]:
#check whether there are any missing values
sf_crime.isnull().sum()

Dates         0
Category      0
Descript      0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

In [9]:
sf_crime.dtypes

Dates          object
Category       object
Descript       object
PdDistrict     object
Resolution     object
Address        object
X             float64
Y             float64
dtype: object

In [11]:
sf_crime['Dates'] =  pd.to_datetime(sf_crime['Dates'])
sf_crime['Dates'] 

0       2015-05-13 23:53:00
1       2015-05-13 23:53:00
2       2015-05-13 23:33:00
3       2015-05-13 23:30:00
4       2015-05-13 23:30:00
                ...        
17995   2015-02-17 20:00:00
17996   2015-02-17 20:00:00
17997   2015-02-17 20:00:00
17998   2015-02-17 20:00:00
17999   2015-02-17 20:00:00
Name: Dates, Length: 18000, dtype: datetime64[ns]

In [12]:
# create a new column for 'Year','Month',and 'Day_of_Week'
sf_crime['Year'] = sf_crime['Dates'].dt.year
sf_crime['Month'] = sf_crime['Dates'].dt.month
sf_crime['Day_of_Week'] = sf_crime['Dates'].dt.day
#check the first couple rows to make sure it's what you want
sf_crime.head(2)

Unnamed: 0,Dates,Category,Descript,PdDistrict,Resolution,Address,X,Y,Year,Month,Day_of_Week
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13


In [13]:
# create a column for the 'Hour','Time', and 'Date'
sf_crime['Hour'] =sf_crime['Dates'].dt.hour
sf_crime['Time'] = sf_crime['Dates'].dt.time
sf_crime['Date'] = sf_crime['Dates'].dt.date
sf_crime.head(2)

Unnamed: 0,Dates,Category,Descript,PdDistrict,Resolution,Address,X,Y,Year,Month,Day_of_Week,Hour,Time,Date
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,23:53:00,2015-05-13
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,23:53:00,2015-05-13


In [14]:
# Drop the 'Dates' column
sf_crime=sf_crime.drop('Dates', axis=1)
sf_crime.head(2)


Unnamed: 0,Category,Descript,PdDistrict,Resolution,Address,X,Y,Year,Month,Day_of_Week,Hour,Time,Date
0,WARRANTS,WARRANT ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,23:53:00,2015-05-13
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,2015,5,13,23,23:53:00,2015-05-13


In [15]:
# check the 'Category' value counts to see what sort of categories there are
# and to see if anything might require cleaning (particularly the ones with fewer values)

sf_crime['Category'].value_counts()

LARCENY/THEFT                  4885
OTHER OFFENSES                 2291
NON-CRIMINAL                   2255
ASSAULT                        1536
VEHICLE THEFT                   967
VANDALISM                       877
BURGLARY                        732
WARRANTS                        728
SUSPICIOUS OCC                  592
MISSING PERSON                  535
DRUG/NARCOTIC                   496
ROBBERY                         465
FRAUD                           363
SECONDARY CODES                 261
WEAPON LAWS                     212
TRESPASS                        130
STOLEN PROPERTY                 111
SEX OFFENSES FORCIBLE           103
FORGERY/COUNTERFEITING           85
DRUNKENNESS                      74
KIDNAPPING                       50
PROSTITUTION                     44
DRIVING UNDER THE INFLUENCE      42
DISORDERLY CONDUCT               37
ARSON                            35
LIQUOR LAWS                      25
RUNAWAY                          16
BRIBERY                     

In [16]:
m = sf_crime['Category'] == 'ASSUALT'
sf_crime.loc[m,'Category'] = sf_crime.loc[m,'Category'].astype(str).replace('ASSUALT','ASSAULT',regex=True)

m = sf_crime['Category'] == 'ASSUALT'
sf_crime.loc[m,'Category'] = sf_crime.loc[m,'Category'].astype(str).replace('TRESPASSING','TRESPASS',regex=True)

In [17]:
# have a look to see whether you have all the days of the week in your data
sf_crime['Day_of_Week'].unique()

array([13, 12, 11,  3,  2,  1, 30, 29, 28, 27, 19, 18, 17, 16, 15, 14,  5,
        4, 31, 22, 21, 20,  8,  7,  6])

In [18]:
# have a look at the value counts for 'Descript', 'PdDistrict', and 'Resolution' to make sure it all checks out
print('Descript\n',sf_crime['Descript'].value_counts())

print('PdDistrict\n',sf_crime['PdDistrict'].value_counts())

print('Resolution\n',sf_crime['Resolution'].value_counts())

Descript
 GRAND THEFT FROM LOCKED AUTO                  2127
STOLEN AUTOMOBILE                              625
AIDED CASE, MENTAL DISTURBED                   591
DRIVERS LICENSE, SUSPENDED OR REVOKED          589
BATTERY                                        520
                                              ... 
ATTEMPTED KIDNAPPING, JUVENILE VICTIM            1
ATTEMPTED ROBBERY CHAIN STORE WITH A KNIFE       1
TRANSPORTAION OF CONTROLLED SUBSTANCE            1
EMBEZZLEMENT, GRAND THEFT LEASED PROPERTY        1
ARMOR PENETRATING AMMUNITION, POSSESSION         1
Name: Descript, Length: 510, dtype: int64
PdDistrict
 SOUTHERN      3287
NORTHERN      2250
CENTRAL       2206
MISSION       2118
BAYVIEW       1678
INGLESIDE     1628
TARAVAL       1426
TENDERLOIN    1327
RICHMOND      1101
PARK           979
Name: PdDistrict, dtype: int64
Resolution
 NONE                                      12862
ARREST, BOOKED                             4455
UNFOUNDED                                   36

In [19]:
# use .describe() to see whether the location coordinates seem appropriate
sf_crime.describe()

Unnamed: 0,X,Y,Year,Month,Day_of_Week,Hour
count,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0
mean,-122.423639,37.768466,2015.0,3.489944,14.290167,13.646833
std,0.026532,0.024391,0.0,0.868554,8.955835,6.53904
min,-122.513642,37.708154,2015.0,2.0,1.0,0.0
25%,-122.434199,37.753838,2015.0,3.0,5.0,10.0
50%,-122.416949,37.775608,2015.0,3.0,16.0,15.0
75%,-122.406539,37.78539,2015.0,4.0,20.0,19.0
max,-122.365565,37.819923,2015.0,5.0,31.0,23.0


In [20]:
NVC = ['BAD CHECKS','BRIBERY','DRUG/NARCOTIC','DRUNKENNESS',
     'EMBEZZLEMENT','FORGERY/COUNTERFEITING','FRAUD',
     'GAMBLING','LIQUOR','LOITERING','TRESPASS','OTHER OFFENSES']

NOT_C = ['NON-CRIMINAL','RUNAWAY','SECONDARY CODES','SUSPICIOUS OCC','WARRANTS']

#use a list comprehension to get all the categories in sf_crime['Category'].unique() that are NOT in the lists above

VC = []
VC = [x for x in sf_crime['Category'] if x not in (NVC ,NOT_C)]
VC

['WARRANTS',
 'OTHER OFFENSES',
 'OTHER OFFENSES',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'VEHICLE THEFT',
 'VEHICLE THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'OTHER OFFENSES',
 'VANDALISM',
 'LARCENY/THEFT',
 'NON-CRIMINAL',
 'NON-CRIMINAL',
 'ROBBERY',
 'ASSAULT',
 'OTHER OFFENSES',
 'NON-CRIMINAL',
 'LARCENY/THEFT',
 'ROBBERY',
 'WARRANTS',
 'NON-CRIMINAL',
 'LARCENY/THEFT',
 'NON-CRIMINAL',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'OTHER OFFENSES',
 'LARCENY/THEFT',
 'NON-CRIMINAL',
 'VANDALISM',
 'LARCENY/THEFT',
 'VANDALISM',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'WEAPON LAWS',
 'VANDALISM',
 'NON-CRIMINAL',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'OTHER OFFENSES',
 'OTHER OFFENSES',
 'OTHER OFFENSES',
 'VEHICLE THEFT',
 'LARCENY/THEFT',
 'NON-CRIMINAL',
 'BURGLARY',
 'ROBBERY',
 'ASSAULT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'SUSPICIOUS OCC',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',
 'LARCENY/THEFT',


In [22]:
#add a column called 'Type' into your dataframe that stores whether the observation was:
#Non-Violent, Violent, or Non-Crime
#use .map()!
def typecrime(x):
    if x in NOT_C: return 'NOT_CRIMINAL'
    if x in NVC: return 'NON-VIOLENT'
    if x in VC: return 'VIOLENT_CRIME'

sf_crime['Type']=sf_crime['Category'].map(typecrime)

In [23]:
#find the baseline accuracy:
sf_crime['Type'].value_counts().max()/len(sf_crime)

0.5931666666666666

In [24]:
#create a target array with 'Type'
#create a predictor matrix with 'Day_of_Week','Month','Year','PdDistrict','Hour', and 'Resolution'
y = sf_crime['Type']
X = sf_crime[['Day_of_Week','Month','Year','PdDistrict','Hour','Resolution']]

In [25]:
#use pd.get_dummies() to dummify your categorical variables
#remember to drop a column!
X = pd.get_dummies(X,drop_first=True)

In [26]:
X.head()

Unnamed: 0,Day_of_Week,Month,Year,Hour,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,"Resolution_ARREST, CITED",Resolution_CLEARED-CONTACT JUVENILE FOR MORE INFO,Resolution_EXCEPTIONAL CLEARANCE,Resolution_JUVENILE BOOKED,Resolution_LOCATED,Resolution_NONE,Resolution_NOT PROSECUTED,Resolution_PSYCHOPATHIC CASE,Resolution_UNFOUNDED
0,13,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,13,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,13,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,13,5,2015,23,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,13,5,2015,23,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0


In [27]:
#create a 50/50 train test split; 
#stratify based on your target variable
#use a random state of 2018
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.5, stratify=y, random_state=2018)

In [28]:
#standardise your predictor matrices
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
ss.fit(X_train)
X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [29]:
#create a default Logistic Regression model and find its mean cross-validated accuracy with your training data
#use 5 cross-validation folds
lr = LogisticRegression()
cross_val_score(lr, X_train_ss, y_train, cv=5).mean()

0.6324444444444444

In [30]:
#create a confusion matrix with cross_val_predict
predictions = cross_val_predict(lr, X_train_ss, y_train, cv=5)
confusion = confusion_matrix(y_test,predictions)
pd.DataFrame(confusion,columns=sorted(y_train.unique()),index=sorted(y_train.unique()))

Unnamed: 0,NON-VIOLENT,NOT_CRIMINAL,VIOLENT_CRIME
NON-VIOLENT,345,40,1350
NOT_CRIMINAL,425,48,1453
VIOLENT_CRIME,1032,127,4180


In [31]:
y_train.value_counts()

VIOLENT_CRIME    5338
NOT_CRIMINAL     1926
NON-VIOLENT      1736
Name: Type, dtype: int64

In [32]:
#create a hyperparameter dictionary for a logistic regression
crime_gs_params={'penalty':['l1','l2'],
                 'solver':['liblinear'],
                 'C':np.logspace(-3,0,50)}

In [33]:
#create a gridsearch object using LogisticRegression() and the dictionary you created above
crime_gs=GridSearchCV(LogisticRegression(),
                      crime_gs_params,
                      n_jobs=-1,cv=5)

In [34]:
#fit the gridsearch object on your training data
crime_gs.fit(X_train_ss,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([0.001     , 0.0011514 , 0.00...
       0.03393222, 0.0390694 , 0.04498433, 0.05179475, 0.05963623,
       0.06866488, 0.07906043, 0.09102982, 0.10481131, 0.12067926,
       0.13894955, 0.15998587, 0.184207  , 0.21209509, 0.24420531,
       0.28117687, 0.32374575, 0.37275937, 0.42919343, 0.49417134,
       0.5689866 

In [35]:
#print out the best parameters
crime_gs.best_params_

{'C': 0.010985411419875584, 'penalty': 'l1', 'solver': 'liblinear'}

In [36]:
#print out the best mean cross-validated score
crime_gs.best_score_

0.6327777777777778

In [37]:
#assign your best estimator to the variable 'best_logreg'
best_logreg=crime_gs.best_estimator_

In [38]:
#score your model on your testing data
best_logreg.score(X_test_ss,y_test)

0.6335555555555555

In [39]:
#use your test data to create your classification report
predictions = best_logreg.predict(X_test_ss)
print(classification_report(y_test, predictions))

               precision    recall  f1-score   support

  NON-VIOLENT       0.45      0.60      0.52      1735
 NOT_CRIMINAL       0.64      0.07      0.12      1926
VIOLENT_CRIME       0.70      0.85      0.77      5339

     accuracy                           0.63      9000
    macro avg       0.60      0.50      0.47      9000
 weighted avg       0.64      0.63      0.58      9000

