In [41]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

In [36]:
il_data = pd.read_excel('ill_crime_copy.xls', header=1, skiprows=3)

In [37]:
il_data = il_data.rename(columns={'Violent\ncrime':'Violent_crime',
                    'Murder and\nnonnegligent\nmanslaughter':'Murder',
                    'Aggravated\nassault':'Aggravated_assault',
                    'Motor\nvehicle\ntheft':'Motor_vehicle_theft',
                    'Rape\n(revised\ndefinition)1':'Rape_revised',
                    'Rape\n(legacy\ndefinition)2':'Rape_legacy',
                    'Larceny-\ntheft':'Larceny_theft',
                    'Property\ncrime':'Property_crime'
})
il_data = il_data.drop(columns=['Rape_revised'])

In [38]:
for col in il_data.columns:
    print(col, len(il_data[col].unique()))

City 510
Population 499
Violent_crime 102
Murder 12
Rape_legacy 34
Robbery 57
Aggravated_assault 81
Property_crime 308
Burglary 139
Larceny_theft 283
Motor_vehicle_theft 70
Arson 25


I have decided that violent crime will be the binary outcome variable and based on that stat, I will assign the cities to either "dangerous" or "safe."

In [39]:
def binarize_crime(vcrime):
    '''Cities with more than 10 violent crimes are dangerous'''
    if vcrime > 10:
        return 'Dangerous'
    else:
        return 'Safe'
    
il_data['Bi_violent_crime'] = il_data['Violent_crime'].apply(lambda x: binarize_crime(x))

In [40]:
il_data.head()

Unnamed: 0,City,Population,Violent_crime,Murder,Rape_legacy,Robbery,Aggravated_assault,Property_crime,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,Bi_violent_crime
0,Addison,37378.0,40.0,1.0,8.0,5.0,26.0,640.0,97.0,527.0,16.0,5.0,Dangerous
1,Albany,878.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,Safe
2,Albers,1187.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Safe
3,Albion,1975.0,0.0,0.0,0.0,0.0,0.0,33.0,8.0,22.0,3.0,1.0,Safe
4,Algonquin,29980.0,18.0,0.0,3.0,2.0,13.0,519.0,33.0,475.0,11.0,6.0,Dangerous


Ok now I need to turn these into features, do logistic regression, logistic ridge regression, and logistic lasso regression and decide which one works the best.

In order to do this I need to figure out how to format my outcome variable because the assignment seems to say I need to convert it from the binary variables to the odds. But I don't see how to do that.

In [49]:
features = il_data.drop(columns=['City'])
features = features.dropna()

In [50]:
features['Pop_squared'] = features['Population']**2

X = features.drop(columns=['Bi_violent_crime'])
Y = features['Bi_violent_crime']

In [51]:
# Ok so I could fill these features in with the means or something.
features_na = features.isnull().sum(axis = 0)
features_na.sort_values(ascending=False)

Pop_squared            0
Bi_violent_crime       0
Arson                  0
Motor_vehicle_theft    0
Larceny_theft          0
Burglary               0
Property_crime         0
Aggravated_assault     0
Robbery                0
Rape_legacy            0
Murder                 0
Violent_crime          0
Population             0
dtype: int64

In [53]:
logreg = LogisticRegression()
fit = logreg.fit(X, Y)

In [64]:
# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = logreg.predict(X)

print('\nAccuracy by Danger level')
print(pd.crosstab(pred_y_sklearn, Y))

print('\nPercentage accuracy')
print(logreg.score(X, Y))

Coefficients
[[-4.42288999e-14 -1.03182679e-15 -7.61398630e-18 -1.08110020e-16
  -2.91168898e-16 -6.24933886e-16 -5.42553337e-15 -1.21702143e-15
  -3.93973236e-15 -2.68779585e-16 -3.15969056e-17 -2.83126437e-09]]
[3.9721406e-17]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               236   268

Percentage accuracy
0.46825396825396826


In [60]:
ridgereg = LogisticRegression(penalty='l2')
ridgid = ridgereg.fit(X, Y)

In [65]:
print("Coefficients:")
print(ridgid.coef_)
print(ridgid.intercept_)
pred_y_sklearn = logreg.predict(X)

print('\nAccuracy by Danger level')
print(pd.crosstab(pred_y_sklearn, Y))

print('\nPercentage accuracy')
print(logreg.score(X, Y))

Coefficients:
[[-4.42288999e-14 -1.03182679e-15 -7.61398630e-18 -1.08110020e-16
  -2.91168898e-16 -6.24933886e-16 -5.42553337e-15 -1.21702143e-15
  -3.93973236e-15 -2.68779585e-16 -3.15969056e-17 -2.83126437e-09]]
[3.9721406e-17]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               236   268

Percentage accuracy
0.46825396825396826


In [66]:
lassoreg = LogisticRegression(penalty='l1')
lasseau = lassoreg.fit(X, Y)

In [67]:
print("Coefficients:")
print(lasseau.coef_)
print(lasseau.intercept_)
pred_y_sklearn = logreg.predict(X)

print('\nAccuracy by Danger level')
print(pd.crosstab(pred_y_sklearn, Y))

print('\nPercentage accuracy')
print(logreg.score(X, Y))

Coefficients:
[[-4.91457263e-05 -4.27415169e-01  0.00000000e+00 -6.72379053e-01
  -7.53156852e-01 -6.56726318e-01  3.93596620e-03 -2.74445277e-03
  -1.03134583e-03  1.33023017e-01  1.51956186e-01 -5.69796866e-10]]
[11.04094749]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               236   268

Percentage accuracy
0.46825396825396826
