In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

In [2]:
# Import the Illinois crime spreadsheet: the first three rows are skipped and
# the second remaining row supplies the column headers.
il_data = pd.read_excel(
    'ill_crime_copy.xls',
    skiprows=3,
    header=1,
)

In [3]:
# Clean: replace the multi-line spreadsheet headers with single-token names,
# then discard the revised-definition rape column.
il_data = (
    il_data
    .rename(columns={
        'Violent\ncrime': 'Violent_crime',
        'Murder and\nnonnegligent\nmanslaughter': 'Murder',
        'Aggravated\nassault': 'Aggravated_assault',
        'Motor\nvehicle\ntheft': 'Motor_vehicle_theft',
        'Rape\n(revised\ndefinition)1': 'Rape_revised',
        'Rape\n(legacy\ndefinition)2': 'Rape_legacy',
        'Larceny-\ntheft': 'Larceny_theft',
        'Property\ncrime': 'Property_crime',
    })
    .drop(columns=['Rape_revised'])
)

I have decided to use violent crime as the basis for the binary outcome variable: based on each city's violent-crime count, I will label the city either "Dangerous" or "Safe."

In [4]:
# Create outcome variable
def binarize_crime(vcrime, threshold=10):
    '''Label a city 'Dangerous' or 'Safe' from its violent-crime count.

    Parameters
    ----------
    vcrime : number
        Violent-crime count for a single city.
    threshold : number, optional
        Cut-off for the label. Counts strictly greater than this value are
        'Dangerous'. Defaults to 10, the rule used throughout this notebook.

    Returns
    -------
    str
        'Dangerous' when ``vcrime > threshold``, otherwise 'Safe'. A NaN
        count compares as False and is therefore labelled 'Safe'.
    '''
    return 'Dangerous' if vcrime > threshold else 'Safe'
    
# Label every Illinois city by passing its violent-crime count through the rule.
il_data['Bi_violent_crime'] = il_data['Violent_crime'].apply(binarize_crime)

OK, now I need to turn these into features; run plain logistic regression, ridge (L2) logistic regression, and lasso (L1) logistic regression; and decide which one works best.

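In order to do this I need to figure out how to format my outcome variable, because the assignment seems to say I need to convert it from a binary variable to odds, and I don't see how to do that. (Note: scikit-learn's `LogisticRegression` accepts the binary class labels directly and models the log-odds of the classes internally, so no manual conversion is needed before fitting.)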

In [5]:
# Add a squared-population feature, then discard every row that still
# contains a missing value in any column.
il_data = il_data.assign(Pop_squared=il_data['Population'] ** 2).dropna()

In [6]:
# Target: the 'Dangerous'/'Safe' label.
Y = il_data['Bi_violent_crime']
# Features: every remaining column except the raw violent-crime count, the
# label derived from it, and the city name.
# NOTE(review): the violent-crime component columns (Murder, Rape_legacy,
# Robbery, Aggravated_assault) stay in X; presumably they sum to
# Violent_crime, which would leak the label into the features -- confirm.
X = il_data.drop(['Violent_crime', 'Bi_violent_crime', 'City'], axis=1)

In [7]:
# Plain logistic regression on the Illinois features.
# The solver is pinned to liblinear, which was scikit-learn's default when
# this notebook was run (the default became lbfgs in 0.22). Pinning it keeps
# the results recorded below reproducible on newer versions.
# NOTE(review): the features are unscaled and Pop_squared is far larger than
# the others; the near-zero coefficients reported below suggest the fit
# degenerates. Standardizing the features first is probably needed -- confirm.
logreg = LogisticRegression(solver='liblinear')
fit = logreg.fit(X, Y)

In [8]:
# Report the fitted parameters of the plain logistic model, then compare its
# predictions on the training data with the true labels.
print('Coefficients', fit.coef_, fit.intercept_, sep='\n')
pred_y_sklearn = logreg.predict(X)

print('\nAccuracy by Danger level')
print(pd.crosstab(index=pred_y_sklearn, columns=Y))

print('\nPercentage accuracy', logreg.score(X, Y), sep='\n')

Coefficients
[[-4.42288999e-14 -7.61398630e-18 -1.08110020e-16 -2.91168898e-16
  -6.24933886e-16 -5.42553337e-15 -1.21702143e-15 -3.93973235e-15
  -2.68779585e-16 -3.15969056e-17 -2.83126437e-09]]
[3.9721406e-17]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               236   268

Percentage accuracy
0.46825396825396826


In [9]:
# Logistic regression with an explicit L2 (ridge) penalty. L2 is already the
# default penalty, so this is the same model as the plain fit above.
# The solver is pinned to liblinear (scikit-learn's default before 0.22) so
# the recorded results stay reproducible on newer versions.
ridgereg = LogisticRegression(penalty='l2', solver='liblinear')
ridgid = ridgereg.fit(X, Y)

In [10]:
# Report the L2-penalized model's parameters and its accuracy on the
# Illinois training data.
print("Coefficients:", ridgid.coef_, ridgid.intercept_, sep='\n')
pred_y_sklearn = ridgereg.predict(X)

print('\nAccuracy by Danger level')
print(pd.crosstab(index=pred_y_sklearn, columns=Y))

print('\nPercentage accuracy', ridgid.score(X, Y), sep='\n')

Coefficients:
[[-4.42288999e-14 -7.61398630e-18 -1.08110020e-16 -2.91168898e-16
  -6.24933886e-16 -5.42553337e-15 -1.21702143e-15 -3.93973235e-15
  -2.68779585e-16 -3.15969056e-17 -2.83126437e-09]]
[3.9721406e-17]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               236   268

Percentage accuracy
0.46825396825396826


In [11]:
# Logistic regression with an L1 (lasso) penalty. This is the first model
# that does not label every city 'Dangerous'.
# The solver must be named explicitly: from scikit-learn 0.22 the default is
# lbfgs, which rejects penalty='l1' with a ValueError. liblinear supports L1
# and was the default when this notebook was originally run.
lassoreg = LogisticRegression(penalty='l1', solver='liblinear')
lasseau = lassoreg.fit(X, Y)

In [12]:
# Report the L1-penalized model's parameters and its accuracy on the
# Illinois training data.
print("Coefficients:", lasseau.coef_, lasseau.intercept_, sep='\n')
pred_y_sklearn = lassoreg.predict(X)

print('\nAccuracy by Danger level')
print(pd.crosstab(index=pred_y_sklearn, columns=Y))

print('\nPercentage accuracy', lasseau.score(X, Y), sep='\n')

Coefficients:
[[-4.28218242e-05  0.00000000e+00 -8.29862464e-01 -8.61586662e-01
  -7.47093398e-01 -5.31089531e-04 -9.27243634e-04  2.78716697e-03
   1.14479595e-01  1.21007936e-01 -2.56048285e-10]]
[7.73602408]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               234     2
Safe                      2   266

Percentage accuracy
0.9920634920634921


#### Gotta try it with different data too

In [13]:
# Load the New York table, give the multi-line headers the same single-token
# names used for Illinois (plus the footnoted arson column), and drop the
# revised-definition rape column.
ny_data = (
    pd.read_csv('ny_crime_copy.csv', skiprows=3, header=1)
    .rename(columns={
        'Violent\ncrime': 'Violent_crime',
        'Murder and\nnonnegligent\nmanslaughter': 'Murder',
        'Aggravated\nassault': 'Aggravated_assault',
        'Motor\nvehicle\ntheft': 'Motor_vehicle_theft',
        'Rape\n(revised\ndefinition)1': 'Rape_revised',
        'Rape\n(legacy\ndefinition)2': 'Rape_legacy',
        'Larceny-\ntheft': 'Larceny_theft',
        'Property\ncrime': 'Property_crime',
        'Arson3': 'Arson',
    })
    .drop(columns=['Rape_revised'])
)

In [14]:
# Assumption: a missing arson figure means no arson was reported, so it is
# treated as zero.
ny_data['Arson'] = ny_data['Arson'].fillna(0)

# Population and the outcome variable are too important to guess, so rows
# missing either one are dropped.
ny_data = ny_data.dropna(subset=['Population', 'Violent_crime'])

In [15]:
# Columns whose values are converted to float in the loop below.
dtype_float = [
    'Violent_crime',
    'Aggravated_assault',
    'Motor_vehicle_theft',
    'Rape_legacy',
    'Larceny_theft',
    'Property_crime',
    'Robbery',
    'Population',
    'Burglary',
]

def remove_commas(number):
    '''Strip thousands-separator commas from a numeric string.

    Parameters
    ----------
    number : str or any
        A cell value such as '1,234'. Non-string values (numbers that were
        already parsed, or NaN for a missing cell) are accepted as well.

    Returns
    -------
    str or any
        The string with every ',' removed. Non-string input is returned
        unchanged rather than raising TypeError on the membership test.
    '''
    if isinstance(number, str):
        return number.replace(',', '')
    return number
    
# Fix the ny_data dtypes: strip the commas from each listed column and
# convert its values to float.
for column in dtype_float:
    ny_data[column] = ny_data[column].apply(lambda raw: float(remove_commas(raw)))

In [16]:
# Add the 'Dangerous'/'Safe' outcome variable, using the same rule as for
# Illinois.
ny_data['Bi_violent_crime'] = ny_data['Violent_crime'].apply(binarize_crime)

In [17]:
# Squared-population feature, matching the column added to the Illinois data.
ny_data['Pop_squared'] = ny_data['Population'].pow(2)

In [18]:
# New York target and features, built the same way as X and Y for Illinois.
Y1 = ny_data['Bi_violent_crime']
X1 = ny_data.drop(['Violent_crime', 'Bi_violent_crime', 'City'], axis=1)

In [19]:
# Score the plain logistic model (trained on Illinois) against the New York
# data. The coefficients printed here are the Illinois fit.
print('Coefficients', fit.coef_, fit.intercept_, sep='\n')
ny_pred_y_sklearn = logreg.predict(X1)

print('\nAccuracy by Danger level')
print(pd.crosstab(index=ny_pred_y_sklearn, columns=Y1))

print('\nPercentage accuracy', logreg.score(X1, Y1), sep='\n')

Coefficients
[[-4.42288999e-14 -7.61398630e-18 -1.08110020e-16 -2.91168898e-16
  -6.24933886e-16 -5.42553337e-15 -1.21702143e-15 -3.93973235e-15
  -2.68779585e-16 -3.15969056e-17 -2.83126437e-09]]
[3.9721406e-17]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               141   207

Percentage accuracy
0.4051724137931034


In [20]:
# Check whether the lasso model (trained on Illinois) also does much better
# on the New York data.
print('Coefficients', lasseau.coef_, lasseau.intercept_, sep='\n')
ny_pred_y_sklearn = lassoreg.predict(X1)

print('\nAccuracy by Danger level')
print(pd.crosstab(index=ny_pred_y_sklearn, columns=Y1))

print('\nPercentage accuracy', lassoreg.score(X1, Y1), sep='\n')

Coefficients
[[-4.28218242e-05  0.00000000e+00 -8.29862464e-01 -8.61586662e-01
  -7.47093398e-01 -5.31089531e-04 -9.27243634e-04  2.78716697e-03
   1.14479595e-01  1.21007936e-01 -2.56048285e-10]]
[7.73602408]

Accuracy by Danger level
Bi_violent_crime  Dangerous  Safe
row_0                            
Dangerous               141     2
Safe                      0   205

Percentage accuracy
0.9942528735632183


## What happened:

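I used Illinois data because that is my home state. I cleaned it up, added a population^2 column, and ran plain logistic, ridge, and lasso regression on it. Ridge and plain logistic are actually the same model here, because the default value of `penalty` in `LogisticRegression` is already `'l2'`. They both predicted that every city was "Dangerous." Lasso, by contrast, was 99% accurate. A likely reason is that the features still include the individual violent-crime counts (murder, rape, robbery, aggravated assault), so the "Dangerous"/"Safe" label is almost a direct function of the inputs. The near-zero coefficients of the two L2 fits suggest that those models simply failed to fit the unscaled features, since population squared is orders of magnitude larger than every other column. I assume this was the design of the assignment.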

Then I imported New York data and tested the models trained on Illinois against it, with the same results: the ridge and plain logistic models both predicted "Dangerous" for every city in NY, while the lasso model was remarkably accurate.

I take this assignment as a good example of a certain type of regression-classifier (in this case Lasso) outshining the others.