In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing

In [5]:
import warnings
# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

%matplotlib inline
sns.set_style('white')

In [6]:
# import crime dataset
df = pd.read_csv('offenses_known_to_law_enforcement_california_by_city_2013.csv')

# Clean the data
df = df.drop([0,1,2])
df = df.rename(columns=df.iloc[0])

df = df.drop([3])
df = df.reset_index(drop = True)
df.drop(df.columns[len(df.columns)-1], axis=1, inplace=True)

# Get Rid of Null Values
df = df.dropna(axis =1, how = 'all')
df = df[:-2]

# Change the variables from object to float 
for var in list(df)[1:]:
    df[var] = df[var].str.replace(',','')
    df[var] = pd.to_numeric(df[var]).astype(float)

# Rename the columns
df = df.rename(columns = {'Violent\ncrime':'Violent_Crime',
                          'Murder and\nnonnegligent\nmanslaughter':'Murder',
                          'Rape\n(legacy\ndefinition)2':'Rape',
                          'Aggravated\nassault':'Aggravated_Assault',
                          'Property\ncrime':'Property_Crime',
                          'Larceny-\ntheft':'Larceny_Theft',
                          'Motor\nvehicle\ntheft':'Motor_Theft',
                          'Arson3': 'Arson'})

# Rearrange the columns to have 'Murder' last
cols = df.columns.tolist()
cols = cols[:3] + cols[4:] + cols[3:4]
df = df[cols]

df.head()

Unnamed: 0,City,Population,Violent_Crime,Rape,Robbery,Aggravated_Assault,Property_Crime,Burglary,Larceny_Theft,Motor_Theft,Arson,Murder
0,Adelanto,31165.0,198.0,15.0,52.0,129.0,886.0,381.0,372.0,133.0,17.0,2.0
1,Agoura Hills,20762.0,19.0,2.0,10.0,7.0,306.0,109.0,185.0,12.0,7.0,0.0
2,Alameda,76206.0,158.0,10.0,85.0,63.0,1902.0,287.0,1285.0,330.0,17.0,0.0
3,Albany,19104.0,29.0,1.0,24.0,4.0,557.0,94.0,388.0,75.0,7.0,0.0
4,Alhambra,84710.0,163.0,9.0,81.0,72.0,1774.0,344.0,1196.0,234.0,7.0,1.0


# Vanilla Logistics Regression

    Normalize the features will cause the 'percentage accuracy' to be 1, why is that?
        *Solution: need to normalize the dataset before spliting the training and testing sets.

In [52]:
# Create a new dataframe to work on for Logistic Regression
log_df = df.copy()

# Change Murder to binary variables (target variable)
log_df['Murder'] = log_df['Murder'].apply(lambda x: 1 if x > 0 else 0)

# Define training and test sizes
## Training set consist of 90% of the data
## Test set consist of 10% of the data
testsize = int(log_df.shape[0] / 10)
log_train = log_df.iloc[testsize:,:].copy() 
log_test = log_df.iloc[:testsize,:].copy()

In [53]:
# Declare a Logistic Regression Model
regr = linear_model.LogisticRegression(C=1e9)

X_train = log_train.iloc[:,1:len(list(log_train))-1]     #choose variables that exclude 'City' and 'Murder'
Y_train = log_train['Murder']     # target

# Fit the model
regr.fit(X_train, Y_train)

lr_origparams = regr.coef_
lr_newinter = regr.intercept_

# Display Results
print('Coefficients')
print(lr_origparams)
print('\nIntercept')
print(lr_newinter)

print('\n Percentage Accuracy (training set)')
print(regr.score(X_train, Y_train))

Coefficients
[[ -8.91199019e-06   3.93893990e-03   7.91055196e-04   2.49938830e-03
   -8.82643816e-04   1.02649006e-03  -2.21595976e-03  -1.24618078e-03
    4.48863060e-03  -3.97634000e-04]]

Intercept
[-0.00086997]

 Percentage Accuracy (training set)
0.704326923077


In [54]:
# Test set
X_test = log_test.iloc[:,1:len(list(log_test))-1]
Y_test = log_test['Murder']     # target

print('\n Percentage Accuracy (test set)')
print(regr.score(X_test, Y_test))


 Percentage Accuracy (test set)
0.673913043478


    Note: Want to cross validate

# Ridge Logistic Regression

In [47]:
# Create a new dataframe to work on for Ridge Logistic Regression
ridge_df = df.copy()

# Normalize the variable (exclude 'Murder')
ridge_df = pd.DataFrame(preprocessing.scale(df.iloc[:,1:len(list(df))-1]),
                        columns = ridge_df.columns[1:len(list(df))-1])

# Change Murder to binary variables (target variable)
ridge_df['Murder'] = df['Murder'].apply(lambda x: 1 if x > 0 else 0)


# Define training and test sizes
## Training set consist of 90% of the data
## Test set consist of 10% of the data
testsize = int(ridge_df.shape[0] / 10)
ridge_train = ridge_df.iloc[testsize:,:].copy() 
ridge_test = ridge_df.iloc[:testsize,:].copy()

In [48]:
# Declare a Ridge Logistic Regression Model
ridgeregr = linear_model.LogisticRegression(penalty='l2', fit_intercept=False)     # Tol ~ alpha, in this case

X_train = ridge_train.iloc[:,:-1]
Y_train = ridge_train['Murder']     #target variable

# Fit the Model
ridgeregr.fit(X_train, Y_train)

ridge_orgiparams = ridgeregr.coef_    #the intercept should be equal to zero and can be dropped.

print('Coefficients')
print(ridge_orgiparams)
print('\n')

print('Percentage Accuracy (training set)')
print(ridgeregr.score(X_train, Y_train))

Coefficients
[[ 0.66278423  0.18731495  1.41166109 -0.86865088  0.95415705  0.45684109
   1.40429622 -0.11155247  1.45566837 -0.68004415]]


Percentage Accuracy (training set)
0.752403846154


In [51]:
# Test set
X_test = ridge_test.iloc[:,:-1]
Y_test = ridge_test['Murder']     #target variable

ridge_orgiparams = ridgeregr.coef_    #the intercept should be equal to zero and can be dropped.

print('Percentage Accuracy (test set)')
print(ridgeregr.score(X_test, Y_test))

Percentage Accuracy (test set)
0.630434782609


    What about the alpha value?

# Lasso Logistic Regression

In [41]:
# Create a new dataframe to work on for Ridge Logistic Regression
lasso_df = df.copy()

# Normalize the variable (exclude 'Murder')
lasso_df = pd.DataFrame(preprocessing.scale(df.iloc[:,1:len(list(df))-1]),
                        columns = ridge_df.columns[1:len(list(df))-1])

# Change Murder to binary variables (target variable)
lasso_df['Murder'] = df['Murder'].apply(lambda x: 1 if x > 0 else 0)


# Define training and test sizes
## Training set consist of 90% of the data
## Test set consist of 10% of the data
testsize = int(lasso_df.shape[0] / 10)
lasso_train = lasso_df.iloc[testsize:,:].copy() 
lasso_test = lasso_df.iloc[:testsize,:].copy()

In [42]:
# Declare a Ridge Logistic Regression Model
lassoregr = linear_model.LogisticRegression(penalty='l1')

X_train = lasso_train.iloc[:,:-1]
Y_train = lasso_train['Murder']     #target variable

# Fit the Model
lassoregr.fit(X_train, Y_train)

lasso_orgiparams = lassoregr.coef_
lasso_originter = lassoregr.intercept_

print('Coefficients')
print(lasso_orgiparams)
print('\n')
print('Intercept')
print(lasso_originter)

print('\nPercentage Accuracy (training set)')
print(lassoregr.score(X_train, Y_train))

Coefficients
[[ 0.          0.          2.21182677  0.          1.98582501  0.
   1.07129929  0.          4.52572575]]


Intercept
[ 1.2226799]

Percentage Accuracy (training set)
0.78125


In [50]:
# Test set
X_test = lasso_test.iloc[:,:-1]
Y_test = lasso_test['Murder']     #target variable

print('\nPercentage Accuracy (test set)')
print(lassoregr.score(X_test, Y_test))


Percentage Accuracy (test set)
0.739130434783


    Note to self: Dont fit the model twice!!! Only fit when training the data.

    Note to self: use 'predict()' on test set => get a vector
                    compare the two, how often the results coincide shows the accuracy of the model