In [44]:

import math
import warnings

from IPython.display import display
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import mstats
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

In [93]:
# Load the Illinois "Table 8" offenses workbook. The first four spreadsheet rows are
# skipped so that the header row becomes the column labels.
# `read_excel` has no `encoding` parameter (passing one raises TypeError on current
# pandas), so it is not supplied here.
N_CITY_ROWS = 348  # rows kept; later rows are discarded (presumably footnotes -- confirm)

df = (
    pd.read_excel(
        "table_8_offenses_known_to_law_enforcement_illinois_by_city_2013.xls",
        skiprows=4,
    )
    .iloc[:N_CITY_ROWS]
    # NOTE(review): blank cells are treated as zero counts; confirm that a blank in the
    # source table really means "no offenses" rather than "not reported".
    .fillna(0)
)

In [94]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Addison,37378.0,40.0,1.0,0.0,8.0,5.0,26.0,640.0,97.0,527.0,16.0,5.0
1,Albany,878.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0
2,Albers,1187.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albion,1975.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,8.0,22.0,3.0,1.0
4,Algonquin,29980.0,18.0,0.0,0.0,3.0,2.0,13.0,519.0,33.0,475.0,11.0,6.0


In [95]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 0 to 347
Data columns (total 13 columns):
City                                    348 non-null object
Population                              348 non-null float64
Violent
crime                           348 non-null float64
Murder and
nonnegligent
manslaughter    348 non-null float64
Rape
(revised
definition)1              348 non-null float64
Rape
(legacy
definition)2               348 non-null float64
Robbery                                 348 non-null float64
Aggravated
assault                      348 non-null float64
Property
crime                          348 non-null float64
Burglary                                348 non-null float64
Larceny-
theft                          348 non-null float64
Motor
vehicle
theft                     348 non-null float64
Arson                                   348 non-null float64
dtypes: float64(12), object(1)
memory usage: 38.1+ KB


In [96]:
# List the raw column labels; several contain embedded newlines (e.g. 'Property\ncrime').
df.columns

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property\ncrime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson'],
      dtype='object')

In [97]:
# Give the target column a newline-free name so it can be referenced directly in the
# statsmodels formulas used later in the notebook.
# Only the column label is renamed. The previous `index=str` argument also cast every
# integer row label to a string, which nothing in the analysis relies on.
df_temp = df.rename(columns={"Property\ncrime": "Property_crime"})

In [98]:
# Preview the first five rows of the renamed frame.
df_temp.head(n=5)

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property_crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Addison,37378.0,40.0,1.0,0.0,8.0,5.0,26.0,640.0,97.0,527.0,16.0,5.0
1,Albany,878.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0
2,Albers,1187.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albion,1975.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,8.0,22.0,3.0,1.0
4,Algonquin,29980.0,18.0,0.0,0.0,3.0,2.0,13.0,519.0,33.0,475.0,11.0,6.0


In [99]:
# Column labels after the rename; 'Property_crime' should now appear without a newline.
df_temp.columns

Index(['City', 'Population', 'Violent\ncrime',
       'Murder and\nnonnegligent\nmanslaughter',
       'Rape\n(revised\ndefinition)1', 'Rape\n(legacy\ndefinition)2',
       'Robbery', 'Aggravated\nassault', 'Property_crime', 'Burglary',
       'Larceny-\ntheft', 'Motor\nvehicle\ntheft', 'Arson'],
      dtype='object')

In [100]:
# Dtypes and non-null counts of the renamed frame.
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 348 entries, 0 to 347
Data columns (total 13 columns):
City                                    348 non-null object
Population                              348 non-null float64
Violent
crime                           348 non-null float64
Murder and
nonnegligent
manslaughter    348 non-null float64
Rape
(revised
definition)1              348 non-null float64
Rape
(legacy
definition)2               348 non-null float64
Robbery                                 348 non-null float64
Aggravated
assault                      348 non-null float64
Property_crime                          348 non-null float64
Burglary                                348 non-null float64
Larceny-
theft                          348 non-null float64
Motor
vehicle
theft                     348 non-null float64
Arson                                   348 non-null float64
dtypes: float64(12), object(1)
memory usage: 38.1+ KB


In [101]:
# Engineered regressors for the property-crime models:
#   population_square - Population multiplied by itself
#   Robbery_new / Burglary_new / murder_new - 1 if the city recorded at least one
#   such offense, otherwise 0
# The murder column is selected by label rather than by position (it was previously
# `iloc[:, 3]`), so the feature no longer depends on the column order.
# `assign` builds the columns from the untouched source columns, so re-running this
# cell yields the same frame.
murder_col = 'Murder and\nnonnegligent\nmanslaughter'
df_temp = df_temp.assign(
    population_square=df_temp['Population'] * df_temp['Population'],
    Robbery_new=np.where(df_temp['Robbery'] > 0, 1, 0),
    Burglary_new=np.where(df_temp['Burglary'] > 0, 1, 0),
    murder_new=np.where(df_temp[murder_col] > 0, 1, 0),
)

In [102]:
# Ordinary least squares with scikit-learn: property crime regressed on population,
# its square, and the three binary offense indicators.
Y = df_temp[['Property_crime']].values  # 2-D target, one column
X = df_temp[['Population', 'population_square', 'Robbery_new', 'Burglary_new', 'murder_new']]
regr = linear_model.LinearRegression().fit(X, Y)

# Report the fitted parameters and the R-squared on the same data used for fitting.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:', regr.score(X, Y), sep='\n')


Coefficients: 
 [[ 1.57033446e-02  7.16086132e-09  1.16566025e+02 -4.39256089e+00
   8.63085507e+01]]

Intercept: 
 [-15.02758693]

R-squared:
0.9982872482692162


In [104]:
# Same five-regressor model, fitted through the statsmodels formula interface so that
# coefficient p-values are available.
linear_formula = 'Property_crime ~ Population+population_square+Robbery_new+Burglary_new+murder_new'
lm = smf.ols(linear_formula, data=df_temp)
lm = lm.fit()


In [105]:
# Estimated coefficients of the fitted OLS model `lm`.
lm.params

Intercept           -15.028
Population            0.016
population_square     0.000
Robbery_new         116.566
Burglary_new         -4.393
murder_new           86.309
dtype: float64

In [106]:
# p-value for each coefficient of `lm`.
lm.pvalues

Intercept           0.773
Population          0.000
population_square   0.000
Robbery_new         0.000
Burglary_new        0.936
murder_new          0.015
dtype: float64

In [107]:
# R-squared of `lm` on the data it was fitted to.
lm.rsquared

0.9982872482692162

In [108]:
# Feature matrix and target for the hold-out and cross-validation checks (five regressors).
data = df_temp[['Population', 'population_square', 'Robbery_new', 'Burglary_new', 'murder_new']]
target = df_temp.loc[:, 'Property_crime']

In [109]:
# Hold out 20% of the rows (fixed seed for reproducibility), refit `regr` on the
# remaining 80%, and report R-squared on the held-out rows.
# `train_test_split` comes from sklearn.model_selection and must be imported before
# this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)
regr.fit(X_train, y_train)
holdout_r2 = regr.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_r2))

With 20% Holdout: 0.6785265003860173


In [110]:
# Size of the held-out set as (rows, feature columns).
X_test.shape

(70, 5)

In [111]:
# 10-fold cross-validation scores of `regr` on the training split, one per fold.
# Scores can vary widely between folds, so read them individually rather than only
# as an average.
cross_val_score(regr, X_train, y_train, cv=10)

array([ 0.51744343, -7.22979056,  0.75334354,  0.79625459,  0.44929569,
        0.88808409,  0.69177982,  0.82468982,  0.73275715,  0.56387649])

In [112]:
# Reduced model: property crime explained by population and its square only.
# NOTE: this rebinds `linear_formula` and `lm`, replacing the previously fitted model.
linear_formula = 'Property_crime ~ Population+population_square'
lm = smf.ols(linear_formula, data=df_temp)
lm = lm.fit()

In [113]:
# Coefficients of the currently fitted `lm` (the population-only model when cells run in order).
lm.params

Intercept           29.592
Population           0.018
population_square    0.000
dtype: float64

In [114]:
# p-values of the currently fitted `lm` (the population-only model when cells run in order).
lm.pvalues

Intercept           0.042
Population          0.000
population_square   0.000
dtype: float64

In [115]:
# R-squared of the currently fitted `lm` on the data it was fitted to.
lm.rsquared

0.9981394240477377

In [116]:
# Feature matrix and target for validating the population-only model.
data1 = df_temp[['Population', 'population_square']]
target1 = df_temp.loc[:, 'Property_crime']

In [117]:
# Same 20% hold-out check (same seed) for the population-only features.
# NOTE: this rebinds X_train / X_test / y_train / y_test and refits `regr`.
# `train_test_split` comes from sklearn.model_selection and must be imported before
# this cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    data1, target1, test_size=0.2, random_state=20
)
regr.fit(X_train, y_train)
holdout_r2 = regr.score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_r2))

With 20% Holdout: 0.6526794536921467


In [118]:
# 10-fold cross-validation scores of `regr` on the current training split
# (population-only features when cells run in order), one per fold.
# `cross_val_score` is already imported earlier in the notebook, so it is not
# re-imported here.
cross_val_score(regr, X_train, y_train, cv=10)

array([  0.47315422, -18.76402901,   0.72145634,   0.81818905,
         0.39093263,   0.88359507,   0.67989447,   0.77285302,
         0.76972455,   0.57383355])