In [80]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn import linear_model
%matplotlib inline
# Render floats with three decimal places when pandas displays them.
pd.options.display.float_format = '{:.3f}'.format

# Silence warnings attributed to scipy whose message begins with "internal gelsd".
import warnings
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

In [81]:
# Read the 2013 New York "offenses known to law enforcement" workbook, then
# discard the rows labelled 0-4 and the blank 'Unnamed: 4' column in one call.
# The original index labels are kept as-is; a later cell drops rows by label.
df = (
    pd.read_excel('table_8_offenses_known_to_law_enforcement_new_york_by_city_2013.xls')
    .drop(index=[0, 1, 2, 3, 4], columns='Unnamed: 4')
)

df.head()

Unnamed: 0,Table 8,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
5,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0
6,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0
7,Albany,97956,791,8,30,227,526,4090,705,3243,142,
8,Albion Village,6388,23,0,3,4,16,223,53,165,5,
9,Alfred Village,4089,5,0,0,3,2,46,10,36,0,


In [82]:
# Replace the spreadsheet's placeholder headers with descriptive snake_case
# labels, given in the same left-to-right order as the existing columns.
column_labels = [
    'city',
    'population',
    'violent_crime',
    'murder_manslaughter',
    'rape',
    'robbery',
    'agg_assault',
    'property_crime',
    'burglary',
    'larceny',
    'motor_theft',
    'arson',
]
df.columns = column_labels

df.head(30)

Unnamed: 0,city,population,violent_crime,murder_manslaughter,rape,robbery,agg_assault,property_crime,burglary,larceny,motor_theft,arson
5,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0
6,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0
7,Albany,97956,791,8,30,227,526,4090,705,3243,142,
8,Albion Village,6388,23,0,3,4,16,223,53,165,5,
9,Alfred Village,4089,5,0,0,3,2,46,10,36,0,
10,Allegany Village,1781,3,0,0,0,3,10,0,10,0,0.0
11,Amherst Town,118296,107,1,7,31,68,2118,204,1882,32,3.0
12,Amityville Village,9519,9,0,2,4,3,210,16,188,6,1.0
13,Amsterdam,18182,30,0,0,12,18,405,99,291,15,0.0
14,Arcade Village,2053,0,0,0,0,0,39,3,35,1,0.0


In [85]:
# Replace missing arson counts with the column mean rounded to a whole number.
# (Filling with 0 instead was the alternative considered.)
#
# The fill is done with a single .loc assignment so it writes into df itself.
# The chained form `df.arson[mask] = value` assigns through an intermediate
# Series: it triggers SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, leaves df unchanged.
arson_missing = df['arson'].isnull()
df.loc[arson_missing, 'arson'] = round(df['arson'].mean(), 0)

# Display the column mean after imputation.
df['arson'].mean()

1.937142857142857

In [86]:
# Re-display the leading rows to check that the arson NaNs were filled in.
df.head(30)

Unnamed: 0,city,population,violent_crime,murder_manslaughter,rape,robbery,agg_assault,property_crime,burglary,larceny,motor_theft,arson
5,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0
6,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0
7,Albany,97956,791,8,30,227,526,4090,705,3243,142,2.0
8,Albion Village,6388,23,0,3,4,16,223,53,165,5,2.0
9,Alfred Village,4089,5,0,0,3,2,46,10,36,0,2.0
10,Allegany Village,1781,3,0,0,0,3,10,0,10,0,0.0
11,Amherst Town,118296,107,1,7,31,68,2118,204,1882,32,3.0
12,Amityville Village,9519,9,0,2,4,3,210,16,188,6,1.0
13,Amsterdam,18182,30,0,0,12,18,405,99,291,15,0.0
14,Arcade Village,2053,0,0,0,0,0,39,3,35,1,0.0


In [93]:
#I see a handful of NaNs (particularly in arson); dropping the whole column is an alternative to imputing it (left disabled here)
#df = df.drop(columns='arson')

In [94]:
# Add the quadratic population term as its own column.
df['population_squared'] = df['population'].pow(2)

df.head()

Unnamed: 0,city,population,violent_crime,murder_manslaughter,rape,robbery,agg_assault,property_crime,burglary,larceny,motor_theft,arson,population_squared
5,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0.0,6640929
6,Akron Village,2846,3,0,0,0,3,16,1,15,0,0.0,8099716
7,Albany,97956,791,8,30,227,526,4090,705,3243,142,2.0,9595377936
8,Albion Village,6388,23,0,3,4,16,223,53,165,5,2.0,40806544
9,Alfred Village,4089,5,0,0,3,2,46,10,36,0,2.0,16719921


### $Property\ crime = \alpha + Population + Population^2 + Murder + Robbery$

In [95]:
#Attempting to drop all rows that contain NaN values, in anticipation of the error below...
#df.dropna()

In [98]:
# List the current column labels of df.
df.columns

Index(['city', 'population', 'violent_crime', 'murder_manslaughter', 'rape',
       'robbery', 'agg_assault', 'property_crime', 'burglary', 'larceny',
       'motor_theft', 'arson', 'population_squared'],
      dtype='object')

In [106]:
#More rows to drop: the rows labelled 352-354 are removed before modelling
# (presumably trailing non-city rows from the spreadsheet -- verify against the file).
# errors='ignore' keeps this cell re-runnable once those labels are already gone.
#
# Note: this cell deliberately does not touch X. X is only created by the
# model-fitting cell further down, so referencing it here fails with NameError
# on a fresh kernel (and a bare `X.describe` without parentheses does nothing).
df = df.drop(index=[352, 353, 354], errors='ignore')

# After X has been built, X.isnull().sum() can be used to confirm no NaNs remain.

In [107]:
# Fit a linear regression of property crime on population, population squared,
# murder and robbery.
feature_columns = ['population', 'population_squared', 'murder_manslaughter', 'robbery']
X = df[feature_columns]
# Keep the target as a single-column 2-D array, matching the 2-D coef_ printed below.
Y = df['property_crime'].values.reshape(-1, 1)

# Instantiate and fit our model.
# No figure is drawn in this cell, so no plt.show() call is needed.
regr = linear_model.LinearRegression()
regr.fit(X, Y)

# Inspect the results: coefficients (in feature_columns order), intercept,
# and the score on the same data the model was fitted to.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


#What's happening here?


Coefficients: 
 [[ 1.59183918e-02 -1.01014349e-09  1.17553360e+02  2.09300109e+00]]

Intercept: 
 [24.32759066]

R-squared:
0.998741790547199


In [115]:
# Slice off the first feature row, kept as a one-row DataFrame, to use as a
# sample prediction input.
px = X.iloc[:1]
px

Unnamed: 0,population,population_squared,murder_manslaughter,robbery
5,2577,6640929,0,0


In [113]:
# Predict property crime for the single feature row held in px.
regr.predict(px)

array([[65.34257809]])

In [114]:
# Show the first row of df, which carries the observed property_crime for the
# same row that px was taken from, for comparison with the prediction.
df.head(1)

Unnamed: 0,city,population,violent_crime,murder_manslaughter,rape,robbery,agg_assault,property_crime,burglary,larceny,motor_theft,arson,population_squared
5,Addison Town and Village,2577,3,0,0,0,3,24,3,20,1,0,6640929
