In [63]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn import metrics
from sklearn.feature_selection import f_regression
from sklearn.model_selection import cross_val_predict, cross_val_score

import ny_crime2013 as ny

## California Crime Dataset

In [64]:
# Load the California "offenses known to law enforcement" workbook. The first
# four spreadsheet rows are skipped, so the next row supplies the column names.
ca_data = pd.read_excel(
    "table_8_offenses_known_to_law_enforcement_california_by_city_2013.xls",
    skiprows=4,
)
ca_data.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition)1,Rape (legacy definition)2,Robbery,Aggravated assault,Property crime,Burglary,Larceny- theft,Motor vehicle theft,Arson
0,Adelanto,31165.0,198.0,2.0,,15.0,52.0,129.0,886.0,381.0,372.0,133.0,17.0
1,Agoura Hills,20762.0,19.0,0.0,,2.0,10.0,7.0,306.0,109.0,185.0,12.0,7.0
2,Alameda,76206.0,158.0,0.0,,10.0,85.0,63.0,1902.0,287.0,1285.0,330.0,17.0
3,Albany,19104.0,29.0,0.0,,1.0,24.0,4.0,557.0,94.0,388.0,75.0,7.0
4,Alhambra,84710.0,163.0,1.0,,9.0,81.0,72.0,1774.0,344.0,1196.0,234.0,7.0


In [65]:
# Quadratic population term, stored as an extra column on the raw frame.
ca_data["Population2"] = ca_data["Population"] ** 2

In [66]:
# Replace the multi-line spreadsheet headers with short, identifier-style names.
column_aliases = {
    "Property\ncrime": "PropertyCrime",
    "Murder and\nnonnegligent\nmanslaughter": "Murder",
}
ca_data = ca_data.rename(columns=column_aliases)

In [67]:
# Keep only the target (PropertyCrime) and the predictors needed to match the
# NY model. The selection is copied so ca_crime owns its data independently of
# ca_data; the former empty-DataFrame placeholder was dead code and is removed.
ca_crime = ca_data[
    ["PropertyCrime", "Population", "Murder", "Robbery", "Population2"]
].copy()
ca_crime.head()

Unnamed: 0,PropertyCrime,Population,Murder,Robbery,Population2
0,886.0,31165.0,2.0,52.0,971257200.0
1,306.0,20762.0,0.0,10.0,431060600.0
2,1902.0,76206.0,0.0,85.0,5807354000.0
3,557.0,19104.0,0.0,24.0,364962800.0
4,1774.0,84710.0,1.0,81.0,7175784000.0


In [68]:
# Count the missing values in each modelling column (removal happens in the next cell).
ca_crime.isna().sum()

PropertyCrime    2
Population       2
Murder           2
Robbery          2
Population2      2
dtype: int64

In [69]:
# Drop every row that has a missing value in ANY modelling column. The
# regression cannot be fit on NaNs, and how="all" would silently keep rows
# that are only partially empty.
ca_crime = ca_crime.dropna(how="any")
# Confirm that no nulls remain in any column.
ca_crime.isnull().sum()

PropertyCrime    0
Population       0
Murder           0
Robbery          0
Population2      0
dtype: int64

## Re-fit the NY crime model

In [74]:
# Re-fit the NY property-crime regression and score it with 6-fold
# cross-validation.
# cross_val_score, cross_val_predict and metrics are imported in the first
# cell of the notebook. They come from sklearn.model_selection / sklearn:
# the old sklearn.cross_validation module was removed in scikit-learn 0.20.
ny_col = ["Population", "Murder", "Robbery", "Population^2"]

# Predictors and target taken from the frame exposed by the ny_crime2013 module.
X = ny.crime_model[ny_col]
y = ny.crime_model["Property Crime"]

# Fit on the full NY data so that ny_lm itself holds fitted coefficients.
ny_lm = linear_model.LinearRegression()
ny_lm.fit(X, y)

# cross_val_score fits a fresh clone of the estimator on each of the 6 folds;
# for a regressor the default score is R-squared.
ny_scores = cross_val_score(ny_lm, X, y, cv=6)
print("Cross Validated Scores (NY): ",ny_scores)

Cross Validated Scores (NY):  [0.89280965 0.70669796 0.5787358  0.74799954 0.80088199 0.80309054]


In [75]:
# Out-of-fold predictions for every NY row (6 folds), summarised as one R-squared.
ny_pred = cross_val_predict(estimator=ny_lm, X=X, y=y, cv=6)
ny_accuracy = metrics.r2_score(y_true=y, y_pred=ny_pred)
print("Cross-Predicted Accuracy (R-squared,NY): ",ny_accuracy)

Cross-Predicted Accuracy (R-squared,NY):  0.7741292753071796


## Validate NY crime model with CA data (Cross-Validation)

In [76]:
# Same predictors as the NY model; the squared-population column is named
# "Population2" in the CA frame.
ca_col = ["Population", "Murder", "Robbery", "Population2"]
X2 = ca_crime[ca_col]
y2 = ca_crime["PropertyCrime"]

# Use ny_lm, the estimator created in the NY section. The previous name,
# ny_model, is not defined in any visible cell and raised NameError on a
# fresh kernel.
# NOTE(review): cross_val_predict clones the estimator and refits it on the CA
# folds, so this R-squared describes how well the NY model *specification*
# fits CA data, not how well the NY-fitted coefficients transfer. Confirm
# that this is the intended validation.
ca_pred = cross_val_predict(ny_lm, X2, y2, cv=6)
ca_accuracy = metrics.r2_score(y2, ca_pred)
print("Cross-Predicted Accuracy (R-squared,CA): ",ca_accuracy)

Cross-Predicted Accuracy (R-squared,CA):  0.8788907986651734
