In [1]:
import numpy as np
import pandas as pd
import psycopg2

In [2]:
import json

# Read the database connection settings from the local JSON config file.
with open('config.json') as config_file:
    conf = json.load(config_file)

# Unpack the individual settings once the file handle has been released.
host, database, user, passw = (
    conf['host'],
    conf['database'],
    conf['user'],
    conf['passw'],
)

In [3]:
# Hand the settings to psycopg2 as keyword arguments instead of formatting
# them into a DSN string: a value containing spaces, quotes or backslashes
# (most likely the password) would otherwise need libpq quoting and would
# produce a malformed connection string.
conn = psycopg2.connect(host=host, dbname=database, user=user, password=passw)

In [4]:
# Pull victim-level rows for victim_type_id = 4 (shown as "Individual" in the
# preview output) on incidents dated 2014-01-01 through 2016-12-31, joined to
# the offense, offense-type, location, agency and county (FIPS) tables.
# The year expression has no alias, so it comes back as the column `date_part`.
# NOTE(review): victims are joined to offenses on incident_id only, so a victim
# row would repeat once per offense recorded on the same incident -- confirm
# this fan-out is intended before treating rows as independent observations.
query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2014-01-01' AND '2016-12-31';
'''

In [5]:
# Execute the query once and hold the full result set in memory as `df`.
df = pd.read_sql(sql=query, con=conn)

# Preview the first three records, transposed so each column reads as a row.
df.head(3).transpose()

Unnamed: 0,0,1,2
incident_id,73180647,73180648,73180649
victim_id,79599709,79599710,79599711
date_part,2014,2014,2014
victim_type_id,4,4,4
victim_type,Individual,Individual,Individual
age_num,35,60,38
age_range_low,,,
age_range_high,,,
age_id,5,5,5
age_code,AG,AG,AG


In [6]:
# Column dtypes and non-null counts for the raw query result.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478607 entries, 0 to 478606
Data columns (total 25 columns):
incident_id               478607 non-null int64
victim_id                 478607 non-null int64
date_part                 478607 non-null float64
victim_type_id            478607 non-null int64
victim_type               478607 non-null object
age_num                   469263 non-null float64
age_range_low             184635 non-null float64
age_range_high            184635 non-null float64
age_id                    478607 non-null int64
age_code                  478607 non-null object
age_name                  478607 non-null object
victim_sex                478607 non-null object
crime_against             478607 non-null object
offense                   478607 non-null object
offense_category          478607 non-null object
offense_group             478607 non-null object
location_id               478607 non-null int64
location_name             478607 non-null object
fips    

### Random Forest Classifier

In [7]:
# Keep only the target ('county') and the candidate predictor columns.
model_df = df[[
    'date_part',
    'age_num',
    'victim_sex',
    'crime_against',
    'offense_category',
    'location_id',
    'population',
    'officers',
    'civilians',
    'county',
]]

# One-hot encode the categorical predictors; the numeric columns pass through.
model_df = pd.get_dummies(
    model_df,
    columns=['victim_sex', 'crime_against', 'offense_category', 'location_id']
)
model_df.head()

Unnamed: 0,date_part,age_num,population,officers,civilians,county,victim_sex_F,victim_sex_M,victim_sex_U,crime_against_Not a Crime,...,location_id_38,location_id_39,location_id_40,location_id_41,location_id_42,location_id_43,location_id_44,location_id_45,location_id_46,location_id_47
0,2014.0,35.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014.0,60.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014.0,38.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2014.0,66.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014.0,57.0,71780,144,34,BELL,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Column dtypes and non-null counts after one-hot encoding.
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478607 entries, 0 to 478606
Data columns (total 81 columns):
date_part                                                    478607 non-null float64
age_num                                                      469263 non-null float64
population                                                   478607 non-null int64
officers                                                     478607 non-null int64
civilians                                                    478607 non-null int64
county                                                       478607 non-null object
victim_sex_F                                                 478607 non-null uint8
victim_sex_M                                                 478607 non-null uint8
victim_sex_U                                                 478607 non-null uint8
crime_against_Not a Crime                                    478607 non-null uint8
crime_against_Person                                  

### Drop missing age rows (<2% of data)

In [9]:
# Drop rows with no recorded age.  Rebinding the name (instead of mutating
# with inplace=True) keeps the step explicit and avoids modifying a frame
# that other references may still point to.
model_df = model_df.dropna(axis=0, subset=['age_num'])

In [None]:
# Re-check row and non-null counts now that rows missing age_num are gone.
model_df.info()

In [None]:
# 'county' is a string column at this point; leave it out explicitly so the
# correlation matrix is computed over the numeric features only.  Older
# pandas dropped such columns silently, newer versions raise on them.
model_df.drop('county', axis=1).corr()

In [None]:
# Target: the county of each row.  Select it rather than pop() it so that
# model_df is left untouched and this cell can be re-run without a KeyError.
y = model_df['county'].values
# To predict FIPS instead, 'fips' would first have to be added to the column
# selection that builds model_df; it is not one of the selected columns.
# Features: every remaining column, in model_df column order.
X = model_df.drop('county', axis=1).values

In [None]:
def standard_confusion_matrix(y_true, y_predict):
    """Return the confusion matrix reflected across its anti-diagonal.

    ``confusion_matrix`` is called with its default label ordering, and the
    result ``out`` satisfies ``out[i, j] == cm[n - 1 - j, n - 1 - i]`` for an
    ``n x n`` input ``cm``.  For scikit-learn's layout (rows = true labels,
    columns = predicted labels, ascending label order) that puts predictions
    on the rows and true labels on the columns, both in descending label
    order -- i.e. ``[[TP, FP], [FN, TN]]`` in the binary 0/1 case.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predict : array-like
        Predicted labels.

    Returns
    -------
    numpy.ndarray
        The rearranged square matrix of counts.
    """
    counts = confusion_matrix(y_true, y_predict)
    # Reversing both axes and then transposing is the same rearrangement as a
    # 90-degree counter-clockwise rotation followed by a left/right flip.
    return counts[::-1, ::-1].T

In [None]:
# Training and Testing Sets
from sklearn.model_selection import train_test_split

# Training and Testing Sets
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42
)

In [None]:
# Report the dimensions of each split as a quick sanity check.
for label, array in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(label, array.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, auc, roc_curve

In [None]:
# Local imports: the metrics import cell does not bring in f1_score, and
# seaborn (used for the heatmap below) is not imported anywhere else.
import seaborn as sns
from sklearn.metrics import f1_score

# 'sqrt' is what max_features='auto' meant for classifiers; the 'auto' alias
# is rejected by current scikit-learn.  A fixed random_state makes the fit
# reproducible and n_jobs=-1 builds the trees on all available cores.
clf = RandomForestClassifier(
    oob_score=True,
    n_estimators=1000,
    max_depth=40,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1
)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# calculate scores
precision_micro = precision_score(y_test, preds, average='micro')
precision_macro = precision_score(y_test, preds, average='macro')
precision_weighted = precision_score(y_test, preds, average='weighted')

recall_micro = recall_score(y_test, preds, average='micro')
recall_macro = recall_score(y_test, preds, average='macro')
recall_weighted = recall_score(y_test, preds, average='weighted')

# Use f1_score rather than the harmonic mean of the averaged precision and
# recall: macro / weighted F1 average the per-class F1 values, which is not
# the same number, and f1_score also copes with a zero precision or recall.
F_1_micro = f1_score(y_test, preds, average='micro')
F_1_macro = f1_score(y_test, preds, average='macro')
F_1_weighted = f1_score(y_test, preds, average='weighted')

oob = clf.oob_score_
acc = accuracy_score(y_test, preds)

print('Precision Scores:', precision_micro, precision_macro, precision_weighted)
print('Recall Scores:', recall_micro, recall_macro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_macro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)

# generate confusion matrix
cm = standard_confusion_matrix(y_test, preds)

# standard_confusion_matrix puts predictions on the rows and true labels on
# the columns, both in descending label order.  Label the axes from the
# labels actually present so this works for any number of classes.
labels_desc = sorted(set(y_test) | set(preds), reverse=True)
df_cm = pd.DataFrame(cm, index=labels_desc, columns=labels_desc)
df_cm.index.name = 'Predicted'
df_cm.columns.name = 'True'

sns.heatmap(df_cm, annot=True, fmt="d");

In [None]:
# Importances line up with the columns of X, i.e. the model_df columns minus
# the 'county' target -- not with the raw query columns in df.  Filtering by
# name works whether or not 'county' has already been removed from model_df.
feature_names = [col for col in model_df.columns if col != 'county']
for feat, importance in zip(feature_names, clf.feature_importances_):
    # print() as a function so the cell runs under Python 3 as well.
    print('feature: {f}, importance: {i}'.format(f=feat, i=importance))