In [None]:
import numpy as np
import pandas as pd
import psycopg2

In [None]:
import json

# Read the database connection settings from the local JSON config file.
with open('config.json') as config_file:
    conf = json.load(config_file)

# Unpack the four settings used to build the connection string.
host, database, user, passw = (
    conf[key] for key in ('host', 'database', 'user', 'passw')
)

In [None]:
# Build the libpq connection string with make_dsn rather than str.format:
# make_dsn quotes/escapes each value, so a password (or any other setting)
# containing spaces, quotes or backslashes no longer corrupts the DSN.
conn_str = psycopg2.extensions.make_dsn(
    host=host,
    dbname=database,
    user=user,
    password=passw,
)
# Open the database connection used by the pd.read_sql calls below.
conn = psycopg2.connect(conn_str)

In [None]:
# Training extract: one row per DISTINCT combination of the selected
# incident / victim / offense / location / agency fields, restricted to
# victim_type_id = 4 and incidents dated 2013-01-01 through 2015-12-31.
#
# The AS aliases are unquoted, and PostgreSQL folds unquoted identifiers to
# lowercase, which is why later cells address these columns by lowercase names
# such as 'victim_sex' and 'county'.
# The date_part('year', ...) expression has no alias, so its column gets a
# database-assigned name.
# NOTE(review): nibrs_offense is joined to the victim on incident_id alone, so
# every victim is paired with every offense recorded for the incident -- confirm
# this fan-out is intended.
# NOTE(review): victim_type_id = 4 is presumably the "individual" victim type;
# verify against nibrs_victim_type.
train_query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2013-01-01' AND '2015-12-31';
'''

In [None]:
# Run the training query over the open connection and load the rows into a DataFrame.
train_df = pd.read_sql(sql=train_query, con=conn)

# Preview the first three rows, transposed so each field appears on its own line.
train_df.head(3).transpose()

In [None]:
# Summarize the training extract: column dtypes and non-null counts.
train_df.info()

In [None]:
# Test extract: same SELECT list, joins and victim_type_id = 4 filter as
# train_query; only the date window differs (2016-01-01 through 2016-12-31).
#
# The AS aliases are unquoted, and PostgreSQL folds unquoted identifiers to
# lowercase, which is why later cells address these columns by lowercase names
# such as 'victim_sex' and 'county'.
# The date_part('year', ...) expression has no alias, so its column gets a
# database-assigned name.
# NOTE(review): nibrs_offense is joined to the victim on incident_id alone, so
# every victim is paired with every offense recorded for the incident -- confirm
# this fan-out is intended.
test_query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2016-01-01' AND '2016-12-31';
'''

In [None]:
# Run the 2016 query over the open connection and load the rows into a DataFrame.
test_df = pd.read_sql(sql=test_query, con=conn)

# Preview the first three rows, transposed so each field appears on its own line.
test_df.head(3).transpose()

In [None]:
# Summarize the test extract: column dtypes and non-null counts.
test_df.info()

### Random Forest Classifier

In [None]:
# Columns kept for modelling; 'county' is the label and is split off in a later cell.
train_keep_cols = [
    'age_num', 'victim_sex', 'crime_against', 'offense_category',
    'location_id', 'population', 'officers', 'civilians', 'county',
]
# Categorical columns that get expanded into one-hot indicator columns.
train_onehot_cols = ['victim_sex', 'crime_against', 'offense_category', 'location_id']

train_model_df = pd.get_dummies(train_df[train_keep_cols], columns=train_onehot_cols)
train_model_df.head()

In [None]:
# Build the 2016 modelling frame from the same columns as the training frame.
test_model_df = test_df[['age_num','victim_sex','crime_against','offense_category','location_id','population','officers','civilians','county']]

test_model_df = pd.get_dummies(test_model_df, columns = ['victim_sex','crime_against','offense_category','location_id'])

# One-hot encoding each frame separately produces different indicator columns
# whenever a category occurs in only one of the two periods, which would leave
# X_test with a different width or column order than X_train. Align the test
# frame to the training layout: indicators absent from 2016 are added as
# all-zero columns, and categories never seen in training are dropped.
test_model_df = test_model_df.reindex(columns=train_model_df.columns, fill_value=0)
test_model_df.head()

### Clean Training and Test Data: Drop missing age rows (<1% of data)

In [None]:
# Remove rows without a victim age from both modelling frames.
required_cols = ['age_num']
train_model_df = train_model_df.dropna(axis=0, subset=required_cols)
test_model_df = test_model_df.dropna(axis=0, subset=required_cols)

In [None]:
# Separate the label (county) from the features for each set.
# pop() removes 'county' from the frame in place, so whatever columns remain
# afterwards are exactly the model inputs. Because of that in-place removal,
# this cell cannot be re-run without first rebuilding the frames above.
y_train, X_train = train_model_df.pop('county').values, train_model_df.values
y_test, X_test = test_model_df.pop('county').values, test_model_df.values

In [None]:
# Report the dimensions of each feature matrix and label vector.
for shape_label, array in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(shape_label, array.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, auc, roc_curve, precision_recall_curve, precision_recall_fscore_support

In [None]:
# Fit a random forest that predicts the county label from the victim,
# offense, location and agency features, then score the 2016 hold-out set.
clf = RandomForestClassifier(
    oob_score=True,       # keep the out-of-bag accuracy estimate (read later as clf.oob_score_)
    n_estimators=500,
    max_depth=50,
    # 'auto' meant sqrt(n_features) for classifiers and was removed in
    # scikit-learn 1.3; 'sqrt' is the same setting under its current name.
    max_features='sqrt',
    random_state=42,      # fixed seed so Restart & Run All reproduces the same forest
)
clf.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for the test set.
preds = clf.predict(X_test)
proba_preds = clf.predict_proba(X_test)

In [None]:
# Calculate precision, recall and F1 on the test set under three averaging
# schemes (micro, macro, support-weighted).
# precision_recall_fscore_support returns (precision, recall, fscore, support).
prfs_micro = precision_recall_fscore_support(y_test, preds, average='micro')
prfs_macro = precision_recall_fscore_support(y_test, preds, average='macro')
prfs_weighted = precision_recall_fscore_support(y_test, preds, average='weighted')

precision_micro, recall_micro, F_1_micro = prfs_micro[:3]
precision_macro, recall_macro, F_1_macro = prfs_macro[:3]
precision_weighted, recall_weighted, F_1_weighted = prfs_weighted[:3]
# The F1 values are taken from scikit-learn instead of being derived as
# 2 / (1/precision + 1/recall) from the averaged precision and recall: macro and
# weighted F1 are averages of the per-class F1 scores, which the harmonic mean
# of already-averaged precision and recall does not reproduce.

# Out-of-bag score of the fitted forest and plain accuracy on the test set.
oob=clf.oob_score_
acc = accuracy_score(y_test, preds)

print('Precision Scores:', precision_micro, precision_macro, precision_weighted)
print('Recall Scores:', recall_micro, recall_macro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_macro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)
print('PRF_Support Scores:', prfs_micro, prfs_macro, prfs_weighted)

In [None]:
import operator
# Pair every model input column with its importance from the fitted forest.
# The original cell referenced `model_df`, which is never defined; the feature
# names live in train_model_df. 'county' was popped off that frame when the
# label was split out, so its remaining columns line up with the columns of X_train.
feature_names = train_model_df.columns
assert len(feature_names) == len(clf.feature_importances_), \
    "feature names do not match the features the model was fitted on"

feats = dict(zip(feature_names, clf.feature_importances_))  # feature_name -> feature_importance
# Turn the mapping into a list of (feature, importance) tuples, most important first.
feats = sorted(feats.items(), key = operator.itemgetter(1), reverse=True)
feats