In [1]:
import numpy as np
import pandas as pd
import psycopg2

In [2]:
import json

with open('config.json') as f:
    conf = json.load(f)
    host = conf['host']
    database = conf['database']
    user = conf['user']
    passw = conf['passw']

In [3]:
conn_str = "host={} dbname={} user={} password={}".format(host, database, user, passw)
conn = psycopg2.connect(conn_str)

In [4]:
query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2014-01-01' AND '2016-12-31';
'''

In [5]:
df = pd.read_sql(query, con=conn)
df.head(3).T

Unnamed: 0,0,1,2
incident_id,73180647,73180648,73180649
victim_id,79599709,79599710,79599711
date_part,2014,2014,2014
victim_type_id,4,4,4
victim_type,Individual,Individual,Individual
age_num,35,60,38
age_range_low,,,
age_range_high,,,
age_id,5,5,5
age_code,AG,AG,AG


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478607 entries, 0 to 478606
Data columns (total 25 columns):
incident_id               478607 non-null int64
victim_id                 478607 non-null int64
date_part                 478607 non-null float64
victim_type_id            478607 non-null int64
victim_type               478607 non-null object
age_num                   469263 non-null float64
age_range_low             184635 non-null float64
age_range_high            184635 non-null float64
age_id                    478607 non-null int64
age_code                  478607 non-null object
age_name                  478607 non-null object
victim_sex                478607 non-null object
crime_against             478607 non-null object
offense                   478607 non-null object
offense_category          478607 non-null object
offense_group             478607 non-null object
location_id               478607 non-null int64
location_name             478607 non-null object
fips    

### Random Forest Regressor

In [7]:
model_df = df[['date_part','age_num','victim_sex','crime_against','offense_category','location_id','population','officers','civilians','county']]

model_df = pd.get_dummies(model_df, columns = ['victim_sex','crime_against','offense_category','location_id'])
model_df.head()

Unnamed: 0,incident_id,victim_id,date_part,victim_type_id,age_num,location_id,population,officers,civilians,county,...,offense_category_Human Trafficking,offense_category_Kidnapping/Abduction,offense_category_Larceny/Theft Offenses,offense_category_Motor Vehicle Theft,offense_category_Pornography/Obscene Material,offense_category_Prostitution Offenses,offense_category_Robbery,offense_category_Sex Offenses,offense_category_Stolen Property Offenses,offense_category_Weapon Law Violations
0,73180647,79599709,2014.0,4,35.0,13,71780,144,34,BELL,...,0,0,0,0,0,0,0,0,0,0
1,73180648,79599710,2014.0,4,60.0,20,71780,144,34,BELL,...,0,0,0,0,0,0,0,0,0,0
2,73180649,79599711,2014.0,4,38.0,8,71780,144,34,BELL,...,0,0,1,0,0,0,0,0,0,0
3,73180650,79599712,2014.0,4,66.0,5,71780,144,34,BELL,...,0,0,0,0,0,0,0,0,0,0
4,73180653,79599715,2014.0,4,57.0,5,71780,144,34,BELL,...,0,0,0,0,0,0,0,0,0,0


In [8]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478607 entries, 0 to 478606
Data columns (total 39 columns):
incident_id                                                  478607 non-null int64
victim_id                                                    478607 non-null int64
date_part                                                    478607 non-null float64
victim_type_id                                               478607 non-null int64
age_num                                                      469263 non-null float64
location_id                                                  478607 non-null int64
population                                                   478607 non-null int64
officers                                                     478607 non-null int64
civilians                                                    478607 non-null int64
county                                                       478607 non-null object
victim_sex_F                                          

### Drop missing age rows (<2% of data)

In [9]:
model_df.dropna(axis=0, subset=['age_num'],inplace=True)

In [10]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 469263 entries, 0 to 478606
Data columns (total 39 columns):
incident_id                                                  469263 non-null int64
victim_id                                                    469263 non-null int64
date_part                                                    469263 non-null float64
victim_type_id                                               469263 non-null int64
age_num                                                      469263 non-null float64
location_id                                                  469263 non-null int64
population                                                   469263 non-null int64
officers                                                     469263 non-null int64
civilians                                                    469263 non-null int64
county                                                       469263 non-null object
victim_sex_F                                          

In [11]:
model_df.corr()

Unnamed: 0,incident_id,victim_id,date_part,victim_type_id,age_num,location_id,population,officers,civilians,victim_sex_F,...,offense_category_Human Trafficking,offense_category_Kidnapping/Abduction,offense_category_Larceny/Theft Offenses,offense_category_Motor Vehicle Theft,offense_category_Pornography/Obscene Material,offense_category_Prostitution Offenses,offense_category_Robbery,offense_category_Sex Offenses,offense_category_Stolen Property Offenses,offense_category_Weapon Law Violations
incident_id,1.0,0.999976,0.965259,,-0.002664,0.002887,0.010628,0.014611,0.013518,-0.002744,...,0.008413,0.005211,-0.013121,0.006984,0.002893,0.002258,0.006164,0.005783,0.003455,0.011189
victim_id,0.999976,1.0,0.965201,,-0.00263,0.002837,0.008841,0.012889,0.011888,-0.002757,...,0.008429,0.005352,-0.013269,0.006983,0.002915,0.002237,0.006153,0.005811,0.003494,0.011182
date_part,0.965259,0.965201,1.0,,0.001027,0.008024,-0.031618,-0.030453,-0.031313,-0.004555,...,0.004614,0.004182,-0.004021,0.003703,0.002938,0.000205,0.001074,-0.003147,0.002919,0.008993
victim_type_id,,,,,,,,,,,...,,,,,,,,,,
age_num,-0.002664,-0.00263,0.001027,,1.0,-0.033969,-0.046643,-0.046451,-0.034664,-0.083841,...,-0.011458,-0.040158,0.086221,0.028525,-0.014876,-0.00391,-0.046883,-0.17934,0.007951,-0.010345
location_id,0.002887,0.002837,0.008024,,-0.033969,1.0,-0.001015,-0.004754,0.002835,0.035748,...,-0.001393,0.003158,0.016982,-0.027986,0.003242,-0.001758,-0.060355,0.032364,-0.010329,-0.015569
population,0.010628,0.008841,-0.031618,,-0.046643,-0.001015,1.0,0.996748,0.96824,0.017775,...,0.008518,0.001207,0.020232,0.027259,0.004358,0.002975,0.051767,0.01483,-0.022238,0.001738
officers,0.014611,0.012889,-0.030453,,-0.046451,-0.004754,0.996748,1.0,0.971472,0.018113,...,0.008649,0.001217,0.019003,0.028041,0.004523,0.00289,0.051354,0.014684,-0.022332,0.002608
civilians,0.013518,0.011888,-0.031313,,-0.034664,0.002835,0.96824,0.971472,1.0,0.012256,...,0.008372,-0.005045,0.013528,0.027552,0.005202,0.003633,0.046913,0.018203,-0.018902,0.00289
victim_sex_F,-0.002744,-0.002757,-0.004555,,-0.083841,0.035748,0.017775,0.018113,0.012256,1.0,...,0.007445,0.013729,-0.063932,-0.063895,0.006537,-0.000195,-0.04846,0.097895,-0.010264,-0.008481


In [12]:
y = model_df.pop('county').values
#y = model_df.pop('fips').values
X = model_df.values

In [13]:
def standard_confusion_matrix(y_true, y_predict):
    cm = confusion_matrix(y_true, y_predict)
    cm = np.rot90(cm, k=1, axes=(0, 1))
    cm = np.flip(cm, axis=1)
    return cm

In [15]:
# Training and Testing Sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [16]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (351947, 38)
Training Labels Shape: (351947,)
Testing Features Shape: (117316, 38)
Testing Labels Shape: (117316,)


In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, auc, roc_curve

  from numpy.core.umath_tests import inner1d


In [18]:
clf = RandomForestClassifier(oob_score=True, n_estimators=1000, max_depth=80, max_features='auto')
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# calculate scores
precision_micro = precision_score(y_test, preds, average = 'micro')
precision_macro = precision_score(y_test, preds, average = 'macro')
precision_weighted = precision_score(y_test, preds, average = 'weighted')

recall_micro = recall_score(y_test, preds, average = 'micro')
recall_macro = recall_score(y_test, preds, average = 'macro')
recall_weighted = recall_score(y_test, preds, average = 'weighted')

F_1_micro = 2 / ((1/precision_micro) + (1/recall_micro))
F_1_macro = 2 / ((1/precision_macro) + (1/recall_macro))
F_1_weighted = 2 / ((1/precision_weighted) + (1/recall_weighted))

oob=clf.oob_score_
acc = accuracy_score(y_test, preds)

print('Precision Scores:', precision_micro, precision_macro, precision_weighted)
print('Recall Scores:', recall_micro, recall_macro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_macro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)

# generate confusion matrix

cm = standard_confusion_matrix(y_test,preds)

df_cm = pd.DataFrame(cm, index = ['True (positive)', 'True (negative)'])
df_cm.columns = ['Predicted (positive)', 'Predicted (negative)']

sns.heatmap(df_cm, annot=True, fmt="d");

ValueError: Target is multiclass but average='binary'. Please choose another average setting.

In [None]:
for feat, importance in zip(df.columns, clf.feature_importances_):
    print 'feature: {f}, importance: {i}'.format(f=feat, i=importance)