In [1]:
import numpy as np
import pandas as pd
import psycopg2

In [2]:
import json

with open('config.json') as f:
    conf = json.load(f)
    host = conf['host']
    database = conf['database']
    user = conf['user']
    passw = conf['passw']

In [3]:
conn_str = "host={} dbname={} user={} password={}".format(host, database, user, passw)
conn = psycopg2.connect(conn_str)

In [4]:
query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2014-01-01' AND '2016-12-31';
'''

In [5]:
df = pd.read_sql(query, con=conn)
df.head(3).T

Unnamed: 0,0,1,2
incident_id,73180647,73180648,73180649
victim_id,79599709,79599710,79599711
date_part,2014,2014,2014
victim_type_id,4,4,4
victim_type,Individual,Individual,Individual
age_num,35,60,38
age_range_low,,,
age_range_high,,,
age_id,5,5,5
age_code,AG,AG,AG


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478607 entries, 0 to 478606
Data columns (total 25 columns):
incident_id               478607 non-null int64
victim_id                 478607 non-null int64
date_part                 478607 non-null float64
victim_type_id            478607 non-null int64
victim_type               478607 non-null object
age_num                   469263 non-null float64
age_range_low             184635 non-null float64
age_range_high            184635 non-null float64
age_id                    478607 non-null int64
age_code                  478607 non-null object
age_name                  478607 non-null object
victim_sex                478607 non-null object
crime_against             478607 non-null object
offense                   478607 non-null object
offense_category          478607 non-null object
offense_group             478607 non-null object
location_id               478607 non-null int64
location_name             478607 non-null object
fips    

### Random Forest Regressor

In [7]:
model_df = df[['age_num','victim_sex','crime_against','offense_category','location_id','population','officers','civilians','county']]

model_df = pd.get_dummies(model_df, columns = ['victim_sex','crime_against','offense_category','location_id'])
model_df.head()

Unnamed: 0,date_part,age_num,population,officers,civilians,county,victim_sex_F,victim_sex_M,victim_sex_U,crime_against_Not a Crime,...,location_id_38,location_id_39,location_id_40,location_id_41,location_id_42,location_id_43,location_id_44,location_id_45,location_id_46,location_id_47
0,2014.0,35.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2014.0,60.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2014.0,38.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2014.0,66.0,71780,144,34,BELL,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2014.0,57.0,71780,144,34,BELL,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 478607 entries, 0 to 478606
Data columns (total 81 columns):
date_part                                                    478607 non-null float64
age_num                                                      469263 non-null float64
population                                                   478607 non-null int64
officers                                                     478607 non-null int64
civilians                                                    478607 non-null int64
county                                                       478607 non-null object
victim_sex_F                                                 478607 non-null uint8
victim_sex_M                                                 478607 non-null uint8
victim_sex_U                                                 478607 non-null uint8
crime_against_Not a Crime                                    478607 non-null uint8
crime_against_Person                                  

### Drop missing age rows (<2% of data)

In [9]:
model_df.dropna(axis=0, subset=['age_num'],inplace=True)

In [10]:
model_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 469263 entries, 0 to 478606
Data columns (total 81 columns):
date_part                                                    469263 non-null float64
age_num                                                      469263 non-null float64
population                                                   469263 non-null int64
officers                                                     469263 non-null int64
civilians                                                    469263 non-null int64
county                                                       469263 non-null object
victim_sex_F                                                 469263 non-null uint8
victim_sex_M                                                 469263 non-null uint8
victim_sex_U                                                 469263 non-null uint8
crime_against_Not a Crime                                    469263 non-null uint8
crime_against_Person                                  

In [11]:
model_df.corr()

Unnamed: 0,date_part,age_num,population,officers,civilians,victim_sex_F,victim_sex_M,victim_sex_U,crime_against_Not a Crime,crime_against_Person,...,location_id_38,location_id_39,location_id_40,location_id_41,location_id_42,location_id_43,location_id_44,location_id_45,location_id_46,location_id_47
date_part,1.000000,0.001027,-0.031618,-0.030453,-0.031313,-0.004555,0.004253,0.004617,-0.001154,0.013002,...,0.008388,0.000088,0.001433,0.005636,0.001978,-0.003817,-0.000730,0.004610,0.000781,0.001752
age_num,0.001027,1.000000,-0.046643,-0.046451,-0.034664,-0.083841,0.083415,0.006572,-0.001317,-0.266340,...,-0.041867,0.001301,-0.026239,-0.100867,0.007214,-0.006908,0.001600,-0.007477,0.024696,0.002337
population,-0.031618,-0.046643,1.000000,0.996748,0.968240,0.017775,-0.017049,-0.011101,0.002023,-0.020567,...,0.002179,-0.001064,-0.000524,0.024400,0.023073,0.004359,-0.001958,0.005825,-0.016822,-0.000182
officers,-0.030453,-0.046451,0.996748,1.000000,0.971472,0.018113,-0.017395,-0.010973,0.002085,-0.016984,...,0.001399,-0.000838,0.000245,0.022004,0.023279,0.000792,-0.002008,0.005457,-0.017774,-0.000538
civilians,-0.031313,-0.034664,0.968240,0.971472,1.000000,0.012256,-0.011566,-0.010530,0.001694,-0.013697,...,-0.001078,-0.001145,0.000116,0.024315,0.020988,0.003973,-0.002207,0.006122,-0.018854,-0.000098
victim_sex_F,-0.004555,-0.083841,0.017775,0.018113,0.012256,1.000000,-0.997852,-0.033783,-0.005423,0.130763,...,-0.001384,0.000828,0.001960,0.002462,-0.003950,0.007683,0.001330,-0.003329,0.016304,-0.001504
victim_sex_M,0.004253,0.083415,-0.017049,-0.017395,-0.011566,-0.997852,1.000000,-0.031757,0.005435,-0.130034,...,0.001494,-0.000811,-0.001893,-0.002342,0.004019,-0.007662,-0.001320,0.003388,-0.016306,0.001507
victim_sex_U,0.004617,0.006572,-0.011101,-0.010973,-0.010530,-0.033783,-0.031757,1.000000,-0.000173,-0.011259,...,-0.001679,-0.000262,-0.001018,-0.001836,-0.001053,-0.000327,-0.000144,-0.000888,0.000011,-0.000048
crime_against_Not a Crime,-0.001154,-0.001317,0.002023,0.002085,0.001694,-0.005423,0.005435,-0.000173,1.000000,-0.003371,...,-0.000406,-0.000042,-0.000163,-0.000582,-0.000169,-0.000262,-0.000023,-0.000143,-0.000470,-0.000008
crime_against_Person,0.013002,-0.266340,-0.020567,-0.016984,-0.013697,0.130763,-0.130034,-0.011259,-0.003371,1.000000,...,0.021576,-0.001600,0.007038,0.051680,0.015293,-0.015901,-0.000662,-0.003131,0.009054,-0.000935


In [12]:
y = model_df.pop('county').values
#y = model_df.pop('fips').values
X = model_df.values

In [13]:
# def standard_confusion_matrix(y_true, y_predict):
#     cm = confusion_matrix(y_true, y_predict)
#     cm = np.rot90(cm, k=1, axes=(0, 1))
#     cm = np.flip(cm, axis=1)
#     return cm

In [14]:
# Training and Testing Sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [15]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (351947, 80)
Training Labels Shape: (351947,)
Testing Features Shape: (117316, 80)
Testing Labels Shape: (117316,)


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, auc, roc_curve

  from numpy.core.umath_tests import inner1d


In [17]:
clf = RandomForestClassifier(oob_score=True, n_estimators=500, max_depth=50, max_features='auto')
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

# calculate scores
precision_micro = precision_score(y_test, preds, average = 'micro')
precision_macro = precision_score(y_test, preds, average = 'macro')
precision_weighted = precision_score(y_test, preds, average = 'weighted')

recall_micro = recall_score(y_test, preds, average = 'micro')
recall_macro = recall_score(y_test, preds, average = 'macro')
recall_weighted = recall_score(y_test, preds, average = 'weighted')

F_1_micro = 2 / ((1/precision_micro) + (1/recall_micro))
F_1_macro = 2 / ((1/precision_macro) + (1/recall_macro))
F_1_weighted = 2 / ((1/precision_weighted) + (1/recall_weighted))

oob=clf.oob_score_
acc = accuracy_score(y_test, preds)

print('Precision Scores:', precision_micro, precision_macro, precision_weighted)
print('Recall Scores:', recall_micro, recall_macro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_macro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)

Precision Scores: 0.9907173787036722 0.8563964179681841 0.9905024258121831
Recall Scores: 0.9907173787036722 0.7794630763686706 0.9907173787036722
F 1 Scores: 0.9907173787036722 0.8161206862221306 0.9906098905972462
oob Score: 0.9904502666594686
Accuracy Score: 0.9907173787036722


ValueError: Shape of passed values is (61, 61), indices imply (61, 2)

In [20]:
for feat, importance in zip(model_df.columns, clf.feature_importances_):
    print('feature: {f}, importance: {i}'.format(f=feat, i=importance))

feature: date_part, importance: 0.005239129726177896
feature: age_num, importance: 0.021228963604340997
feature: population, importance: 0.3253757770607214
feature: officers, importance: 0.3191835506399411
feature: civilians, importance: 0.30313051238645106
feature: victim_sex_F, importance: 0.001223192808886128
feature: victim_sex_M, importance: 0.001227826692347265
feature: victim_sex_U, importance: 0.00018103362438384382
feature: crime_against_Not a Crime, importance: 2.518491870969487e-06
feature: crime_against_Person, importance: 0.00038697757008600205
feature: crime_against_Property, importance: 0.0004748641120066831
feature: crime_against_Society, importance: 0.00017430541434127432
feature: offense_category_Arson, importance: 8.904338141191497e-05
feature: offense_category_Assault Offenses, importance: 0.00047476444256096164
feature: offense_category_Bribery, importance: 7.6061735644369135e-06
feature: offense_category_Burglary/Breaking & Entering, importance: 0.0006634708965841