In [1]:
import numpy as np
import pandas as pd
import psycopg2

In [2]:
import json

with open('config.json') as f:
    conf = json.load(f)
    host = conf['host']
    database = conf['database']
    user = conf['user']
    passw = conf['passw']

In [3]:
conn_str = "host={} dbname={} user={} password={}".format(host, database, user, passw)
conn = psycopg2.connect(conn_str)

In [4]:
train_query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2013-01-01' AND '2015-12-31';
'''

In [5]:
train_df = pd.read_sql(train_query, con=conn)
train_df.head(3).T

Unnamed: 0,0,1,2
incident_id,67693343,67693344,67693345
victim_id,73595250,73595251,73595252
date_part,2013,2013,2013
victim_type_id,4,4,4
victim_type,Individual,Individual,Individual
age_num,46,57,51
age_range_low,,,
age_range_high,,,
age_id,5,5,5
age_code,AG,AG,AG


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 468086 entries, 0 to 468085
Data columns (total 25 columns):
incident_id               468086 non-null int64
victim_id                 468086 non-null int64
date_part                 468086 non-null float64
victim_type_id            468086 non-null int64
victim_type               468086 non-null object
age_num                   458866 non-null float64
age_range_low             23687 non-null float64
age_range_high            23687 non-null float64
age_id                    468086 non-null int64
age_code                  468086 non-null object
age_name                  468086 non-null object
victim_sex                468086 non-null object
crime_against             468086 non-null object
offense                   468086 non-null object
offense_category          468086 non-null object
offense_group             468086 non-null object
location_id               468086 non-null int64
location_name             468086 non-null object
fips      

In [7]:
test_query = '''
SELECT      DISTINCT inc.incident_id AS INCIDENT_ID,
            vic.victim_id AS VICTIM_ID,
            date_part('year',inc.incident_date),
            vic.victim_type_id AS VICTIM_TYPE_ID,
            ty.victim_type_name AS VICTIM_TYPE,
            vic.age_num,
            vic.age_range_low_num AS AGE_RANGE_LOW,
            vic.age_range_high_num AS AGE_RANGE_HIGH,
            vic.age_id,
            age.age_code,
            age.age_name,
            vic.sex_code AS VICTIM_SEX,
            oft.crime_against AS CRIME_AGAINST,
            oft.offense_name AS OFFENSE,
            oft.offense_category_name AS OFFENSE_CATEGORY,
            oft.offense_group AS OFFENSE_GROUP,
            off.location_id AS LOCATION_ID,
            loc.location_name AS LOCATION_NAME,
            ori.fips AS FIPS,
            ori.countyname AS COUNTY,
            ori.name AS ORI_NAME,
            ags.population AS POPULATION,
            ags.population_group_desc AS POPULATION_DESCRIPTION,
            ags.total_officers AS OFFICERS,
            ags.total_civilians AS CIVILIANS
            
FROM        nibrs_victim as vic
JOIN        nibrs_victim_type as ty
ON          vic.victim_type_id = ty.victim_type_id
JOIN        nibrs_age as age
ON          age.age_id = vic.age_id
JOIN        nibrs_offense as off
ON          off.incident_id = vic.incident_id
JOIN        nibrs_offense_type as oft
ON          oft.offense_type_id = off.offense_type_id
JOIN        nibrs_location_type as loc
ON          off.location_id = loc.location_id
JOIN        nibrs_incident as inc
ON          inc.incident_id = vic.incident_id
JOIN        cde_agencies as ags
ON          ags.agency_id = inc.agency_id
JOIN        ori_to_fips as ori
ON          ori.ori9 = ags.ori

WHERE       vic.victim_type_id = 4
AND         inc.incident_date BETWEEN '2016-01-01' AND '2016-12-31';
'''

In [8]:
test_df = pd.read_sql(test_query, con=conn)
test_df.head(3).T

Unnamed: 0,0,1,2
incident_id,87264241,87264244,87264248
victim_id,95370984,95370986,95370990
date_part,2016,2016,2016
victim_type_id,4,4,4
victim_type,Individual,Individual,Individual
age_num,59,50,26
age_range_low,59,50,26
age_range_high,0,0,0
age_id,5,5,5
age_code,AG,AG,AG


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 164010 entries, 0 to 164009
Data columns (total 25 columns):
incident_id               164010 non-null int64
victim_id                 164010 non-null int64
date_part                 164010 non-null float64
victim_type_id            164010 non-null int64
victim_type               164010 non-null object
age_num                   160948 non-null float64
age_range_low             160948 non-null float64
age_range_high            160948 non-null float64
age_id                    164010 non-null int64
age_code                  164010 non-null object
age_name                  164010 non-null object
victim_sex                164010 non-null object
crime_against             164010 non-null object
offense                   164010 non-null object
offense_category          164010 non-null object
offense_group             164010 non-null object
location_id               164010 non-null int64
location_name             164010 non-null object
fips    

### Random Forest Regressor

In [10]:
train_model_df = train_df[['age_num','victim_sex','crime_against','offense_category','location_id','population','officers','civilians','county']]

train_model_df = pd.get_dummies(train_model_df, columns = ['victim_sex','crime_against','county','location_id'])
train_model_df.head()

Unnamed: 0,age_num,offense_category,population,officers,civilians,victim_sex_F,victim_sex_M,victim_sex_U,crime_against_Not a Crime,crime_against_Person,...,location_id_37,location_id_38,location_id_39,location_id_40,location_id_41,location_id_42,location_id_43,location_id_44,location_id_45,location_id_46
0,46.0,Larceny/Theft Offenses,198770,343,68,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,57.0,Larceny/Theft Offenses,198770,343,68,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,51.0,Larceny/Theft Offenses,198770,343,68,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24.0,Larceny/Theft Offenses,198770,343,68,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.0,Burglary/Breaking & Entering,198770,343,68,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
test_model_df = test_df[['age_num','victim_sex','crime_against','offense_category','location_id','population','officers','civilians','county']]

test_model_df = pd.get_dummies(test_model_df, columns = ['victim_sex','crime_against','county','location_id'])
test_model_df.head()

Unnamed: 0,age_num,offense_category,population,officers,civilians,victim_sex_F,victim_sex_M,victim_sex_U,crime_against_Not a Crime,crime_against_Person,...,location_id_38,location_id_39,location_id_40,location_id_41,location_id_42,location_id_43,location_id_44,location_id_45,location_id_46,location_id_47
0,59.0,Larceny/Theft Offenses,6212,19,13,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,50.0,Fraud Offenses,6212,19,13,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,26.0,Larceny/Theft Offenses,6212,19,13,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,25.0,Assault Offenses,645,2,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,26.0,Larceny/Theft Offenses,645,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Clean Training and Test Data: Drop missing age rows (<1% of data)

In [12]:
train_model_df.dropna(axis=0, subset=['age_num'],inplace=True)
test_model_df.dropna(axis=0, subset=['age_num'],inplace=True)

In [13]:
train_model_df.drop(['county_CHEROKEE','county_COKE','county_GRAYSON','county_LLANO','county_NOLAN'], axis=1, inplace=True)

In [14]:
test_model_df.drop(['location_id_47','county_CORYELL','county_HUTCHINSON','county_JEFFERSON','county_KENDALL','county_LAMAR','county_LAMPASAS','county_MADISON','county_OCHILTREE','county_RANDALL','county_SAN PATRICIO','county_SHELBY','county_STARR','county_WALKER','county_WISE','county_YOUNG'], axis=1, inplace=True)

In [15]:
# Training and Testing Sets
y_train = train_model_df.pop('offense_category').values
X_train = train_model_df.values
y_test = test_model_df.pop('offense_category').values
X_test = test_model_df.values

In [16]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (458866, 100)
Training Labels Shape: (458866,)
Testing Features Shape: (160948, 100)
Testing Labels Shape: (160948,)


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, auc, roc_curve, precision_recall_curve, precision_recall_fscore_support

  from numpy.core.umath_tests import inner1d


In [19]:
clf = RandomForestClassifier(oob_score=True, n_estimators=500, max_depth=70, max_features='auto')
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=70, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [20]:
preds = clf.predict(X_test)
proba_preds = clf.predict_proba(X_test)

In [21]:
# calculate scores
precision_micro = precision_score(y_test, preds, average = 'micro')
precision_macro = precision_score(y_test, preds, average = 'macro')
precision_weighted = precision_score(y_test, preds, average = 'weighted')

recall_micro = recall_score(y_test, preds, average = 'micro')
recall_macro = recall_score(y_test, preds, average = 'macro')
recall_weighted = recall_score(y_test, preds, average = 'weighted')

F_1_micro = 2 / ((1/precision_micro) + (1/recall_micro))
F_1_macro = 2 / ((1/precision_macro) + (1/recall_macro))
F_1_weighted = 2 / ((1/precision_weighted) + (1/recall_weighted))

oob=clf.oob_score_
acc = accuracy_score(y_test, preds)

prfs_micro = precision_recall_fscore_support(y_test, preds, average='micro')
prfs_macro = precision_recall_fscore_support(y_test, preds, average='macro')
prfs_weighted = precision_recall_fscore_support(y_test, preds, average='weighted')

#Prec, Rec, Thresh = precision_recall_curve(y_test, proba_preds)

print('Precision Scores:', precision_micro, precision_macro, precision_weighted)
print('Recall Scores:', recall_micro, recall_macro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_macro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)
print('PRF_Support Scores:', prfs_micro, prfs_macro, prfs_weighted)
#print(Prec, Rec, Thresh)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Precision Scores: 0.5951425305067475 0.2578632329007785 0.5424222423640166
Recall Scores: 0.5951425305067475 0.20074419982747257 0.5951425305067475
F 1 Scores: 0.5951425305067475 0.22574666112864852 0.5675607290633634
oob Score: 0.5864282818949322
Accuracy Score: 0.5951425305067475
PRF_Support Scores: (0.5951425305067475, 0.5951425305067475, 0.5951425305067475, None) (0.2578632329007785, 0.20074419982747257, 0.2088025255615289, None) (0.5424222423640166, 0.5951425305067475, 0.548087694438484, None)


In [None]:
import operator
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(train_model_df.columns, clf.feature_importances_):
    feats[feature] = importance
feats = sorted(feats.items(), key = operator.itemgetter(1), reverse=True)
feats

In [None]:
for col in train_model_df.columns:
    print(col)

In [None]:
for col in test_model_df.columns:
    print(col)

In [None]:
test_model_df.drop(['location_id_47','county_CORYELL','county_MADISON','county_OCHILTREE','county_RANDALL','county_SAN PATRICIO','county_SHELBY','county_STARR','county_WALKER','county_WISE','county_YOUNG'])

In [22]:
proba_preds

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.08234184, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])