# Intelligent Model: Random Forest Classifier

In [None]:
import pandas as pd

### Read in query outputs to dataframes

In [None]:
# Load each query/census extract from its CSV file into its own dataframe.
crime_df, label_df, h_df, f_df, pop_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'crime_query.csv',
        'label_query.csv',
        'hosp_query.csv',
        'fire_query.csv',
        'census_populations.csv',
    )
)

In [None]:
# Preview the first rows of crime_df, transposed so each column is shown as a row.
crime_df.head().T

### Get crime count by county and join to crime_df

In [None]:
# Count the non-null incident_id values per county and expose the result as a
# two-column frame: county, crime_cnt.
tmp_df4 = (
    crime_df.groupby('county')['incident_id']
    .count()
    .reset_index(name='crime_cnt')
)
tmp_df4

In [None]:
# Attach the per-county crime_cnt to every incident row, looked up by county.
county_crime_cnt = tmp_df4.set_index('county')
crime_df = crime_df.join(county_crime_cnt, on='county')

In [None]:
# Preview crime_df again (transposed) to check the newly joined crime_cnt column.
crime_df.head().T

### Get hospital and bed counts by county and join to h_df

In [None]:
# Preview the first rows of the raw hospital query output.
h_df.head()

In [None]:
# Raise every bed count to at least 1 (values below 1 become 1; NaN is left as is).
# Series.clip_lower was deprecated in pandas 0.24 and removed in 1.0, so the
# supported spelling clip(lower=...) is used instead.
h_df['beds'] = h_df['beds'].clip(lower=1)

In [None]:
# Count the non-null hosp_id values per county: one row per county with the
# columns county and hosp_cnt.
tmp_df = (
    h_df.groupby('county')['hosp_id']
    .count()
    .reset_index(name='hosp_cnt')
)
tmp_df

In [None]:
# Total the beds column per county: one row per county with the columns
# county and bed_cnt.
tmp_df2 = (
    h_df.groupby('county')['beds']
    .sum()
    .reset_index(name='bed_cnt')
)
tmp_df2

In [None]:
# Replace the raw hospital rows with a per-county summary that carries both
# hosp_cnt and bed_cnt.
bed_cnt_by_county = tmp_df2.set_index('county')
h_df = tmp_df.join(bed_cnt_by_county, on='county')
h_df

### Get fire station count by county (stored in tmp_df3)

In [None]:
# Preview the first rows of the raw fire station query output.
f_df.head()

In [None]:
# Count the non-null fire_id values per county: one row per county with the
# columns county and fire_cnt.
tmp_df3 = (
    f_df.groupby('county')['fire_id']
    .count()
    .reset_index(name='fire_cnt')
)
tmp_df3

### Merge hospital dataframe and fire station dataframe

In [None]:
# Start from the fire-station counts and pull in hosp_cnt / bed_cnt for the
# same county. This is a left join, so counties missing from h_df get NaN in
# the hospital columns.
hosp_by_county = h_df.set_index('county')
hf_df = tmp_df3.join(hosp_by_county, on='county')
hf_df

### Join to training dataframe

In [None]:
# Build the training frame: every crime row plus its county's fire, hospital
# and bed counts.
county_services = hf_df.set_index('county')
train_df = crime_df.join(county_services, on='county')

In [None]:
# Preview the first three training rows, transposed so columns read as rows.
train_df.head(3).T

In [None]:
# Print the column dtypes and non-null counts to see how many rows lack joined values.
train_df.info()

### Get crime count by county for label_df

In [None]:
# Count the non-null incident_id values per county within label_df: one row
# per county with the columns county and crime_cnt.
tmp_df5 = (
    label_df.groupby('county')['incident_id']
    .count()
    .reset_index(name='crime_cnt')
)
tmp_df5

### Join to testing dataframe

In [None]:
# Attach the per-county crime_cnt (computed from label_df itself) to every row.
label_crime_cnt = tmp_df5.set_index('county')
label_df = label_df.join(label_crime_cnt, on='county')

In [None]:
# Preview label_df (transposed) to check the newly joined crime_cnt column.
label_df.head().T

### Join hospital and fire station counts to testing dataframe

In [None]:
# Build the testing frame: every label_df row plus its county's fire, hospital
# and bed counts.
county_services = hf_df.set_index('county')
test_df = label_df.join(county_services, on='county')

In [None]:
# Preview the first three testing rows, transposed so columns read as rows.
test_df.head(3).T

In [None]:
# Print the column dtypes and non-null counts for the testing frame.
test_df.info()

### Bring in population feature

In [None]:
# Preview the raw census population table.
pop_df.head()

In [None]:
# Upper-case the county names in the census table.
# NOTE(review): presumably this matches the casing used in the crime/label
# frames so the merge on county lines up; confirm against the data.
county_upper = pop_df['county'].str.upper()
pop_df['county'] = county_upper

In [None]:
# Preview pop_df again to confirm the county names are now upper-case.
pop_df.head()

In [None]:
# Bring the census columns into the training frame, matching on county and year.
# This is an outer merge, so (county, year) pairs that exist only in pop_df
# appear as extra rows with NaN in the crime columns.
train_df = train_df.merge(pop_df, how='outer', on=['county', 'year'])
train_df

In [None]:
# Remove rows with no incident_id. These include the census-only rows that the
# outer merge introduced.
train_df.dropna(subset=['incident_id'], axis='index', inplace=True)

In [None]:
# Display the training frame after dropping rows without an incident_id.
train_df

In [None]:
# Print the column dtypes and non-null counts now that the census columns are merged in.
train_df.info()

In [None]:
# Bring the census columns into the testing frame, matching on county and year.
# This is an outer merge, so (county, year) pairs that exist only in pop_df
# appear as extra rows with NaN in the crime columns.
test_df = test_df.merge(pop_df, how='outer', on=['county', 'year'])
test_df

In [None]:
# Remove rows with no incident_id. These include the census-only rows that the
# outer merge introduced.
test_df.dropna(subset=['incident_id'], axis='index', inplace=True)

In [None]:
# Print the column dtypes and non-null counts now that the census columns are merged in.
test_df.info()

## Pass training and testing dataframes through feature engineering script

In [None]:
# feature_engineering comes from the project-local Features_Script module; its
# implementation is not visible in this notebook.
from Features_Script import feature_engineering

# Run both frames through the script. The returned frames (trn_df, tst_df) are
# what the model below is trained and evaluated on.
# NOTE(review): the following cells modify train_df/test_df in place and then
# compare them with trn_df/tst_df. If feature_engineering mutates its arguments
# or returns the same objects, that comparison is not independent. Confirm.
trn_df, tst_df = feature_engineering(train_df, test_df)

### Do the feature engineering manually in jupyter

In [None]:
# Drop rows missing victim age, hospital count or population
# (the original note puts each at <2% of rows; not verified here).
train_df.dropna(axis=0, subset=['age_num', 'hosp_cnt', 'population'], inplace=True)

# Ratio features: new column = numerator column / denominator column.
for ratio_col, numerator_col, denominator_col in (
    ('crime_pop_ratio', 'crime_cnt', 'population'),
    ('beds_pop_ratio', 'bed_cnt', 'population'),
    ('beds_crime_ratio', 'bed_cnt', 'crime_cnt'),
    ('fire_pop_ratio', 'fire_cnt', 'population'),
    ('fire_crime_ratio', 'fire_cnt', 'crime_cnt'),
):
    train_df[ratio_col] = train_df[numerator_col] / train_df[denominator_col]

# Keep only the model features plus the county label.
categorical_cols = ['victim_sex', 'offense_category', 'location_id', 'population_description']
kept_cols = (
    ['age_num']
    + categorical_cols
    + ['officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
       'fire_pop_ratio', 'fire_crime_ratio', 'county']
)

# One-hot encode the categorical features.
train_df = pd.get_dummies(train_df[kept_cols], columns=categorical_cols)

In [None]:
# Drop rows missing victim age, hospital count or population
# (the original note puts each at <2% of rows; not verified here).
test_df.dropna(axis=0, subset=['age_num', 'hosp_cnt', 'population'], inplace=True)

# Ratio features: new column = numerator column / denominator column.
for ratio_col, numerator_col, denominator_col in (
    ('crime_pop_ratio', 'crime_cnt', 'population'),
    ('beds_pop_ratio', 'bed_cnt', 'population'),
    ('beds_crime_ratio', 'bed_cnt', 'crime_cnt'),
    ('fire_pop_ratio', 'fire_cnt', 'population'),
    ('fire_crime_ratio', 'fire_cnt', 'crime_cnt'),
):
    test_df[ratio_col] = test_df[numerator_col] / test_df[denominator_col]

# Keep only the model features plus the county label.
categorical_cols = ['victim_sex', 'offense_category', 'location_id', 'population_description']
kept_cols = (
    ['age_num']
    + categorical_cols
    + ['officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
       'fire_pop_ratio', 'fire_crime_ratio', 'county']
)

# One-hot encode the categorical features.
test_df = pd.get_dummies(test_df[kept_cols], columns=categorical_cols)

### Confirm the script and the manual steps produce identical results

In [None]:
# True only if the manually engineered training frame matches the script's
# output exactly (same shape, values, dtypes and labels).
train_df.equals(trn_df)

In [None]:
# True only if the manually engineered testing frame matches the script's
# output exactly (same shape, values, dtypes and labels).
test_df.equals(tst_df)

### Split the training and testing data into features (X) and labels (y)

In [None]:
# Separate the label (county) from the features. pop() removes the column in
# place, so trn_df / tst_df hold only feature columns afterwards.
y_train = trn_df.pop('county').values
X_train = trn_df.values
y_test = tst_df.pop('county').values

# Train and test were one-hot encoded independently, so a category present in
# only one of them yields a different column set or order. Align the test
# columns to the training columns: dummies missing from test are filled with 0,
# and columns unseen in training are dropped. This keeps X_test's positions in
# line with the features the model was fitted on.
tst_df = tst_df.reindex(columns=trn_df.columns, fill_value=0)
X_test = tst_df.values

### Confirm shapes match

In [None]:
# Report the array shapes. The row counts of features and labels must agree
# within each split, and both feature matrices need the same number of columns.
for shape_label, array in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(shape_label, array.shape)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Train Model

In [None]:
# Random forest of 100 trees, each limited to depth 50, with out-of-bag scoring
# enabled so clf.oob_score_ is available after fitting.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# For classifiers it meant 'sqrt', so spelling it out keeps the same behaviour
# on every version.
clf = RandomForestClassifier(oob_score=True, n_estimators=100, max_depth=50, max_features='sqrt')
clf.fit(X_train, y_train)

### Get Predictions

In [None]:
# Predicted county label for each test row.
preds = clf.predict(X_test)
# Per-class probabilities for each test row (not used by the later visible cells).
proba_preds = clf.predict_proba(X_test)

### Calculate Scores

In [None]:
# Imported here so this cell does not depend on an edit to the earlier import cell.
from sklearn.metrics import f1_score

# Precision and recall, micro-averaged (pooled over all predictions) and
# weighted (per-class scores averaged by class support).
precision_micro = precision_score(y_test, preds, average='micro')
precision_weighted = precision_score(y_test, preds, average='weighted')

recall_micro = recall_score(y_test, preds, average='micro')
recall_weighted = recall_score(y_test, preds, average='weighted')

# F1 is taken from scikit-learn rather than from the harmonic mean of the
# averaged precision and recall. The micro value is unchanged. The weighted
# value differs, because weighted F1 is the support-weighted mean of the
# per-class F1 scores, not the harmonic mean of weighted precision and recall.
# This also avoids dividing by zero when a precision or recall is 0.
F_1_micro = f1_score(y_test, preds, average='micro')
F_1_weighted = f1_score(y_test, preds, average='weighted')

# Out-of-bag estimate from training, and plain accuracy on the test set.
oob = clf.oob_score_
acc = accuracy_score(y_test, preds)


print('Precision Scores:', precision_micro, precision_weighted)
print('Recall Scores:', recall_micro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_weighted)
print('oob Score:', oob)
print('Accuracy Score:', acc)

### Feature Importances

In [None]:
import operator

# Pair each remaining trn_df column with the fitted forest's importance for
# that position, then list the (feature, importance) pairs from most to least
# important.
feats = dict(zip(trn_df.columns, clf.feature_importances_))
feats = sorted(feats.items(), key=operator.itemgetter(1), reverse=True)
feats