# Intelligent Model: Random Forest Classifier

In [1]:
import pandas as pd

### Read in query outputs to dataframes

In [2]:
crime_df = pd.read_csv('crime_query.csv')
label_df = pd.read_csv('label_query.csv')
h_df = pd.read_csv('hosp_query.csv')
f_df = pd.read_csv('fire_query.csv')
pop_df = pd.read_csv('census_population.csv')

In [3]:
crime_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get crime count by county and join to crime_df

In [4]:
tmp_df4 = crime_df.groupby(['county']).incident_id.agg('count')
tmp_df4 = tmp_df4.to_frame().reset_index()
tmp_df4 = tmp_df4.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df4

Unnamed: 0,county,crime_cnt
0,BELL,10240
1,BEXAR,35
2,BRAZORIA,9487
3,BREWSTER,61
4,BURNET,1303
5,CALHOUN,1634
6,CAMERON,3
7,CHEROKEE,396
8,COKE,2
9,COLLIN,55442


In [5]:
crime_df = crime_df.join(tmp_df4.set_index('county'), on='county')

In [6]:
crime_df.head().T

Unnamed: 0,0,1,2,3,4
incident_id,67693343,67693344,67693345,67693346,67693348
year,2013,2013,2013,2013,2013
age_num,46,57,51,24,49
victim_sex,F,M,F,M,M
crime_against,Property,Property,Property,Property,Property
offense,All Other Larceny,Theft From Motor Vehicle,Theft From Motor Vehicle,All Other Larceny,Burglary/Breaking & Entering
offense_category,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Larceny/Theft Offenses,Burglary/Breaking & Entering
location_id,20,18,18,5,20
location_name,Residence/Home,Parking/Drop Lot/Garage,Parking/Drop Lot/Garage,Commercial/Office Building,Residence/Home
county,POTTER,POTTER,POTTER,POTTER,POTTER


### Get hospital and bed counts by county and join to h_df

In [7]:
h_df.head()

Unnamed: 0,county,hosp_id,beds
0,HARRIS,25577030,1082.0
1,JONES,3379501,45.0
2,COLLIN,475013,73.0
3,POTTER,679106,451.0
4,BRAZORIA,1177515,64.0


In [8]:
h_df['beds'] = h_df['beds'].clip_lower(1)

In [9]:
tmp_df = h_df.groupby(['county']).hosp_id.agg('count')
tmp_df = tmp_df.to_frame().reset_index()
tmp_df = tmp_df.rename({'hosp_id': 'hosp_cnt'}, axis='columns')
tmp_df

Unnamed: 0,county,hosp_cnt
0,ANDERSON,1
1,ANDREWS,1
2,ANGELINA,2
3,ATASCOSA,1
4,AUSTIN,1
5,BAILEY,1
6,BASTROP,1
7,BAYLOR,1
8,BEE,1
9,BELL,7


In [10]:
tmp_df2 = h_df.groupby(['county']).beds.agg('sum')
tmp_df2 = tmp_df2.to_frame().reset_index()
tmp_df2 = tmp_df2.rename({'beds': 'bed_cnt'}, axis='columns')
tmp_df2

Unnamed: 0,county,bed_cnt
0,ANDERSON,86.0
1,ANDREWS,34.0
2,ANGELINA,420.0
3,ATASCOSA,67.0
4,AUSTIN,32.0
5,BAILEY,25.0
6,BASTROP,8.0
7,BAYLOR,49.0
8,BEE,69.0
9,BELL,896.0


In [11]:
h_df = tmp_df.join(tmp_df2.set_index('county'), on='county')
h_df

Unnamed: 0,county,hosp_cnt,bed_cnt
0,ANDERSON,1,86.0
1,ANDREWS,1,34.0
2,ANGELINA,2,420.0
3,ATASCOSA,1,67.0
4,AUSTIN,1,32.0
5,BAILEY,1,25.0
6,BASTROP,1,8.0
7,BAYLOR,1,49.0
8,BEE,1,69.0
9,BELL,7,896.0


### Get fire station count by county and join to f_df

In [12]:
f_df.head()

Unnamed: 0,county,fire_id
0,EL PASO,10410064
1,EL PASO,10410225
2,EL PASO,10139838
3,EL PASO,10139423
4,EL PASO,10410065


In [13]:
tmp_df3 = f_df.groupby(['county']).fire_id.agg('count')
tmp_df3 = tmp_df3.to_frame().reset_index()
tmp_df3 = tmp_df3.rename({'fire_id': 'fire_cnt'}, axis='columns')
tmp_df3

Unnamed: 0,county,fire_cnt
0,ANDERSON,21
1,ANDREWS,1
2,ANGELINA,18
3,ARANSAS,5
4,ARCHER,7
5,ARMSTRONG,2
6,ATASCOSA,2
7,AUSTIN,5
8,BAILEY,1
9,BANDERA,6


### Merge hospital dataframe and fire station dataframe

In [14]:
hf_df = tmp_df3.join(h_df.set_index('county'), on='county')
hf_df

Unnamed: 0,county,fire_cnt,hosp_cnt,bed_cnt
0,ANDERSON,21,1.0,86.0
1,ANDREWS,1,1.0,34.0
2,ANGELINA,18,2.0,420.0
3,ARANSAS,5,,
4,ARCHER,7,,
5,ARMSTRONG,2,,
6,ATASCOSA,2,1.0,67.0
7,AUSTIN,5,1.0,32.0
8,BAILEY,1,1.0,25.0
9,BANDERA,6,,


### Join to training dataframe

In [15]:
train_df = crime_df.join(hf_df.set_index('county'), on='county')

In [16]:
train_df[345]

KeyError: 345

In [None]:
train_df.info()

### Get crime count by county for label_df

In [None]:
tmp_df5 = label_df.groupby(['county']).incident_id.agg('count')
tmp_df5 = tmp_df5.to_frame().reset_index()
tmp_df5 = tmp_df5.rename({'incident_id': 'crime_cnt'}, axis='columns')
tmp_df5

### Join to testing dataframe

In [None]:
label_df = label_df.join(tmp_df5.set_index('county'), on='county')

In [None]:
label_df.head().T

### Join hospital and fire station counts to testing dataframe

In [None]:
test_df = label_df.join(hf_df.set_index('county'), on='county')

In [None]:
test_df.head(3).T

In [None]:
test_df.info()

### Bring in population feature

In [None]:
pop_df.head()

In [None]:
pop_df['county'] = pop_df['county'].str.upper()
pop_df.drop_duplicates(['county','year'], inplace=True)

In [None]:
pop_df.head()

In [None]:
pop_df['population_description'] = pd.cut(pop_df['population'], [0, 10000, 25000, 100000, 500000], 
                                   labels=['Under 10,000', '10,000 - 24,999', '25,000 - 99,999','Over 100,000'])


In [None]:
pop_df.head(10)

In [None]:
pop_df['population_description'].value_counts()

In [None]:
train_df =pd.merge(train_df, pop_df, how = 'outer', on=['county','year'])
train_df

In [None]:
train_df.dropna(axis=0, subset=['incident_id'], inplace=True)

In [None]:
train_df

In [None]:
train_df.info()

In [None]:
test_df =pd.merge(test_df, pop_df, how = 'outer', on=['county','year'])
test_df

In [None]:
test_df.dropna(axis=0, subset=['incident_id'], inplace=True)

In [None]:
test_df.info()

## Pass training and testing dataframes through feature engineering script

In [None]:
from Scripts.Features_Script import feature_engineering

trn_df, tst_df = feature_engineering(train_df, test_df)

In [None]:
train_df.offense_category.value_counts()

In [None]:
test_df.offense_category.value_counts()

### Do the feature engineering manually in jupyter

In [None]:
# Drop NA in victim age column (<2%)
train_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
train_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
train_df.dropna(axis=0, subset=['population'], inplace=True)

# Drop offense_category_Gambling Offenses
train_df = train_df[train_df.offense_category != 'Gambling Offenses']

# Create Ratio Columns
train_df['crime_pop_ratio'] = train_df['crime_cnt'] / train_df['population']
train_df['beds_pop_ratio'] = train_df['bed_cnt'] / train_df['population']
train_df['beds_crime_ratio'] = train_df['bed_cnt'] / train_df['crime_cnt']
train_df['fire_pop_ratio'] = train_df['fire_cnt'] / train_df['population']
train_df['fire_crime_ratio'] = train_df['fire_cnt'] / train_df['crime_cnt']

# Reduce df to only desired features to train/test model
train_df = train_df[['age_num', 'victim_sex', 'offense_category', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

# Dummize features
train_df = pd.get_dummies(train_df, columns=['victim_sex', 'offense_category', 'population_description'])

In [None]:
# Drop NA in victim age column (<2%)
test_df.dropna(axis=0, subset=['age_num'], inplace=True)

# Drop NA in hospital_cnt column (<2%)
test_df.dropna(axis=0, subset=['hosp_cnt'], inplace=True)

# Drop NA in population column (<2%)
test_df.dropna(axis=0, subset=['population'], inplace=True)

# Create Ratio Columns
test_df['crime_pop_ratio'] = test_df['crime_cnt'] / test_df['population']
test_df['beds_pop_ratio'] = test_df['bed_cnt'] / test_df['population']
test_df['beds_crime_ratio'] = test_df['bed_cnt'] / test_df['crime_cnt']
test_df['fire_pop_ratio'] = test_df['fire_cnt'] / test_df['population']
test_df['fire_crime_ratio'] = test_df['fire_cnt'] / test_df['crime_cnt']

# Reduce df to only desired features to train/test model
test_df = test_df[['age_num', 'victim_sex', 'offense_category', 'population_description',
                     'officers', 'civilians', 'crime_pop_ratio', 'beds_pop_ratio', 'beds_crime_ratio',
                     'fire_pop_ratio', 'fire_crime_ratio', 'county']]

# Dummize features
test_df = pd.get_dummies(test_df, columns=['victim_sex', 'offense_category', 'population_description'])

### Confirm script and manual produce identical results

In [None]:
train_df.equals(trn_df)

In [None]:
test_df.equals(tst_df)

### Split data into training and testing sets

In [None]:
y_train = trn_df.pop('county').values
X_train = trn_df.values
y_test = tst_df.pop('county').values
X_test = tst_df.values

### Confirm shapes match

In [None]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

### Train Model

In [None]:
clf = RandomForestClassifier(oob_score=True, n_estimators=100, max_depth=30, max_features='auto')
clf.fit(X_train, y_train)

### Get Predictions

In [None]:
preds = clf.predict(X_test)
proba_preds = clf.predict_proba(X_test)

### Calculate Scores

In [None]:
# calculate scores
precision_micro = precision_score(y_test, preds, average = 'micro')
precision_weighted = precision_score(y_test, preds, average = 'weighted')

recall_micro = recall_score(y_test, preds, average = 'micro')
recall_weighted = recall_score(y_test, preds, average = 'weighted')

F_1_micro = 2 / ((1/precision_micro) + (1/recall_micro))
F_1_weighted = 2 / ((1/precision_weighted) + (1/recall_weighted))

score = clf.score(X_test, y_test)
oob=clf.oob_score_
acc = accuracy_score(y_test, preds)


print('Precision Scores:', precision_micro, precision_weighted)
print('Recall Scores:', recall_micro, recall_weighted)
print('F 1 Scores:', F_1_micro, F_1_weighted)
print('Score:', score)
print('oob Score:', oob)
print('Accuracy Score:', acc)

### Feature Importances

In [None]:
import operator
feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(trn_df.columns, clf.feature_importances_):
    feats[feature] = importance
feats = sorted(feats.items(), key = operator.itemgetter(1), reverse=True)
feats

In [None]:
proba_preds[3]

In [None]:
preds[3]

In [None]:
y_test[3]

In [None]:
print(preds[0])
print(proba_preds[0])
print(clf.classes_)

In [None]:
results_df = pd.DataFrame(proba_preds, columns = clf.classes_)

In [None]:
results_df.head()

In [None]:
results_df['Predictions'] = preds

In [None]:
results_df.head().T

In [None]:
results_df['Actual'] = y_test

In [None]:
results_df.head(10).T

In [None]:
results_df.to_csv('Intelligent_Model_Results.csv')