In [1]:
# import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
# import sklearn dependencies
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from imblearn.metrics import classification_report_imbalanced

In [3]:
# load the data
merged_df = pd.read_csv("merged_df.csv", index_col=[0],sep=',')
merged_df.head()

Unnamed: 0,GeoFips,county,pcp_count,population,pop_density,per_capita_income,pcp_per_capita,pop_density_lvl,region_far_west,region_great_lakes,...,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,pcp_per_capita_bins
0,12086,Miami-Dade,13619,2715516,552.187071,57213,5.015253,3,0,0,...,0,0,0,0,0,0,0,0,0,1
1,6025,Imperial,236,180216,16.659991,44500,1.30954,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,40109,Oklahoma,2914,782051,425.989528,56971,3.7261,3,0,0,...,0,0,0,0,0,0,0,0,0,1
3,39115,Morgan,19,14702,13.631412,38583,1.292341,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,8059,Jefferson,1459,570427,288.167016,68829,2.557733,3,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Create features
X = merged_df.drop(columns=["pcp_per_capita", "pcp_per_capita_bins", "pop_density"])
X = pd.get_dummies(X) 

# Create target
y = merged_df["pcp_per_capita_bins"]

In [5]:
X.describe()

Unnamed: 0,GeoFips,pcp_count,population,per_capita_income,pop_density_lvl,region_far_west,region_great_lakes,region_mideast,region_new_england,region_plains,...,county_Yellowstone,county_Yoakum,county_Yolo,county_York,county_Young,county_Yuba,county_Yukon-Koyukuk,county_Yuma,county_Zapata,county_Zavala
count,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,...,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0
mean,29981.758666,339.705513,114187.0,48949.537801,1.496864,0.059756,0.144272,0.059426,0.02245,0.193793,...,0.00033,0.00033,0.00033,0.001321,0.00033,0.00033,0.00033,0.00066,0.00033,0.00033
std,15070.955336,1094.505146,423133.0,13078.053879,1.118362,0.237073,0.351423,0.236458,0.148165,0.395334,...,0.01817,0.01817,0.01817,0.036322,0.01817,0.01817,0.01817,0.025692,0.01817,0.01817
min,1001.0,1.0,459.0,22644.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18153.0,18.0,11962.0,40971.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29095.0,56.0,27216.0,46571.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,44001.0,198.0,71522.0,53791.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,56045.0,18526.0,10098050.0,220645.0,3.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Check the balance of our target values
y.value_counts()

0    1519
1    1510
Name: pcp_per_capita_bins, dtype: int64

In [7]:
# Train, test, split and stratify data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

In [8]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 200)
random_forest = random_forest.fit(X_train, y_train)

In [9]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8574491785018101

In [10]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[336  44]
 [ 64 314]]


In [11]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.88      0.83      0.86      0.86      0.74       380
          1       0.88      0.83      0.88      0.85      0.86      0.73       378

avg / total       0.86      0.86      0.86      0.86      0.86      0.73       758



In [12]:
# List the features sorted in descending order by feature importance
feature_names = X.columns
sorted(zip(random_forest.feature_importances_, feature_names), reverse=True)

[(0.22318652013517998, 'pcp_count'),
 (0.1158304219087377, 'population'),
 (0.08703033797585709, 'per_capita_income'),
 (0.07168656125944263, 'GeoFips'),
 (0.03361463041617644, 'pop_density_lvl'),
 (0.00709550305759074, 'region_southwest'),
 (0.0063764339384888855, 'state_TX'),
 (0.006273489604373163, 'region_southeast'),
 (0.004680425914652314, 'region_far_west'),
 (0.004398853013600287, 'region_mideast'),
 (0.004327209074030451, 'region_plains'),
 (0.004000921908103402, 'region_new_england'),
 (0.003628247218500141, 'state_CA'),
 (0.0035360319327123655, 'state_KY'),
 (0.003481272182852436, 'state_GA'),
 (0.0032579386927010343, 'region_great_lakes'),
 (0.003005560828379215, 'state_KS'),
 (0.0029415332399498713, 'state_VA'),
 (0.0029225994088767364, 'county_Union'),
 (0.002916000005154166, 'region_rocky_mountain'),
 (0.002766083556781753, 'state_WI'),
 (0.0025711691061288985, 'state_OK'),
 (0.002362943017545258, 'state_MO'),
 (0.002320757986237168, 'county_Wayne'),
 (0.0022743872291283

In [13]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy = EasyEnsembleClassifier(n_estimators = 200,random_state=1)
easy = easy.fit(X_train, y_train)

In [14]:
# Calculated the balanced accuracy score
y_pred = easy.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9182191590086326

In [15]:
# Display the confusion matrix
y_pred = easy.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[347  33]
 [ 29 349]]


In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.91      0.92      0.92      0.92      0.84       380
          1       0.91      0.92      0.91      0.92      0.92      0.84       378

avg / total       0.92      0.92      0.92      0.92      0.92      0.84       758



In [17]:
# find r-squared score
r2_score(y_test, y_pred)

0.6728209412419939