In [19]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from imblearn.metrics import classification_report_imbalanced

In [3]:
# load the data
merged_df = pd.read_csv("merged_df.csv", index_col=[0],sep=',')
merged_df.head()

Unnamed: 0,GeoFips,county,pcp_count,population,pop_density,per_capita_income,pcp_per_capita,pop_density_lvl,region_far_west,region_great_lakes,...,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,pcp_per_capita_bins
0,12086,Miami-Dade,13619,2715516,552.187071,57213,5.015253,3,0,0,...,0,0,0,0,0,0,0,0,0,1
1,6025,Imperial,236,180216,16.659991,44500,1.30954,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,40109,Oklahoma,2914,782051,425.989528,56971,3.7261,3,0,0,...,0,0,0,0,0,0,0,0,0,1
3,39115,Morgan,19,14702,13.631412,38583,1.292341,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,8059,Jefferson,1459,570427,288.167016,68829,2.557733,3,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Create features
X = merged_df.drop(columns=["pcp_per_capita", "pcp_per_capita_bins", "pop_density"])
X = pd.get_dummies(X) 

# Create target
# y = pcp_merged_df["pcp_per_capita"] #pcp_per_capita won't resample beacause it's continuous
y = merged_df["pcp_per_capita_bins"]

In [5]:
X.describe()

Unnamed: 0,GeoFips,pcp_count,population,per_capita_income,pop_density_lvl,region_far_west,region_great_lakes,region_mideast,region_new_england,region_plains,...,county_Yellowstone,county_Yoakum,county_Yolo,county_York,county_Young,county_Yuba,county_Yukon-Koyukuk,county_Yuma,county_Zapata,county_Zavala
count,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,...,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0
mean,29981.758666,339.705513,114187.0,48949.537801,1.496864,0.059756,0.144272,0.059426,0.02245,0.193793,...,0.00033,0.00033,0.00033,0.001321,0.00033,0.00033,0.00033,0.00066,0.00033,0.00033
std,15070.955336,1094.505146,423133.0,13078.053879,1.118362,0.237073,0.351423,0.236458,0.148165,0.395334,...,0.01817,0.01817,0.01817,0.036322,0.01817,0.01817,0.01817,0.025692,0.01817,0.01817
min,1001.0,1.0,459.0,22644.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18153.0,18.0,11962.0,40971.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29095.0,56.0,27216.0,46571.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,44001.0,198.0,71522.0,53791.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,56045.0,18526.0,10098050.0,220645.0,3.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Check the balance of our target values
y.value_counts()

0    1519
1    1510
Name: pcp_per_capita_bins, dtype: int64

In [7]:
# Train, test, split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

In [8]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

In [9]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = random_forest.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8218990197170547

In [10]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[309  67]
 [ 68 314]]


In [11]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.82      0.82      0.82      0.82      0.82      0.68       376
          1       0.82      0.82      0.82      0.82      0.82      0.68       382

avg / total       0.82      0.82      0.82      0.82      0.82      0.68       758



In [12]:
# List the features sorted in descending order by feature importance
feature_names = X.columns
sorted(zip(random_forest.feature_importances_, feature_names), reverse=True)

[(0.2142397079376271, 'pcp_count'),
 (0.11524292935059326, 'population'),
 (0.0892052555486632, 'per_capita_income'),
 (0.07242089673420746, 'GeoFips'),
 (0.03343027428511572, 'pop_density_lvl'),
 (0.008379741800492767, 'state_TX'),
 (0.006956660263100106, 'region_southwest'),
 (0.006873945625173298, 'region_southeast'),
 (0.004712791413332673, 'region_mideast'),
 (0.00468985946952088, 'state_CA'),
 (0.004128212638567192, 'region_plains'),
 (0.00409368198786846, 'region_new_england'),
 (0.0035283049971067976, 'region_great_lakes'),
 (0.003486603985814634, 'state_KY'),
 (0.003414996248997401, 'region_far_west'),
 (0.0033016393254762097, 'state_GA'),
 (0.0031479145441046306, 'state_MO'),
 (0.0031238072224096017, 'state_KS'),
 (0.002993150300317491, 'state_MN'),
 (0.0029694051469968947, 'state_AL'),
 (0.0029240485670440157, 'county_Union'),
 (0.0028877386756551576, 'state_OK'),
 (0.002826125904767143, 'county_Franklin'),
 (0.002790485291770404, 'state_AK'),
 (0.002784361153184162, 'state_

In [13]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
easy = EasyEnsembleClassifier(n_estimators = 100,random_state=1)
easy = easy.fit(X_train, y_train)

In [14]:
# Calculated the balanced accuracy score
y_pred = easy.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8957196167984851

In [15]:
# Display the confusion matrix
y_pred = easy.predict(X_test)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[334  42]
 [ 37 345]]


In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.89      0.90      0.89      0.90      0.80       376
          1       0.89      0.90      0.89      0.90      0.90      0.80       382

avg / total       0.90      0.90      0.90      0.90      0.90      0.80       758



In [17]:
# find r-squared score
r2_score(y_test, y_pred)

0.5830873342987635