In [1]:
# import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from imblearn.metrics import classification_report_imbalanced

In [3]:
# load the data
merged_df = pd.read_csv("merged_df.csv", index_col=[0],sep=',')
merged_df.head()

Unnamed: 0,GeoFips,county,pcp_count,population,pop_density,per_capita_income,pcp_per_capita,pop_density_lvl,region_Far West,region_Great Lakes,...,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,pcp_per_capita_bins
0,12086,Miami-Dade,13619,2715516,552.187071,57213,5.015253,3,0,0,...,0,0,0,0,0,0,0,0,0,1
1,6025,Imperial,236,180216,16.659991,44500,1.30954,1,1,0,...,0,0,0,0,0,0,0,0,0,1
2,40109,Oklahoma,2914,782051,425.989528,56971,3.7261,3,0,0,...,0,0,0,0,0,0,0,0,0,1
3,39115,Morgan,19,14702,13.631412,38583,1.292341,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4,8059,Jefferson,1459,570427,288.167016,68829,2.557733,3,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Create features
X = merged_df.drop(columns=["pcp_per_capita", "pcp_per_capita_bins", "pop_density", "per_capita_income", "state_AK", "state_AL", "state_AR", "state_AZ", "state_CA", "state_CO",
"state_CT", "state_DC", "state_DE", "state_FL", "state_GA", "state_HI",
"state_IA", "state_ID", "state_IL", "state_IN", "state_KS", "state_KY",
"state_LA", "state_MA", "state_MD", "state_ME", "state_MI", "state_MN",
"state_MO", "state_MS", "state_MT", "state_NC", "state_ND", "state_NE",
"state_NH", "state_NJ", "state_NM", "state_NV", "state_NY", "state_OH",
"state_OK", "state_OR", "state_PA", "state_RI", "state_SC", "state_SD",
"state_TN", "state_TX", "state_UT", "state_VA", "state_VT", "state_WA",
"state_WI", "state_WV", "state_WY"])
X = pd.get_dummies(X) 

# Create target
y = merged_df["pcp_per_capita_bins"]

In [5]:
X.describe()

Unnamed: 0,GeoFips,pcp_count,population,pop_density_lvl,region_Far West,region_Great Lakes,region_Mideast,region_New England,region_Plains,region_Rocky Mountain,...,county_Yellowstone,county_Yoakum,county_Yolo,county_York,county_Young,county_Yuba,county_Yukon-Koyukuk,county_Yuma,county_Zapata,county_Zavala
count,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,...,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0,3029.0
mean,29981.758666,339.705513,114187.0,1.496864,0.059756,0.144272,0.059426,0.02245,0.193793,0.067019,...,0.00033,0.00033,0.00033,0.001321,0.00033,0.00033,0.00033,0.00066,0.00033,0.00033
std,15070.955336,1094.505146,423133.0,1.118362,0.237073,0.351423,0.236458,0.148165,0.395334,0.250096,...,0.01817,0.01817,0.01817,0.036322,0.01817,0.01817,0.01817,0.025692,0.01817,0.01817
min,1001.0,1.0,459.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18153.0,18.0,11962.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29095.0,56.0,27216.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,44001.0,198.0,71522.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,56045.0,18526.0,10098050.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Check the balance of our target values
y.value_counts()

1    2881
0     148
Name: pcp_per_capita_bins, dtype: int64

In [7]:
# Train, test, split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

## Combination Sampling with SMOTEENN

In [8]:
# import SMOTEENN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

In [9]:
# import logistic regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [11]:
# print confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 34   0]
 [  2 722]]


In [12]:
# print balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.9986187845303867

In [13]:
# print imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      1.00      1.00      0.97      1.00      1.00        34
          1       1.00      1.00      1.00      1.00      1.00      1.00       724

avg / total       1.00      1.00      1.00      1.00      1.00      1.00       758



In [14]:
# find r-squared score
r2_score(y_test, y_pred)

0.9384140396490088