In [62]:
import warnings

# Silence every warning category for the rest of the notebook session.
warnings.simplefilter('ignore')

In [63]:
# Import dependencies
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [64]:
# Load the county-level dataset from the CSV file next to this notebook
# and preview the first five rows.
final_county_df = pd.read_csv(Path('final_county.csv'))
final_county_df.head(n=5)

Unnamed: 0,County,Year,Population,GDP_pct_Change,House_Price,Income,Priced_Out
0,Adams,2010,441603,6.78,"$186,966.95",52785.0,N
1,Adams,2011,451443,15.78,"$176,663.41",52429.0,N
2,Adams,2012,459861,-0.17,"$181,639.65",55695.0,N
3,Adams,2013,469377,10.03,"$204,357.18",54876.0,N
4,Adams,2014,479488,17.6,"$228,642.09",59316.0,N


In [65]:
# Drop the dollar sign from the House_Price column.
# '$' is a regex metacharacter (end-of-string anchor) and the default of the
# `regex` argument differs between pandas versions, so request a literal
# replacement explicitly to get the same result everywhere.
final_county_df['House_Price'] = final_county_df['House_Price'].str.replace('$', '', regex=False)

In [66]:
# Check the data type of every column; House_Price is still text (object)
# at this point and needs converting before it can be used as a number.
final_county_df.dtypes

County             object
Year                int64
Population          int64
GDP_pct_Change    float64
House_Price        object
Income            float64
Priced_Out         object
dtype: object

In [67]:
# Strip leading and trailing whitespace from House_Price
final_county_df['House_Price'] = final_county_df['House_Price'].str.strip()

In [68]:
# Preview the first five rows after the clean-up steps so far
final_county_df.head(n=5)

Unnamed: 0,County,Year,Population,GDP_pct_Change,House_Price,Income,Priced_Out
0,Adams,2010,441603,6.78,186966.95,52785.0,N
1,Adams,2011,451443,15.78,176663.41,52429.0,N
2,Adams,2012,459861,-0.17,181639.65,55695.0,N
3,Adams,2013,469377,10.03,204357.18,54876.0,N
4,Adams,2014,479488,17.6,228642.09,59316.0,N


In [69]:
# Remove the thousands separators (commas) from House_Price
house_price_text = final_county_df['House_Price']
final_county_df['House_Price'] = house_price_text.str.replace(',', '', regex=False)

In [70]:
# Preview the first five rows once the commas have been removed
final_county_df.head(n=5)

Unnamed: 0,County,Year,Population,GDP_pct_Change,House_Price,Income,Priced_Out
0,Adams,2010,441603,6.78,186966.95,52785.0,N
1,Adams,2011,451443,15.78,176663.41,52429.0,N
2,Adams,2012,459861,-0.17,181639.65,55695.0,N
3,Adams,2013,469377,10.03,204357.18,54876.0,N
4,Adams,2014,479488,17.6,228642.09,59316.0,N


In [71]:
# Convert House_Price from text to a numeric dtype.
# errors='raise' (the default) stops the notebook if any value cannot be parsed.
h = final_county_df['House_Price']
final_county_df['House_Price'] = pd.to_numeric(h, errors='raise')

In [72]:
# Check data types again to confirm House_Price was converted to a numeric column
final_county_df.dtypes

County             object
Year                int64
Population          int64
GDP_pct_Change    float64
House_Price       float64
Income            float64
Priced_Out         object
dtype: object

In [73]:
# Round House_Price to zero decimal places
final_county_df['House_Price'] = final_county_df['House_Price'].round(decimals=0)

In [85]:
# Drop every row that contains at least one null value (modifies the frame in place)
final_county_df.dropna(axis=0, how='any', inplace=True)

In [86]:
# Total number of null cells left in the whole frame (expected to be 0 after dropna)
final_county_df.isna().sum().sum()

0

In [87]:
# Name of the column the model will learn to predict
target = "Priced_Out"

In [88]:
# Create our target: an independent copy of the label column
y = final_county_df[target].copy()

# Create our features: every other column, with the text columns
# (e.g. County) expanded into one-hot indicator columns
X = pd.get_dummies(final_county_df.drop(columns=[target]))

In [89]:
# Summary statistics for every column of the encoded feature matrix
X.describe()

Unnamed: 0,Year,Population,GDP_pct_Change,House_Price,Income,County_Adams,County_Alamosa,County_Arapahoe,County_Archuleta,County_Baca,...,County_Prowers,County_Pueblo,County_Routt,County_Saguache,County_Sedgwick,County_Summit,County_Teller,County_Washington,County_Weld,County_Yuma
count,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,...,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0
mean,2013.130312,88396.844193,14.305807,208488.413598,51834.107649,0.01983,0.01983,0.01983,0.01983,0.01983,...,0.01983,0.01983,0.01983,0.008499,0.01983,0.01983,0.01983,0.01983,0.01983,0.01983
std,2.008509,164139.17961,73.083535,138400.098037,15301.482746,0.139614,0.139614,0.139614,0.139614,0.139614,...,0.139614,0.139614,0.139614,0.091925,0.139614,0.139614,0.139614,0.139614,0.139614,0.139614
min,2010.0,703.0,-38.55,33937.0,26075.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,5597.0,-5.72,99594.0,41348.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2013.0,16715.0,3.5,185859.0,47631.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,47315.0,12.73,279130.0,61286.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2016.0,694777.0,767.87,778773.0,112399.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [90]:
# Check the balance of our target values (number of rows per Priced_Out class)
y.value_counts()

N    231
Y    122
Name: Priced_Out, dtype: int64

In [91]:
#Split the data for training and testing
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows (scikit-learn's default share) for testing;
# the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [92]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, so nothing
# about the test rows influences the transformation.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaler to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# Train a balanced random forest of 100 trees with a fixed seed.
brf_model = BalancedRandomForestClassifier(
    n_estimators=100,
    random_state=1,
).fit(X_train_scaled, y_train)

# Predict the held-out test rows.
y_pred = brf_model.predict(X_test_scaled)

In [94]:
# Calculate the balanced accuracy score on the training set
y_pred_train = brf_model.predict(X_train_scaled)
balanced_accuracy_score(y_true=y_train, y_pred=y_pred_train)


1.0

In [95]:
# Calculated the balanced accuracy score on test set
from sklearn.metrics import balanced_accuracy_score

# Compare the held-out labels with the model's predictions for them
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9833333333333334

In [96]:
# Display the confusion matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
# Calculating the confusion matrix.
# The target classes are the strings 'N' and 'Y', not 0 and 1, so take the
# class order from the fitted model and pass it explicitly; this guarantees
# that the matrix rows/columns and the labels below refer to the same classes.
class_labels = list(brf_model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Create a DataFrame from the confusion matrix, naming each row (actual class)
# and column (predicted class) after the real class label.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,58,2
Actual 1,0,29


In [97]:
# Print the per-class imbalanced classification report for the test set.
# print() is used on purpose: the report is a multi-line string.
print(classification_report_imbalanced(y_true=y_test, y_pred=y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          N       1.00      0.97      1.00      0.98      0.98      0.96        60
          Y       0.94      1.00      0.97      0.97      0.98      0.97        29

avg / total       0.98      0.98      0.99      0.98      0.98      0.97        89



In [98]:
# Pair each feature's importance with its column name and list the pairs
# from most to least important.
importances = sorted(
    ((score, column) for score, column in zip(brf_model.feature_importances_, X.columns)),
    reverse=True,
)
importances

[(0.15163233197104098, 'Population'),
 (0.14171603294268367, 'House_Price'),
 (0.13667112115390134, 'Income'),
 (0.08098653613465541, 'GDP_pct_Change'),
 (0.04593730958746017, 'Year'),
 (0.028471462595121535, 'County_Elbert'),
 (0.02195182386094187, 'County_Fremont'),
 (0.017813082601966037, 'County_Sedgwick'),
 (0.01703641771327847, 'County_Dolores'),
 (0.01648755656361279, 'County_Conejos'),
 (0.016464506104806147, 'County_Arapahoe'),
 (0.015463019679430591, 'County_Lake'),
 (0.0152948477825611, 'County_Eagle'),
 (0.014338951458774762, 'County_Archuleta'),
 (0.01266755773846543, 'County_Chaffee'),
 (0.012089123335639156, 'County_Montezuma'),
 (0.011823197313635105, 'County_Prowers'),
 (0.011551770407760944, 'County_Montrose'),
 (0.0111181065770286, 'County_Huerfano'),
 (0.010943012178521, 'County_Morgan'),
 (0.009160465664194227, 'County_Delta'),
 (0.009133327222793296, 'County_Yuma'),
 (0.008986600985826447, 'County_Boulder'),
 (0.008977867018174704, 'County_Logan'),
 (0.00886674290