# CLEANING DATA

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# SMOTE OVERSAMPLING

In [2]:
final_county_df = pd.read_csv('final_county.csv')
final_county_df.head()

Unnamed: 0,County,Year,population,GDP_pct_Change,House_Price,Income,Priced_Out
0,Adams,2010,441603,6.78,186967,52785,N
1,Adams,2011,451443,15.78,176663,52429,N
2,Adams,2012,459861,-0.17,181640,55695,N
3,Adams,2013,469377,10.03,204357,54876,N
4,Adams,2014,479488,17.6,228642,59316,N


In [4]:
#Define target variable
target= 'Priced_Out'

In [5]:
# Create our features
X = final_county_df.drop(columns='Priced_Out')
X = pd.get_dummies(X)


# Create our target
y = final_county_df.loc[:, target].copy()

In [6]:
X.describe()

Unnamed: 0,Year,population,GDP_pct_Change,House_Price,Income,County_Adams,County_Alamosa,County_Arapahoe,County_Archuleta,County_Baca,...,County_Prowers,County_Pueblo,County_Routt,County_Saguache,County_Sedgwick,County_Summit,County_Teller,County_Washington,County_Weld,County_Yuma
count,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,...,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0,353.0
mean,2013.130312,88396.844193,14.305807,208488.413598,51834.107649,0.01983,0.01983,0.01983,0.01983,0.01983,...,0.01983,0.01983,0.01983,0.008499,0.01983,0.01983,0.01983,0.01983,0.01983,0.01983
std,2.008509,164139.17961,73.083535,138400.098037,15301.482746,0.139614,0.139614,0.139614,0.139614,0.139614,...,0.139614,0.139614,0.139614,0.091925,0.139614,0.139614,0.139614,0.139614,0.139614,0.139614
min,2010.0,703.0,-38.55,33937.0,26075.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2011.0,5597.0,-5.72,99594.0,41348.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2013.0,16715.0,3.5,185859.0,47631.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,47315.0,12.73,279130.0,61286.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2016.0,694777.0,767.87,778773.0,112399.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values
y.value_counts()

N    231
Y    122
Name: Priced_Out, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)

In [9]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({'Y': 171, 'N': 171})

In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5741379310344827

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
confusion_matrix(y_test, y_pred)

array([[42, 18],
       [16, 13]], dtype=int64)

In [13]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          N       0.72      0.70      0.45      0.71      0.56      0.32        60
          Y       0.42      0.45      0.70      0.43      0.56      0.31        29

avg / total       0.62      0.62      0.53      0.62      0.56      0.32        89

