In [1]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read data
df_us_data = pd.read_csv('Resources/US_data.csv')
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),Home Value $50k-$90k (%),Home Value $100k-$149k (%),Home Value $150k-$199k (%),Home Value $200k-$299k (%),Home Value $300k-$499k (%),Home Value $500k+ (%),...,Median Real Estate Taxes ($),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Net Migration Change
0,2010,3,Alabama,9.1,23.6,20.3,17.8,16.8,8.7,3.7,...,568.0,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,
1,2010,4,Alaska,2.0,2.9,9.6,15.6,35.2,26.5,8.1,...,3177.0,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,
2,2010,4,Arizona,4.0,13.4,20.5,19.5,21.4,14.4,6.8,...,1489.0,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,
3,2010,3,Arkansas,12.0,27.0,23.9,16.2,12.2,6.4,2.1,...,703.0,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,
4,2010,4,California,1.5,3.4,6.0,8.6,17.4,29.1,34.0,...,3284.0,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,74152.0,185507.0,1591788.0,710945.0,197240.0,65247.0,66085.0,...,,2576094.0,303356.0,245066.0,58290.0,11514.0,21244317,280704,267130,12.657811
464,2018,1,Massachusetts,10873.0,8759.0,294869.0,400054.0,237561.0,82421.0,70962.0,...,,900729.0,199591.0,174561.0,25030.0,5179.0,6882635,22846,11129,1.619656
465,2018,3,District of Columbia,700.0,552.0,8147.0,24676.0,22666.0,16753.0,17685.0,...,,76755.0,14236.0,11193.0,3043.0,188.0,701547,6641,2431,3.481678
466,2018,4,Utah,9409.0,5888.0,218789.0,184533.0,50940.0,13839.0,8499.0,...,,416459.0,72368.0,54194.0,18174.0,3070.0,3153550,52508,21841,6.983989


In [3]:
# Drop Real Estate Taxes and Net migration change because of NaN values
df_us_data.drop(columns=['Median Real Estate Taxes ($)', 'Net Migration Change'], inplace=True)

In [4]:
# Get list of columns for calculations
df_us_data.columns.to_list()

['Year',
 'Region',
 'NAME',
 'Home Value <$50k (%)',
 'Home Value $50k-$90k (%)',
 'Home Value $100k-$149k (%)',
 'Home Value $150k-$199k (%)',
 'Home Value $200k-$299k (%)',
 'Home Value $300k-$499k (%)',
 'Home Value $500k+ (%)',
 'Median Home Value ($)(1000X)',
 'Median Housing Cost (monthly) ($)',
 'Income <$10k (%)',
 'Income $10k-$24,999 (%)',
 'Income $25k-$34,999 (%)',
 'Income $35k-$49,999 (%)',
 'Income $50k-$74,999 (%)',
 'Income $75k-$99,999 (%)',
 'Income $100k-$149,999 (%)',
 'Income $150k+ (%)',
 'Median Income ($)(1000X)',
 'No 2nd or Equity Loan (%)',
 '2nd Mortgage or Equity Loan (%)',
 'Equity Loan Only (%)',
 '2nd Mortgage Only (%)',
 'Both 2nd & Equity Loan (%)',
 'Population',
 'Net Population Change',
 'Net Migration']

In [5]:
# Create array for the number of medium incomes needed for monthly housing cost
cost_to_income = []
for i in range(len(df_us_data)):
    cost_to_income.append(df_us_data['Median Housing Cost (monthly) ($)'][i] / (df_us_data['Median Income ($)(1000X)'][i] * 1000 / 12))
    i += 1
cost_to_income

[0.21883674391582209,
 0.22442926952832276,
 0.2600580110912397,
 0.19941743976562892,
 0.3041924833793135,
 0.24378090382920004,
 0.2631657087106832,
 0.24226671470482264,
 0.24546280299926976,
 0.290839989693378,
 0.247389653535833,
 0.3025085249333198,
 0.23368825160369464,
 0.2574740062747945,
 0.2066350710900474,
 0.20124749911733553,
 0.2103178532527973,
 0.20918093565539783,
 0.20479859123926922,
 0.2350474106491612,
 0.2464924346629986,
 0.2580072865515603,
 0.23873957367933277,
 0.2341060720126684,
 0.2200499314321882,
 0.21519928388281168,
 0.2348249746747922,
 0.20556383786672666,
 0.27351283656856606,
 0.2636283878310768,
 0.28444551128180506,
 0.230426378260939,
 0.26825490821300046,
 0.2313493838394744,
 0.18576054432922082,
 0.22452473195783404,
 0.20831473570107759,
 0.26873810673407367,
 0.2288348355764086,
 0.2761956072319046,
 0.22502270301272964,
 0.20239438477206453,
 0.22623861255906855,
 0.22451457930206178,
 0.24206762577775276,
 0.25240909487903573,
 0.23831195

In [6]:
# Create "Cost_to_income_ratio" column
df_us_data['Cost_to_income_ratio'] = cost_to_income
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),Home Value $50k-$90k (%),Home Value $100k-$149k (%),Home Value $150k-$199k (%),Home Value $200k-$299k (%),Home Value $300k-$499k (%),Home Value $500k+ (%),...,Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Cost_to_income_ratio
0,2010,3,Alabama,9.1,23.6,20.3,17.8,16.8,8.7,3.7,...,61.964,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0.218837
1,2010,4,Alaska,2.0,2.9,9.6,15.6,35.2,26.5,8.1,...,94.747,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0.224429
2,2010,4,Arizona,4.0,13.4,20.5,19.5,21.4,14.4,6.8,...,66.539,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0.260058
3,2010,3,Arkansas,12.0,27.0,23.9,16.2,12.2,6.4,2.1,...,59.393,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0.199417
4,2010,4,California,1.5,3.4,6.0,8.6,17.4,29.1,34.0,...,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,0.304192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,74152.0,185507.0,1591788.0,710945.0,197240.0,65247.0,66085.0,...,80.372,2576094.0,303356.0,245066.0,58290.0,11514.0,21244317,280704,267130,0.297118
464,2018,1,Massachusetts,10873.0,8759.0,294869.0,400054.0,237561.0,82421.0,70962.0,...,121.565,900729.0,199591.0,174561.0,25030.0,5179.0,6882635,22846,11129,0.009082
465,2018,3,District of Columbia,700.0,552.0,8147.0,24676.0,22666.0,16753.0,17685.0,...,151.147,76755.0,14236.0,11193.0,3043.0,188.0,701547,6641,2431,0.000000
466,2018,4,Utah,9409.0,5888.0,218789.0,184533.0,50940.0,13839.0,8499.0,...,93.060,416459.0,72368.0,54194.0,18174.0,3070.0,3153550,52508,21841,0.028627


In [7]:
# Create array for binary category
leave = []
for i in range(len(df_us_data)):
    if df_us_data['Cost_to_income_ratio'][i] > 0.28:
        leave.append(1)
    else:
        leave.append(0)
    i += 1
leave

[0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [8]:
# Create binary column in df_us_data
df_us_data['Leave'] = leave
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),Home Value $50k-$90k (%),Home Value $100k-$149k (%),Home Value $150k-$199k (%),Home Value $200k-$299k (%),Home Value $300k-$499k (%),Home Value $500k+ (%),...,No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Cost_to_income_ratio,Leave
0,2010,3,Alabama,9.1,23.6,20.3,17.8,16.8,8.7,3.7,...,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0.218837,0
1,2010,4,Alaska,2.0,2.9,9.6,15.6,35.2,26.5,8.1,...,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0.224429,0
2,2010,4,Arizona,4.0,13.4,20.5,19.5,21.4,14.4,6.8,...,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0.260058,0
3,2010,3,Arkansas,12.0,27.0,23.9,16.2,12.2,6.4,2.1,...,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0.199417,0
4,2010,4,California,1.5,3.4,6.0,8.6,17.4,29.1,34.0,...,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,0.304192,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,74152.0,185507.0,1591788.0,710945.0,197240.0,65247.0,66085.0,...,2576094.0,303356.0,245066.0,58290.0,11514.0,21244317,280704,267130,0.297118,1
464,2018,1,Massachusetts,10873.0,8759.0,294869.0,400054.0,237561.0,82421.0,70962.0,...,900729.0,199591.0,174561.0,25030.0,5179.0,6882635,22846,11129,0.009082,0
465,2018,3,District of Columbia,700.0,552.0,8147.0,24676.0,22666.0,16753.0,17685.0,...,76755.0,14236.0,11193.0,3043.0,188.0,701547,6641,2431,0.000000,0
466,2018,4,Utah,9409.0,5888.0,218789.0,184533.0,50940.0,13839.0,8499.0,...,416459.0,72368.0,54194.0,18174.0,3070.0,3153550,52508,21841,0.028627,0


In [9]:
# Create States DataFrame with names of states
state_names_df = df_us_data[['Year', 'Region', 'NAME']]
state_names_df.head()

Unnamed: 0,Year,Region,NAME
0,2010,3,Alabama
1,2010,4,Alaska
2,2010,4,Arizona
3,2010,3,Arkansas
4,2010,4,California


In [10]:
# Remove NAME column to prepare for ML
clean_us_data_df = df_us_data.drop('NAME', axis=1)
clean_us_data_df

Unnamed: 0,Year,Region,Home Value <$50k (%),Home Value $50k-$90k (%),Home Value $100k-$149k (%),Home Value $150k-$199k (%),Home Value $200k-$299k (%),Home Value $300k-$499k (%),Home Value $500k+ (%),Median Home Value ($)(1000X),...,No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Cost_to_income_ratio,Leave
0,2010,3,9.1,23.6,20.3,17.8,16.8,8.7,3.7,142.7,...,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0.218837,0
1,2010,4,2.0,2.9,9.6,15.6,35.2,26.5,8.1,255.7,...,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0.224429,0
2,2010,4,4.0,13.4,20.5,19.5,21.4,14.4,6.8,177.0,...,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0.260058,0
3,2010,3,12.0,27.0,23.9,16.2,12.2,6.4,2.1,122.6,...,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0.199417,0
4,2010,4,1.5,3.4,6.0,8.6,17.4,29.1,34.0,377.7,...,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,0.304192,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,74152.0,185507.0,1591788.0,710945.0,197240.0,65247.0,66085.0,247.0,...,2576094.0,303356.0,245066.0,58290.0,11514.0,21244317,280704,267130,0.297118,1
464,2018,1,10873.0,8759.0,294869.0,400054.0,237561.0,82421.0,70962.0,406.9,...,900729.0,199591.0,174561.0,25030.0,5179.0,6882635,22846,11129,0.009082,0
465,2018,3,700.0,552.0,8147.0,24676.0,22666.0,16753.0,17685.0,627.0,...,76755.0,14236.0,11193.0,3043.0,188.0,701547,6641,2431,0.000000,0
466,2018,4,9409.0,5888.0,218789.0,184533.0,50940.0,13839.0,8499.0,310.0,...,416459.0,72368.0,54194.0,18174.0,3070.0,3153550,52508,21841,0.028627,0


## Preprocess data

In [11]:
# Define the features set
X = clean_us_data_df.copy()
X = X.drop('Leave', axis=1)
X.head()

Unnamed: 0,Year,Region,Home Value <$50k (%),Home Value $50k-$90k (%),Home Value $100k-$149k (%),Home Value $150k-$199k (%),Home Value $200k-$299k (%),Home Value $300k-$499k (%),Home Value $500k+ (%),Median Home Value ($)(1000X),...,Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Cost_to_income_ratio
0,2010,3,9.1,23.6,20.3,17.8,16.8,8.7,3.7,142.7,...,61.964,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0.218837
1,2010,4,2.0,2.9,9.6,15.6,35.2,26.5,8.1,255.7,...,94.747,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0.224429
2,2010,4,4.0,13.4,20.5,19.5,21.4,14.4,6.8,177.0,...,66.539,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0.260058
3,2010,3,12.0,27.0,23.9,16.2,12.2,6.4,2.1,122.6,...,59.393,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0.199417
4,2010,4,1.5,3.4,6.0,8.6,17.4,29.1,34.0,377.7,...,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,0.304192


In [12]:
# Define the target set
y = clean_us_data_df['Leave'].ravel()
y[:5]

array([0, 0, 0, 0, 1])

In [13]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Create instance of StandardScaler
scaler = StandardScaler()
# Fit StandardScaler with training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fit the Random Forest Model

In [15]:
# Create a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Make predictions using test data
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

## Evaluate the Model

In [16]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [17]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,111,1
Actual 1,0,5


Accuracy Score: 0.9914529914529915
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       112
           1       0.83      1.00      0.91         5

    accuracy                           0.99       117
   macro avg       0.92      1.00      0.95       117
weighted avg       0.99      0.99      0.99       117



In [18]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.00321254, 0.03036759, 0.03768465, 0.01330804, 0.02815471,
       0.0177358 , 0.009799  , 0.01190227, 0.01231344, 0.00944658,
       0.08222009, 0.02264092, 0.03492749, 0.02316802, 0.01889458,
       0.00870031, 0.03557545, 0.02439105, 0.02414883, 0.07189991,
       0.01840345, 0.01814369, 0.01485085, 0.02392271, 0.00991523,
       0.01917118, 0.01395869, 0.01792088, 0.34322203])

In [19]:
# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3432220313506866, 'Cost_to_income_ratio'),
 (0.08222008720597895, 'Median Housing Cost (monthly) ($)'),
 (0.0718999147238708, 'Median Income ($)(1000X)'),
 (0.03768465391205066, 'Home Value <$50k (%)'),
 (0.03557544982404375, 'Income $75k-$99,999 (%)'),
 (0.03492749204919802, 'Income $10k-$24,999 (%)'),
 (0.030367593983386985, 'Region'),
 (0.028154707832560584, 'Home Value $100k-$149k (%)'),
 (0.024391049528613977, 'Income $100k-$149,999 (%)'),
 (0.024148826832684617, 'Income $150k+ (%)'),
 (0.02392271316614061, '2nd Mortgage Only (%)'),
 (0.023168019203198376, 'Income $25k-$34,999 (%)'),
 (0.022640915217184306, 'Income <$10k (%)'),
 (0.019171182618445987, 'Population'),
 (0.01889458051630221, 'Income $35k-$49,999 (%)'),
 (0.01840345486353251, 'No 2nd or Equity Loan (%)'),
 (0.018143690397380455, '2nd Mortgage or Equity Loan (%)'),
 (0.01792088297035991, 'Net Migration'),
 (0.017735803635723402, 'Home Value $150k-$199k (%)'),
 (0.014850847431244112, 'Equity Loan Only (%)'),
 (0.013