In [1]:
# Initial imports.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read data
df_us_data = pd.read_csv('Resources/US_data_combined.csv')
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$149,999k (%)","Home Value $150k-$199,999k (%)","Home Value $200k-$299,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",...,Median Real Estate Taxes ($),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Net Migration Change
0,2010,3,Alabama,9.1,23.6,20.3,17.8,16.8,61.7,8.7,...,568.0,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,
1,2010,4,Alaska,2.0,2.9,9.6,15.6,35.2,28.1,26.5,...,3177.0,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,
2,2010,4,Arizona,4.0,13.4,20.5,19.5,21.4,53.4,14.4,...,1489.0,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,
3,2010,3,Arkansas,12.0,27.0,23.9,16.2,12.2,67.1,6.4,...,703.0,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,
4,2010,4,California,1.5,3.4,6.0,8.6,17.4,18.0,29.1,...,3284.0,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,2.6,6.4,,,,55.1,24.6,...,2151.0,89.1,10.5,8.5,2.0,0.4,21244317,280704,267130,12.657811
464,2018,1,Massachusetts,1.0,0.8,,,,26.7,36.2,...,4801.0,81.5,18.1,15.8,2.3,0.5,6882635,22846,11129,1.619656
465,2018,3,District of Columbia,0.8,0.6,,,,8.9,27.1,...,3691.0,84.2,15.6,12.3,3.3,0.2,701547,6641,2431,3.481678
466,2018,4,Utah,1.9,1.2,,,,44.5,37.5,...,1788.0,84.7,14.7,11.0,3.7,0.6,3153550,52508,21841,6.983989


In [3]:
# Drop columns with multiple NaN values
df_us_data.dropna(axis=1, inplace=True)
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",Home Value $500k+ (%),Median Home Value ($)(1000X),Median Housing Cost (monthly) ($),...,Income $150k+ (%),Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration
0,2010,3,Alabama,9.1,23.6,61.7,8.7,3.7,142.7,1130,...,8.7,61.964,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168
1,2010,4,Alaska,2.0,2.9,28.1,26.5,8.1,255.7,1772,...,19.1,94.747,84.4,14.9,10.4,4.5,0.7,713910,3661,1598
2,2010,4,Arizona,4.0,13.4,53.4,14.4,6.8,177.0,1442,...,10.2,66.539,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672
3,2010,3,Arkansas,12.0,27.0,67.1,6.4,2.1,122.6,987,...,7.1,59.393,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270
4,2010,4,California,1.5,3.4,18.0,29.1,34.0,377.7,2242,...,21.2,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,2.6,6.4,55.1,24.6,11.4,247.0,1471,...,18.4,80.372,89.1,10.5,8.5,2.0,0.4,21244317,280704,267130
464,2018,1,Massachusetts,1.0,0.8,26.7,36.2,35.4,406.9,2207,...,37.3,121.565,81.5,18.1,15.8,2.3,0.5,6882635,22846,11129
465,2018,3,District of Columbia,0.8,0.6,8.9,27.1,62.7,627.0,2506,...,50.6,151.147,84.2,15.6,12.3,3.3,0.2,701547,6641,2431
466,2018,4,Utah,1.9,1.2,44.5,37.5,14.9,310.0,1531,...,19.3,93.060,84.7,14.7,11.0,3.7,0.6,3153550,52508,21841


In [4]:
# List columns for calculations
df_us_data.columns.to_list()

['Year',
 'Region',
 'NAME',
 'Home Value <$50k (%)',
 'Home Value $50k-$99,999k (%)',
 'Home Value $100k-$299,999 (%)',
 'Home Value $300k-$499,999k (%)',
 'Home Value $500k+ (%)',
 'Median Home Value ($)(1000X)',
 'Median Housing Cost (monthly) ($)',
 'Income <$10k (%)',
 'Income $10k-$24,999 (%)',
 'Income $25k-$34,999 (%)',
 'Income $35k-$49,999 (%)',
 'Income $50k-$74,999 (%)',
 'Income $75k-$99,999 (%)',
 'Income $100k-$149,999 (%)',
 'Income $150k+ (%)',
 'Median Income ($)(1000X)',
 'No 2nd or Equity Loan (%)',
 '2nd Mortgage or Equity Loan (%)',
 'Equity Loan Only (%)',
 '2nd Mortgage Only (%)',
 'Both 2nd & Equity Loan (%)',
 'Population',
 'Net Population Change',
 'Net Migration']

In [5]:
# Create array for the number of medium incomes needed for monthly housing cost
cost_to_income = []
for i in range(len(df_us_data)):
    cost_to_income.append(df_us_data['Median Housing Cost (monthly) ($)'][i] / (df_us_data['Median Income ($)(1000X)'][i] * 1000 / 12))
    i += 1
cost_to_income

[0.21883674391582209,
 0.22442926952832276,
 0.2600580110912397,
 0.19941743976562892,
 0.3041924833793135,
 0.24378090382920004,
 0.2631657087106832,
 0.24226671470482264,
 0.24546280299926976,
 0.290839989693378,
 0.247389653535833,
 0.3025085249333198,
 0.23368825160369464,
 0.2574740062747945,
 0.2066350710900474,
 0.20124749911733553,
 0.2103178532527973,
 0.20918093565539783,
 0.20479859123926922,
 0.2350474106491612,
 0.2464924346629986,
 0.2580072865515603,
 0.23873957367933277,
 0.2341060720126684,
 0.2200499314321882,
 0.21519928388281168,
 0.2348249746747922,
 0.20556383786672666,
 0.27351283656856606,
 0.2636283878310768,
 0.28444551128180506,
 0.230426378260939,
 0.26825490821300046,
 0.2313493838394744,
 0.18576054432922082,
 0.22452473195783404,
 0.20831473570107759,
 0.26873810673407367,
 0.2288348355764086,
 0.2761956072319046,
 0.22502270301272964,
 0.20239438477206453,
 0.22623861255906855,
 0.22451457930206178,
 0.24206762577775276,
 0.25240909487903573,
 0.23831195

In [6]:
# Create "Cost_to_income_ratio" column
df_us_data['Cost_to_income_ratio'] = cost_to_income
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",Home Value $500k+ (%),Median Home Value ($)(1000X),Median Housing Cost (monthly) ($),...,Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Cost_to_income_ratio
0,2010,3,Alabama,9.1,23.6,61.7,8.7,3.7,142.7,1130,...,61.964,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0.218837
1,2010,4,Alaska,2.0,2.9,28.1,26.5,8.1,255.7,1772,...,94.747,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0.224429
2,2010,4,Arizona,4.0,13.4,53.4,14.4,6.8,177.0,1442,...,66.539,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0.260058
3,2010,3,Arkansas,12.0,27.0,67.1,6.4,2.1,122.6,987,...,59.393,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0.199417
4,2010,4,California,1.5,3.4,18.0,29.1,34.0,377.7,2242,...,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,0.304192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,2.6,6.4,55.1,24.6,11.4,247.0,1471,...,80.372,89.1,10.5,8.5,2.0,0.4,21244317,280704,267130,0.219629
464,2018,1,Massachusetts,1.0,0.8,26.7,36.2,35.4,406.9,2207,...,121.565,81.5,18.1,15.8,2.3,0.5,6882635,22846,11129,0.217859
465,2018,3,District of Columbia,0.8,0.6,8.9,27.1,62.7,627.0,2506,...,151.147,84.2,15.6,12.3,3.3,0.2,701547,6641,2431,0.198959
466,2018,4,Utah,1.9,1.2,44.5,37.5,14.9,310.0,1531,...,93.060,84.7,14.7,11.0,3.7,0.6,3153550,52508,21841,0.197421


In [54]:
# Find median housing cost per year
avg_median_housing = df_us_data['Median Housing Cost (monthly) ($)'].groupby(df_us_data['Year']).mean()
avg_median_housing_df = pd.DataFrame(avg_median_housing)
avg_median_housing_df.reset_index(inplace=True)
avg_median_housing_df

Unnamed: 0,Year,Median Housing Cost (monthly) ($)
0,2010,1458.153846
1,2011,1454.980769
2,2012,1434.173077
3,2013,1412.788462
4,2014,1428.711538
5,2015,1451.826923
6,2016,1459.980769
7,2017,1484.442308
8,2018,1530.192308


In [53]:
# Create array for binary category
leave = []
for j in range(len(avg_median_housing_df)):
    for i in range(len(df_us_data)):
        if df_us_data['Year'][i] == avg_median_housing_df['Year'][j]:
            if df_us_data['Median Housing Cost (monthly) ($)'][i] > avg_median_housing_df['Median Housing Cost (monthly) ($)'][j] and df_us_data['Cost_to_income_ratio'][i] > 0.2:
                    leave.append(1)
            else:
                leave.append(0)
        i += 1     
    j += 1
leave

[0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,


In [8]:
# Create binary column in df_us_data
df_us_data['Leave'] = leave
df_us_data

Unnamed: 0,Year,Region,NAME,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",Home Value $500k+ (%),Median Home Value ($)(1000X),Median Housing Cost (monthly) ($),...,No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Cost_to_income_ratio,Leave
0,2010,3,Alabama,9.1,23.6,61.7,8.7,3.7,142.7,1130,...,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0.218837,0
1,2010,4,Alaska,2.0,2.9,28.1,26.5,8.1,255.7,1772,...,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0.224429,0
2,2010,4,Arizona,4.0,13.4,53.4,14.4,6.8,177.0,1442,...,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0.260058,0
3,2010,3,Arkansas,12.0,27.0,67.1,6.4,2.1,122.6,987,...,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0.199417,0
4,2010,4,California,1.5,3.4,18.0,29.1,34.0,377.7,2242,...,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,0.304192,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,3,Florida,2.6,6.4,55.1,24.6,11.4,247.0,1471,...,89.1,10.5,8.5,2.0,0.4,21244317,280704,267130,0.219629,0
464,2018,1,Massachusetts,1.0,0.8,26.7,36.2,35.4,406.9,2207,...,81.5,18.1,15.8,2.3,0.5,6882635,22846,11129,0.217859,0
465,2018,3,District of Columbia,0.8,0.6,8.9,27.1,62.7,627.0,2506,...,84.2,15.6,12.3,3.3,0.2,701547,6641,2431,0.198959,0
466,2018,4,Utah,1.9,1.2,44.5,37.5,14.9,310.0,1531,...,84.7,14.7,11.0,3.7,0.6,3153550,52508,21841,0.197421,0


In [9]:
# Create States DataFrame with names of states
state_names_df = df_us_data[['Year', 'Region', 'NAME']]
state_names_df.head()

Unnamed: 0,Year,Region,NAME
0,2010,3,Alabama
1,2010,4,Alaska
2,2010,4,Arizona
3,2010,3,Arkansas
4,2010,4,California


In [10]:
# Remove Region, NAME and Cost_to_income_ratio columns to prepare for ML
clean_us_data_df = df_us_data.drop(columns=['Region', 'NAME', 'Cost_to_income_ratio'])
clean_us_data_df

Unnamed: 0,Year,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",Home Value $500k+ (%),Median Home Value ($)(1000X),Median Housing Cost (monthly) ($),Income <$10k (%),"Income $10k-$24,999 (%)",...,Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Leave
0,2010,9.1,23.6,61.7,8.7,3.7,142.7,1130,3.9,10.4,...,61.964,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168,0
1,2010,2.0,2.9,28.1,26.5,8.1,255.7,1772,1.2,4.0,...,94.747,84.4,14.9,10.4,4.5,0.7,713910,3661,1598,0
2,2010,4.0,13.4,53.4,14.4,6.8,177.0,1442,2.8,8.8,...,66.539,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672,0
3,2010,12.0,27.0,67.1,6.4,2.1,122.6,987,2.8,10.9,...,59.393,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270,0
4,2010,1.5,3.4,18.0,29.1,34.0,377.7,2242,2.1,5.9,...,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,2018,2.6,6.4,55.1,24.6,11.4,247.0,1471,2.8,7.3,...,80.372,89.1,10.5,8.5,2.0,0.4,21244317,280704,267130,0
464,2018,1.0,0.8,26.7,36.2,35.4,406.9,2207,1.3,3.1,...,121.565,81.5,18.1,15.8,2.3,0.5,6882635,22846,11129,0
465,2018,0.8,0.6,8.9,27.1,62.7,627.0,2506,2.2,2.8,...,151.147,84.2,15.6,12.3,3.3,0.2,701547,6641,2431,0
466,2018,1.9,1.2,44.5,37.5,14.9,310.0,1531,1.0,3.7,...,93.060,84.7,14.7,11.0,3.7,0.6,3153550,52508,21841,0


In [11]:
# Check datatypes for all columns in preparation of ML
clean_us_data_df.dtypes

Year                                   int64
Home Value <$50k (%)                 float64
Home Value $50k-$99,999k (%)         float64
Home Value $100k-$299,999 (%)        float64
Home Value $300k-$499,999k (%)       float64
Home Value $500k+ (%)                float64
Median Home Value ($)(1000X)         float64
Median Housing Cost (monthly) ($)      int64
Income <$10k (%)                     float64
Income $10k-$24,999 (%)              float64
Income $25k-$34,999 (%)              float64
Income $35k-$49,999 (%)              float64
Income $50k-$74,999 (%)              float64
Income $75k-$99,999 (%)              float64
Income $100k-$149,999 (%)            float64
Income $150k+ (%)                    float64
Median Income ($)(1000X)             float64
No 2nd or Equity Loan (%)            float64
2nd Mortgage or Equity Loan (%)      float64
Equity Loan Only (%)                 float64
2nd Mortgage Only (%)                float64
Both 2nd & Equity Loan (%)           float64
Population

## Preprocess data

In [12]:
# Define the features set
X = clean_us_data_df.copy()
X = X.drop('Leave', axis=1)
X.head()

Unnamed: 0,Year,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",Home Value $500k+ (%),Median Home Value ($)(1000X),Median Housing Cost (monthly) ($),Income <$10k (%),"Income $10k-$24,999 (%)",...,Income $150k+ (%),Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration
0,2010,9.1,23.6,61.7,8.7,3.7,142.7,1130,3.9,10.4,...,8.7,61.964,82.3,17.2,12.3,4.9,0.6,4785437,5312,2168
1,2010,2.0,2.9,28.1,26.5,8.1,255.7,1772,1.2,4.0,...,19.1,94.747,84.4,14.9,10.4,4.5,0.7,713910,3661,1598
2,2010,4.0,13.4,53.4,14.4,6.8,177.0,1442,2.8,8.8,...,10.2,66.539,77.4,21.8,16.0,5.8,0.8,6407172,14884,5672
3,2010,12.0,27.0,67.1,6.4,2.1,122.6,987,2.8,10.9,...,7.1,59.393,89.3,10.5,6.4,4.1,0.2,2921964,5933,3270
4,2010,1.5,3.4,18.0,29.1,34.0,377.7,2242,2.1,5.9,...,21.2,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721


In [13]:
# Define the target set
y = clean_us_data_df['Leave'].ravel()
y[:5]

array([0, 0, 0, 0, 1])

In [14]:
# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [15]:
# Create instance of StandardScaler
scaler = StandardScaler()
# Fit StandardScaler with training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fit the Random Forest Model

In [16]:
# Create a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Make predictions using test data
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0])

## Evaluate the Model

In [17]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [18]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,112,1
Actual 1,2,2


Accuracy Score: 0.9743589743589743
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       113
           1       0.67      0.50      0.57         4

    accuracy                           0.97       117
   macro avg       0.82      0.75      0.78       117
weighted avg       0.97      0.97      0.97       117



In [19]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.00837592, 0.03961154, 0.00634404, 0.12341013, 0.01309729,
       0.03076381, 0.02533942, 0.04073077, 0.02209869, 0.03637046,
       0.07268456, 0.03378648, 0.02015135, 0.0673789 , 0.04027455,
       0.05472426, 0.08427668, 0.03521012, 0.03474107, 0.02159516,
       0.0081501 , 0.15436024, 0.01284489, 0.00588615, 0.00779339])

In [20]:
# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1543602413824768, 'Both 2nd & Equity Loan (%)'),
 (0.12341013386223289, 'Home Value $100k-$299,999 (%)'),
 (0.08427668393790247, 'Median Income ($)(1000X)'),
 (0.07268456358094519, 'Income $25k-$34,999 (%)'),
 (0.06737890203718679, 'Income $75k-$99,999 (%)'),
 (0.05472426396509949, 'Income $150k+ (%)'),
 (0.04073077336100674, 'Median Housing Cost (monthly) ($)'),
 (0.04027455318112779, 'Income $100k-$149,999 (%)'),
 (0.039611535607350236, 'Home Value <$50k (%)'),
 (0.03637045871458566, 'Income $10k-$24,999 (%)'),
 (0.03521012227990788, 'No 2nd or Equity Loan (%)'),
 (0.034741065789845295, '2nd Mortgage or Equity Loan (%)'),
 (0.03378648237299679, 'Income $35k-$49,999 (%)'),
 (0.030763812608851724, 'Home Value $500k+ (%)'),
 (0.025339424178118485, 'Median Home Value ($)(1000X)'),
 (0.022098688943051427, 'Income <$10k (%)'),
 (0.0215951610317818, 'Equity Loan Only (%)'),
 (0.020151350103431012, 'Income $50k-$74,999 (%)'),
 (0.013097293601139735, 'Home Value $300k-$499,999k (%)'),
 (0

In [24]:
# Create a California data set to predict
df_ca = state_names_df[state_names_df['NAME'] == 'California']
df_ca

Unnamed: 0,Year,Region,NAME
4,2010,4,California
56,2011,4,California
108,2012,4,California
160,2013,4,California
212,2014,4,California
264,2015,4,California
316,2016,4,California
368,2017,4,California
425,2018,4,California


In [28]:
ca_list = df_ca.index.tolist()
ca_list

[4, 56, 108, 160, 212, 264, 316, 368, 425]

In [29]:
df_ca_clean_data = clean_us_data_df.loc[ca_list, :]
df_ca_clean_data

Unnamed: 0,Year,Home Value <$50k (%),"Home Value $50k-$99,999k (%)","Home Value $100k-$299,999 (%)","Home Value $300k-$499,999k (%)",Home Value $500k+ (%),Median Home Value ($)(1000X),Median Housing Cost (monthly) ($),Income <$10k (%),"Income $10k-$24,999 (%)",...,Median Income ($)(1000X),No 2nd or Equity Loan (%),2nd Mortgage or Equity Loan (%),Equity Loan Only (%),2nd Mortgage Only (%),Both 2nd & Equity Loan (%),Population,Net Population Change,Net Migration,Leave
4,2010,1.5,3.4,18.0,29.1,34.0,377.7,2242,2.1,5.9,...,88.444,72.2,26.3,18.6,7.6,1.6,37319502,64983,-721,1
56,2011,1.6,3.8,19.8,28.5,32.2,363.6,2182,2.3,6.0,...,89.279,75.6,23.0,16.6,6.3,1.4,37638369,318867,47853,1
108,2012,1.7,4.1,20.6,28.5,31.3,358.1,2119,2.3,5.7,...,91.024,78.0,20.9,15.3,5.6,1.1,37948800,310431,56502,0
160,2013,2.8,3.2,16.9,29.2,34.9,383.2,2059,2.2,5.6,...,93.538,80.4,18.5,13.8,4.7,1.1,38260787,311987,62836,0
212,2014,2.7,2.1,12.4,30.0,40.2,427.7,2068,2.1,5.4,...,96.305,82.1,17.0,12.9,4.1,0.8,38596972,336185,84680,0
264,2015,1.1,1.7,22.2,30.5,44.6,462.0,2123,2.0,5.1,...,100.223,83.6,15.7,12.0,3.6,0.8,38918045,321073,76932,0
316,2016,1.1,1.4,19.4,29.4,48.8,491.1,2188,2.0,4.7,...,103.266,84.2,15.2,12.0,3.1,0.7,39167117,249072,19523,0
368,2017,1.0,1.1,16.6,28.5,52.9,529.0,2269,1.9,4.5,...,108.609,85.8,13.6,10.8,2.8,0.5,39358497,191380,-24732,0
425,2018,1.0,1.0,14.1,27.1,56.8,566.1,2345,2.0,4.1,...,113.027,86.6,12.9,10.3,2.6,0.6,39461588,103091,-83981,0
