In [1]:
import pandas as pd
import numpy as np
from category_encoders import BinaryEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# read in the cleaned data for machine learning
ml_df = pd.read_csv('../CSV Files/Combined_ML.csv')
ml_df.head()

Unnamed: 0,month_date_yyyymm,postal_code,nielsen_hh_rank,hotness_rank,hotness_rank_mm,hotness_rank_yy,hotness_score,supply_score,demand_score,median_days_on_market_x,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,months_since_reference
0,2023-09-01,14450,998.0,1.0,-5.0,0.0,99.978698,99.974438,99.982958,7.0,...,439999.0,-0.109,0.1193,26.0,0.1556,0.0,0.1667,0.0691,-0.0476,73
1,2023-09-01,4106,3627.0,2.0,-2.0,0.0,99.923313,99.906271,99.940354,9.0,...,669841.0,0.0503,0.0367,27.0,0.2857,0.6875,0.2727,0.1675,0.0327,73
2,2023-09-01,14624,1837.0,3.0,-4.0,1.0,99.919052,99.965917,99.872188,8.0,...,293467.0,0.097,0.1037,20.0,-0.0476,-0.1837,0.1765,0.0376,0.0602,73
3,2023-09-01,6111,3095.0,4.0,-1064.0,-15.0,99.897751,99.838105,99.957396,10.0,...,307199.0,-0.0268,-0.0417,65.0,-0.0299,-0.3085,3.2667,-0.0146,0.9274,73
4,2023-09-01,1970,884.0,5.0,-7.0,-5.0,99.89349,99.88923,99.897751,9.5,...,543278.0,0.0739,0.0137,35.0,0.4583,-0.3069,0.4,0.0176,0.0667,73


In [3]:
ml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 926158 entries, 0 to 926157
Data columns (total 45 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   month_date_yyyymm                        926158 non-null  object 
 1   postal_code                              926158 non-null  int64  
 2   nielsen_hh_rank                          926158 non-null  float64
 3   hotness_rank                             926158 non-null  float64
 4   hotness_rank_mm                          926158 non-null  float64
 5   hotness_rank_yy                          926158 non-null  float64
 6   hotness_score                            926158 non-null  float64
 7   supply_score                             926158 non-null  float64
 8   demand_score                             926158 non-null  float64
 9   median_days_on_market_x                  926158 non-null  float64
 10  median_days_on_market_mm_x      

In [4]:
ml_df = ml_df.drop('month_date_yyyymm', axis=1)

In [5]:
# shift the hotness score to create teh target variable for the next period
ml_df['hotness_score_next_period'] = ml_df['hotness_score'].shift(-1)

# drop the last row since it will have NaN for the target variable
ml_df = ml_df[:-1]

# check that the column has been created.
ml_df.head()

Unnamed: 0,postal_code,nielsen_hh_rank,hotness_rank,hotness_rank_mm,hotness_rank_yy,hotness_score,supply_score,demand_score,median_days_on_market_x,median_days_on_market_mm_x,...,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,months_since_reference,hotness_score_next_period
0,14450,998.0,1.0,-5.0,0.0,99.978698,99.974438,99.982958,7.0,-0.333333,...,-0.109,0.1193,26.0,0.1556,0.0,0.1667,0.0691,-0.0476,73,99.923313
1,4106,3627.0,2.0,-2.0,0.0,99.923313,99.906271,99.940354,9.0,-0.1,...,0.0503,0.0367,27.0,0.2857,0.6875,0.2727,0.1675,0.0327,73,99.919052
2,14624,1837.0,3.0,-4.0,1.0,99.919052,99.965917,99.872188,8.0,0.032258,...,0.097,0.1037,20.0,-0.0476,-0.1837,0.1765,0.0376,0.0602,73,99.897751
3,6111,3095.0,4.0,-1064.0,-15.0,99.897751,99.838105,99.957396,10.0,-0.72973,...,-0.0268,-0.0417,65.0,-0.0299,-0.3085,3.2667,-0.0146,0.9274,73,99.89349
4,1970,884.0,5.0,-7.0,-5.0,99.89349,99.88923,99.897751,9.5,-0.173913,...,0.0739,0.0137,35.0,0.4583,-0.3069,0.4,0.0176,0.0667,73,99.88923


In [6]:
# use binary encoder for the postal codes
binary_encoder = BinaryEncoder(cols=['postal_code'])

encoded_data = binary_encoder.fit_transform(ml_df['postal_code'])

ml_df_encoded = pd.concat([ml_df.drop('postal_code', axis=1), encoded_data], axis=1)

ml_df_encoded.head()

Unnamed: 0,nielsen_hh_rank,hotness_rank,hotness_rank_mm,hotness_rank_yy,hotness_score,supply_score,demand_score,median_days_on_market_x,median_days_on_market_mm_x,median_dom_mm_day,...,postal_code_5,postal_code_6,postal_code_7,postal_code_8,postal_code_9,postal_code_10,postal_code_11,postal_code_12,postal_code_13,postal_code_14
0,998.0,1.0,-5.0,0.0,99.978698,99.974438,99.982958,7.0,-0.333333,-3.5,...,0,0,0,0,0,0,0,0,0,1
1,3627.0,2.0,-2.0,0.0,99.923313,99.906271,99.940354,9.0,-0.1,-1.0,...,0,0,0,0,0,0,0,0,1,0
2,1837.0,3.0,-4.0,1.0,99.919052,99.965917,99.872188,8.0,0.032258,0.25,...,0,0,0,0,0,0,0,0,1,1
3,3095.0,4.0,-1064.0,-15.0,99.897751,99.838105,99.957396,10.0,-0.72973,-27.0,...,0,0,0,0,0,0,0,1,0,0
4,884.0,5.0,-7.0,-5.0,99.89349,99.88923,99.897751,9.5,-0.173913,-2.0,...,0,0,0,0,0,0,0,1,0,1


In [7]:
# scale the non-encoded and non-target columns
features_scale = [col for col in ml_df_encoded.columns if 'postal_code' not in col and col != 'hotness_score_next_period']

scaler = StandardScaler()

ml_df_encoded[features_scale] = scaler.fit_transform(ml_df_encoded[features_scale])

print(ml_df_encoded.head())

   nielsen_hh_rank  hotness_rank  hotness_rank_mm  hotness_rank_yy  \
0        -1.207238     -1.623798        -0.013019         0.112626   
1        -0.716384     -1.623549        -0.011041         0.112626   
2        -1.050590     -1.623300        -0.012360         0.113058   
3        -0.815712     -1.623050        -0.711472         0.106151   
4        -1.228523     -1.622801        -0.014338         0.110468   

   hotness_score  supply_score  demand_score  median_days_on_market_x  \
0       2.054973      1.741926      1.730009                -1.677816   
1       2.052697      1.739557      1.728531                -1.616688   
2       2.052522      1.741630      1.726167                -1.647252   
3       2.051647      1.737189      1.729122                -1.586125   
4       2.051472      1.738965      1.727053                -1.601407   

   median_days_on_market_mm_x  median_dom_mm_day  ...  postal_code_5  \
0                   -1.313531          -0.219221  ...              0