In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_selection import VarianceThreshold

In [2]:
pd.options.display.max_columns = None

In [3]:
df = pd.read_csv('/Users/jordonez/Desktop/Brainstation/CapstoneRepo/data/checkpoint3.csv')

In [4]:
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df['date'] = pd.to_datetime(df['date'])

In [5]:
# we need to standardize by taking into account inflation rates for different years in the us
# https://www.usinflationcalculator.com/
# this how much $1 in 2024 would be in the stated year

new_price_ratios = {
    2012: 0.73,
    2013: 0.74,
    2014: 0.75,
    2015: 0.75,
    2016: 0.76,
    2017: 0.78,
    2018: 0.80,
    2019: 0.81,
    2020: 0.82,
    2021: 0.86,
    2022: 0.93,
    2023: 0.97,
    2024: 1.00
}

Objective: Adjust all of the prices in the data to match they would be in 2024 (accounting for inflation). 
We need to adjust the following prices:

1. median_sale_price
2. median_list_price
3. median_ppsf
4. median_list_ppsf

In [6]:
# we need to make a function given a month, year an inflation rate returns the adjusted price

def adjustPriceForInflation(oldPrice, year):
    new_price = round(oldPrice * new_price_ratios[year],2)
    return new_price

In [7]:
df_copy = df.copy(deep=True)

In [8]:
# Apply the inflation adjustment to the specified columns
df_copy['median_sale_price'] = df_copy.apply(lambda row: adjustPriceForInflation(row['median_sale_price'], row['year']), axis=1)
df_copy['median_list_price'] = df_copy.apply(lambda row: adjustPriceForInflation(row['median_list_price'], row['year']), axis=1)
df_copy['median_ppsf'] = df_copy.apply(lambda row: adjustPriceForInflation(row['median_ppsf'], row['year']), axis=1)
df_copy['median_list_ppsf'] = df_copy.apply(lambda row: adjustPriceForInflation(row['median_list_ppsf'], row['year']), axis=1)

In [9]:
# investigating other variables

In [10]:
df_copy['quarter'] = pd.to_datetime(df_copy['date']).dt.quarter

In [11]:
def get_season(month):
    if month in [12, 1, 2]:
        return 0
    elif month in [3, 4, 5]:
        return 1
    elif month in [6, 7, 8]:
        return 2
    else:
        return 3

df_copy['season'] = df_copy['month'].apply(get_season)

In [12]:
df_copy['price_diff'] = df_copy['median_list_price'] - df_copy['median_sale_price']

In [13]:
df_copy['ppsf_diff'] = df_copy['median_list_ppsf'] - df_copy['median_ppsf']

In [14]:
df_copy['sale_to_list_ratio'] = df_copy['median_sale_price'] / df_copy['median_list_price']

In [15]:
df_copy['price_growth'] = df_copy.groupby(['city', 'property_type'])['median_sale_price'].pct_change()

In [16]:
df_copy['owner_utility'] = (df_copy['median_list_price'] - df_copy['median_sale_price']) / df_copy['median_list_price']

In [17]:
df_copy['buyer_utility'] = (df_copy['median_sale_price'] - df_copy['median_list_price']) / df_copy['median_sale_price']

In [18]:
df_copy['pending_sales_ratio'] = df_copy['pending_sales'] / df_copy['new_listings']

In [19]:
df_copy['sales_success_rate'] = df_copy['homes_sold'] / df_copy['new_listings']

In [20]:
df_copy['inventory_turnover'] = df_copy['homes_sold'] / df_copy['inventory']

In [21]:
df_copy['pct_sold_above_list'] = df_copy['sold_above_list'] / df_copy['homes_sold']

In [22]:
df_copy['price_drop_rate'] = df_copy['price_drops'] / df_copy['new_listings']

In [23]:
df_copy['adjusted_months_supply'] = df_copy['months_of_supply'] / df_copy['homes_sold']

In [24]:
df_copy['political_stance_encoded'] = df_copy['senate_political_stance'].map({'democratic': 1, 'republican': 0, 'mixed': 2})

In [25]:
df_copy['supply_to_list_ratio'] = df_copy['inventory'] / df_copy['new_listings']

In [26]:
df_copy = pd.get_dummies(df_copy, columns=['property_type'], dtype='int')

In [27]:
df_copy['state_avg_sale_price'] = df_copy.groupby('state')['median_sale_price'].transform('mean')

In [28]:
df_copy['metro_region_inventory_change'] = df_copy.groupby('parent_metro_region')['inventory'].pct_change()

In [29]:
df_copy['dom_per_home'] = df_copy['median_dom'] / df_copy['homes_sold']

In [30]:
df_copy['inventory_to_pending_ratio'] = df_copy['inventory'] / df_copy['pending_sales']

In [31]:
df_copy['rolling_median_sale_price'] = df_copy['median_sale_price'].rolling(window=3).mean()

In [32]:
df_copy['rolling_median_list_price'] = df_copy['median_list_price'].rolling(window=3).mean()

In [33]:
df_copy['price_momentum'] = df_copy['median_sale_price'].diff() / df_copy['median_sale_price'].shift(1)

In [34]:
df_copy['supply_pressure'] = df_copy['new_listings'] / df_copy['homes_sold']

In [35]:
df_copy['demand_pressure'] = df_copy['pending_sales'] / df_copy['inventory']

In [36]:
df_copy['price_elasticity'] = df_copy['pending_sales'].pct_change() / df_copy['median_sale_price'].pct_change()

In [37]:
us_regions = {
    'New England': ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut'],
    'Mid-Atlantic': ['New York', 'New Jersey', 'Pennsylvania'],
    'East North Central': ['Ohio', 'Michigan', 'Indiana', 'Illinois', 'Wisconsin'],
    'West North Central': ['Minnesota', 'Iowa', 'Missouri', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas'],
    'South Atlantic': ['Delaware', 'Maryland', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida'],
    'East South Central': ['Kentucky', 'Tennessee', 'Mississippi', 'Alabama'],
    'West South Central': ['Arkansas', 'Louisiana', 'Texas', 'Oklahoma'],
    'Mountain': ['Montana', 'Idaho', 'Wyoming', 'Colorado', 'New Mexico', 'Arizona', 'Utah', 'Nevada'],
    'Pacific': ['Washington', 'Oregon', 'California', 'Alaska', 'Hawaii']
}


In [38]:
def binByUSRegion(state):
    for region, states in us_regions.items():
        if state in states:
            return region

In [39]:
df_copy['us_region'] = df_copy.apply(lambda row: binByUSRegion(row['state']), axis=1)

In [40]:
df_copy.us_region.unique()

array(['South Atlantic', 'Pacific', 'Mid-Atlantic', 'East South Central',
       'East North Central', 'West North Central', 'Mountain',
       'West South Central', 'New England'], dtype=object)

In [41]:
# make dummies of the us_region
df_copy = pd.get_dummies(df_copy, columns=['us_region'], dtype='int')

In [42]:
# drop columns with repeated data
df_copy = df_copy.drop(columns=['city', 'state','parent_metro_region','senate_political_stance'])

In [43]:
df_copy.isna().sum()

median_sale_price                              0
median_list_price                              0
median_ppsf                                    0
median_list_ppsf                               0
homes_sold                                     0
pending_sales                                  0
new_listings                                   0
inventory                                      0
months_of_supply                               0
median_dom                                     0
avg_sale_to_list                               0
sold_above_list                                0
price_drops                                    0
off_market_in_two_weeks                        0
year                                           0
month                                          0
date                                           0
quarter                                        0
season                                         0
price_diff                                     0
ppsf_diff           

In [44]:
df_copy = df_copy.dropna()

In [45]:
df_copy.isna().sum()

median_sale_price                          0
median_list_price                          0
median_ppsf                                0
median_list_ppsf                           0
homes_sold                                 0
pending_sales                              0
new_listings                               0
inventory                                  0
months_of_supply                           0
median_dom                                 0
avg_sale_to_list                           0
sold_above_list                            0
price_drops                                0
off_market_in_two_weeks                    0
year                                       0
month                                      0
date                                       0
quarter                                    0
season                                     0
price_diff                                 0
ppsf_diff                                  0
sale_to_list_ratio                         0
price_grow

In [46]:
df_copy.describe()

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


Unnamed: 0,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,median_dom,avg_sale_to_list,sold_above_list,price_drops,off_market_in_two_weeks,year,month,date,quarter,season,price_diff,ppsf_diff,sale_to_list_ratio,price_growth,owner_utility,buyer_utility,pending_sales_ratio,sales_success_rate,inventory_turnover,pct_sold_above_list,price_drop_rate,adjusted_months_supply,political_stance_encoded,supply_to_list_ratio,property_type_All Residential,property_type_Condo/Co-op,property_type_Multi-Family (2-4 Unit),property_type_Single Family Residential,property_type_Townhouse,state_avg_sale_price,metro_region_inventory_change,dom_per_home,inventory_to_pending_ratio,rolling_median_sale_price,rolling_median_list_price,price_momentum,supply_pressure,demand_pressure,price_elasticity,us_region_East North Central,us_region_East South Central,us_region_Mid-Atlantic,us_region_Mountain,us_region_New England,us_region_Pacific,us_region_South Atlantic,us_region_West North Central,us_region_West South Central
count,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0,5427216.0
mean,261621.9,287439.4,160.9351,175.7599,14.70899,14.00407,19.66011,47.14562,4.168112,99.13312,0.9737626,0.2326873,0.2654781,0.2782676,2018.096,6.510994,2018-07-22 02:43:56.790574336,2.499805,1.517455,25817.52,14.82488,0.9863783,1.983274,0.0136217,-2.128982,1.517052,0.9479263,0.497149,0.08587737,0.07821524,2.075089,1.089802,3.321353,0.3869271,0.0973114,0.05613762,0.3794616,0.08016228,260501.9,5.362082,53.81633,5.842027,261485.6,287341.9,4.73785,3.36254,1.049135,,0.1668616,0.04836365,0.1836098,0.061765,0.07216444,0.13635,0.1843527,0.07248283,0.07404994
min,0.73,217.5,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.5,0.0,0.0001090869,0.0,2012.0,1.0,2012-01-01 00:00:00,1.0,0.0,-79792000.0,-11190510.0,3.271188e-06,-0.9999977,-2700.149,-305698.4,0.00408413,0.003076923,0.0008598452,0.0,6.348213e-08,0.0,0.0,0.009429197,0.0,0.0,0.0,0.0,0.0,130778.3,-0.999952,0.002396645,0.01097303,4175.0,15813.33,-0.9999991,0.01115242,0.001169591,-inf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,106400.0,130117.2,67.7,78.13,1.0,2.0,3.0,5.0,1.7,29.0,0.9547502,0.0,0.1818182,0.0,2015.0,4.0,2015-09-01 00:00:00,2.0,1.0,-19125.0,-7.35,0.6850439,-0.2180451,-0.1045545,-0.4597605,0.4863823,0.4,0.1666667,0.0,0.01100324,0.2,1.0,1.305875,0.0,0.0,0.0,0.0,0.0,160228.9,-0.7088608,3.1875,1.283178,141825.0,166837.5,-0.5003125,0.9166667,0.1538462,-1.214073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,179310.0,207629.4,105.64,116.54,4.0,5.0,7.0,15.0,3.0,58.0,0.98,0.125,0.25,0.223784,2018.0,6.0,2018-09-01 00:00:00,2.0,2.0,14527.0,6.56,0.9214286,0.009677419,0.07857143,-0.08527132,0.7419355,0.7727273,0.3282828,0.004155125,0.03061224,0.75,1.0,2.333333,0.0,0.0,0.0,0.0,0.0,225078.7,0.0,12.6,3.0,209250.0,235950.0,0.0003711058,1.294118,0.3333333,0.08043291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,306660.0,336150.0,171.36,183.11,11.0,12.62492,18.0,39.20478,5.0,108.0,1.0,0.375,0.3217209,0.4444444,2021.0,9.0,2021-08-01 00:00:00,3.0,2.0,67754.0,27.21,1.104555,0.3100775,0.3149561,0.09465763,1.0,1.090909,0.5789474,0.04081633,0.09375,2.157346,2.0,4.0,1.0,0.0,0.0,1.0,0.0,316538.4,2.485604,47.0,6.5,314483.3,341504.7,1.009174,2.5,0.779315,1.682952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,82450000.0,750000000.0,11190690.0,36500000.0,4579.0,4112.0,7224.0,25592.0,1163.0,41391.0,2.0,1.0,1.0,2.0,2024.0,12.0,2024-06-01 00:00:00,4.0,3.0,749941400.0,36499930.0,2701.149,399999.0,0.9999967,0.9996298,91.13253,89.66667,81.0,1.0,1.0,1163.0,2.0,322.4649,1.0,1.0,1.0,1.0,1.0,613575.8,20809.0,36643.0,855.0,28345020.0,250271000.0,1784999.0,325.0,91.13253,inf,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,369577.4,988573.8,5465.162,22373.67,56.25032,48.6008,69.71796,197.7291,4.885,216.618,0.05956018,0.2932357,0.1508977,0.2852127,3.485524,3.388598,,1.102557,1.100645,967705.4,23024.65,2.569277,524.7621,2.569277,514.7714,3.288263,1.009584,0.6924583,0.224565,0.1188391,4.467248,0.757927,4.287846,0.4870468,0.2963814,0.2301873,0.4852531,0.2715443,118269.2,47.41926,189.8412,9.813178,226234.7,573249.2,1427.127,6.151635,2.705811,,0.3728523,0.2145335,0.3871657,0.2407283,0.25876,0.3431599,0.3877716,0.2592857,0.2618522


In [47]:
elasticity_is_inf = (df_copy['price_elasticity'] == float('inf')) | (df_copy['price_elasticity'] == float('-inf'))
df_copy.loc[elasticity_is_inf]

Unnamed: 0,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,median_dom,avg_sale_to_list,sold_above_list,price_drops,off_market_in_two_weeks,year,month,date,quarter,season,price_diff,ppsf_diff,sale_to_list_ratio,price_growth,owner_utility,buyer_utility,pending_sales_ratio,sales_success_rate,inventory_turnover,pct_sold_above_list,price_drop_rate,adjusted_months_supply,political_stance_encoded,supply_to_list_ratio,property_type_All Residential,property_type_Condo/Co-op,property_type_Multi-Family (2-4 Unit),property_type_Single Family Residential,property_type_Townhouse,state_avg_sale_price,metro_region_inventory_change,dom_per_home,inventory_to_pending_ratio,rolling_median_sale_price,rolling_median_list_price,price_momentum,supply_pressure,demand_pressure,price_elasticity,us_region_East North Central,us_region_East South Central,us_region_Mid-Atlantic,us_region_Mountain,us_region_New England,us_region_Pacific,us_region_South Atlantic,us_region_West North Central,us_region_West South Central
2684,51100.0,68948.5,51.17,47.28,3.0,2.000000,10.000000,46.000000,15.3,57.0,0.940853,0.000000,0.022727,0.000000,2012,1,2012-01-01,1,0,17848.5,-3.89,0.741133,-0.199543,0.258867,-0.349286,0.200000,0.300000,0.065217,0.000000,0.002273,5.100000,2,4.600000,1,0,0,0,0,140394.243047,14.333333,19.000000,23.000000,48423.333333,95711.516667,0.0,3.333333,0.043478,-inf,1,0,0,0,0,0,0,0,0
8193,81030.0,100302.0,39.92,80.14,2.0,4.617643,2.000000,31.000000,15.5,170.0,0.978564,0.000000,0.174256,0.122225,2012,1,2012-01-01,1,0,19272.0,40.22,0.807860,-0.603571,0.192140,-0.237838,2.308821,1.000000,0.064516,0.000000,0.087128,7.750000,1,15.500000,1,0,0,0,0,286615.516753,0.068966,85.000000,6.713382,62780.000000,101421.333333,0.0,1.000000,0.148956,inf,0,0,1,0,0,0,0,0,0
14768,112237.5,109427.0,72.93,76.53,16.0,6.000000,12.915145,71.689593,5.3,92.0,0.981102,0.000000,0.188112,0.000000,2012,1,2012-01-01,1,0,-2810.5,3.60,1.025684,0.282319,-0.025684,0.025041,0.464571,1.238856,0.223184,0.000000,0.014565,0.331250,0,5.550816,0,0,0,1,0,149892.839862,-0.158911,5.750000,11.948265,155514.333333,177317.000000,0.0,0.807197,0.083694,inf,0,0,0,0,0,0,0,0,1
19928,109500.0,47012.0,47.78,33.93,2.0,4.617643,2.000000,21.000000,10.5,201.0,0.936888,0.000000,0.174256,0.122225,2012,1,2012-01-01,1,0,-62488.0,-13.85,2.329193,1.307692,-1.329193,0.570667,2.308821,1.000000,0.095238,0.000000,0.087128,5.250000,1,10.500000,1,0,0,0,0,286615.516753,6.000000,100.500000,4.547775,135475.833333,124781.333333,0.0,1.000000,0.219888,-inf,0,0,1,0,0,0,0,0,0
25177,65700.0,726350.0,77.48,199.77,1.0,4.617643,1.000000,56.000000,56.0,165.0,0.900901,0.000000,0.174256,0.122225,2012,2,2012-02-01,1,0,660650.0,122.29,0.090452,-0.217391,0.909548,-10.055556,4.617643,1.000000,0.017857,0.000000,0.174256,56.000000,1,56.000000,1,0,0,0,0,286615.516753,55.000000,165.000000,12.127400,71783.333333,310688.000000,0.0,1.000000,0.082458,inf,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5482396,345000.0,380990.0,271.72,219.37,2.0,10.490476,1.000000,1.000000,0.5,45.0,0.961003,0.000000,0.282157,0.147578,2024,6,2024-06-01,2,2,35990.0,-52.35,0.905536,0.169492,0.094464,-0.104319,10.490476,2.000000,2.000000,0.000000,0.282157,0.250000,0,1.000000,0,1,0,0,0,204226.249476,-0.981818,22.500000,0.095325,930000.000000,663478.333333,0.0,0.500000,10.490476,-inf,0,0,0,0,0,0,1,0,0
5482446,255000.0,282450.0,240.79,223.87,5.0,3.000000,6.000000,8.000000,1.6,34.0,0.968216,0.000000,0.250000,0.333333,2024,6,2024-06-01,2,2,27450.0,-16.92,0.902815,-0.046729,0.097185,-0.107647,0.500000,0.833333,0.625000,0.000000,0.041667,0.320000,1,1.333333,1,0,0,0,0,542114.131124,0.142857,6.800000,2.666667,263716.666667,324450.000000,0.0,1.200000,0.375000,-inf,0,0,0,0,0,1,0,0,0
5482495,785000.0,1350000.0,705.09,820.80,11.0,10.000000,9.000000,16.000000,1.5,25.0,1.023532,0.727273,0.250000,0.400000,2024,6,2024-06-01,2,2,565000.0,115.71,0.581481,-0.156368,0.418519,-0.719745,1.111111,1.222222,0.687500,0.066116,0.027778,0.136364,1,1.777778,0,1,0,0,0,542114.131124,-0.714286,2.272727,1.600000,600666.666667,749992.543333,0.0,0.818182,0.625000,-inf,0,0,0,0,0,1,0,0,0
5483854,440000.0,209900.0,366.67,140.45,3.0,1.866667,3.000000,2.095238,1.0,105.0,0.999588,0.080108,0.666667,0.666667,2024,6,2024-06-01,2,2,-230100.0,-226.22,2.096236,1.699387,-1.096236,0.522955,0.622222,1.000000,1.431818,0.026703,0.222222,0.333333,0,0.698413,0,0,0,0,1,152280.647413,-0.914175,35.000000,1.122449,696000.000000,616633.333333,0.0,1.000000,0.890909,-inf,0,0,0,0,0,0,0,1,0


In [48]:
# let's drop these rows with -inf or inf
df_copy = df_copy.drop(df_copy.loc[elasticity_is_inf].index)

In [49]:
# finally round off number of homes sold since can't have decimal homes
df['homes_sold'] = round(df['homes_sold'])

In [50]:
df_copy.describe()

Unnamed: 0,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,median_dom,avg_sale_to_list,sold_above_list,price_drops,off_market_in_two_weeks,year,month,date,quarter,season,price_diff,ppsf_diff,sale_to_list_ratio,price_growth,owner_utility,buyer_utility,pending_sales_ratio,sales_success_rate,inventory_turnover,pct_sold_above_list,price_drop_rate,adjusted_months_supply,political_stance_encoded,supply_to_list_ratio,property_type_All Residential,property_type_Condo/Co-op,property_type_Multi-Family (2-4 Unit),property_type_Single Family Residential,property_type_Townhouse,state_avg_sale_price,metro_region_inventory_change,dom_per_home,inventory_to_pending_ratio,rolling_median_sale_price,rolling_median_list_price,price_momentum,supply_pressure,demand_pressure,price_elasticity,us_region_East North Central,us_region_East South Central,us_region_Mid-Atlantic,us_region_Mountain,us_region_New England,us_region_Pacific,us_region_South Atlantic,us_region_West North Central,us_region_West South Central
count,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0,5420909.0
mean,261694.8,287491.2,160.9777,175.7154,14.70798,14.00296,19.65868,47.14426,4.168222,99.13457,0.9737643,0.2326896,0.2654708,0.2782586,2018.096,6.511013,2018-07-22 00:11:29.362761472,2.499811,1.517453,25796.36,14.73775,0.9864332,1.985286,0.01356679,-2.131023,1.516825,0.9479957,0.4971668,0.08586697,0.07820949,2.074889,1.089814,3.321446,0.3869334,0.09729309,0.05612601,0.3794869,0.08016054,260511.5,5.361722,53.81016,5.842618,261526.9,287375.4,4.743363,3.361803,1.04891,2.121198,0.1668589,0.04836384,0.1836061,0.06177156,0.07216963,0.1363683,0.1843399,0.07247309,0.07404865
min,0.73,217.5,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.5,0.0,0.0001090869,0.0,2012.0,1.0,2012-01-01 00:00:00,1.0,0.0,-79792000.0,-11190510.0,3.271188e-06,-0.9999977,-2700.149,-305698.4,0.00408413,0.003076923,0.0008598452,0.0,6.348213e-08,0.0,0.0,0.009429197,0.0,0.0,0.0,0.0,0.0,130778.3,-0.999952,0.002396645,0.01097303,4175.0,15813.33,-0.9999991,0.01115242,0.001169591,-9990000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,106400.0,130117.2,67.7,78.13,1.0,2.0,3.0,5.0,1.7,29.0,0.954753,0.0,0.1818182,0.0,2015.0,4.0,2015-09-01 00:00:00,2.0,1.0,-19125.0,-7.35,0.6850685,-0.2180396,-0.1045929,-0.459708,0.4863823,0.4,0.1666667,0.0,0.01100113,0.2,1.0,1.306122,0.0,0.0,0.0,0.0,0.0,160228.9,-0.7088451,3.1875,1.284162,141840.0,166851.5,-0.5009737,0.9166667,0.1538462,-1.209686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,179322.0,207670.0,105.64,116.54,4.0,5.0,7.0,15.0,3.0,58.0,0.98,0.125,0.25,0.223784,2018.0,6.0,2018-09-01 00:00:00,2.0,2.0,14521.11,6.56,0.921446,0.009705882,0.07855395,-0.08525074,0.7419355,0.7727273,0.3282443,0.004161712,0.03061224,0.75,1.0,2.333333,0.0,0.0,0.0,0.0,0.0,225078.7,0.0,12.6,3.0,209253.3,235967.0,0.002485501,1.294118,0.3333333,0.08049242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,306800.0,336200.0,171.38,183.13,11.0,12.62492,18.0,39.20478,5.0,108.0,1.0,0.375,0.3217209,0.4444444,2021.0,9.0,2021-07-01 00:00:00,3.0,2.0,67743.47,27.21,1.104593,0.3101398,0.3149315,0.09468912,1.0,1.090909,0.5789474,0.04081633,0.09375,2.157249,2.0,4.0,1.0,0.0,0.0,1.0,0.0,316538.4,2.486486,47.0,6.5,314538.3,341537.0,1.011299,2.5,0.7787182,1.680182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,82450000.0,750000000.0,11190690.0,36500000.0,4579.0,4112.0,7224.0,25592.0,1163.0,41391.0,2.0,1.0,1.0,2.0,2024.0,12.0,2024-06-01 00:00:00,4.0,3.0,749941400.0,36499930.0,2701.149,399999.0,0.9999967,0.9996298,91.13253,89.66667,81.0,1.0,1.0,1163.0,2.0,322.4649,1.0,1.0,1.0,1.0,1.0,613575.8,20809.0,36643.0,855.0,28345020.0,250271000.0,1784999.0,325.0,91.13253,43820640.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,369756.4,989121.5,5468.34,22385.76,56.23782,48.58872,69.70522,197.7235,4.885445,216.6503,0.05956057,0.2932251,0.1508976,0.2852136,3.485378,3.388624,,1.102567,1.100651,968251.9,23037.14,2.570697,525.0672,2.570697,515.0708,3.287625,1.009753,0.6925355,0.2245346,0.1188312,4.467428,0.7579126,4.287335,0.4870483,0.2963565,0.2301649,0.4852594,0.2715416,118275.2,47.43124,189.8635,9.813943,226244.1,573555.5,1427.957,6.149603,2.705063,19552.82,0.3728499,0.2145339,0.3871627,0.2407402,0.2587686,0.3431792,0.3877612,0.2592696,0.2618501


In [51]:
df_copy.sample(10)

Unnamed: 0,median_sale_price,median_list_price,median_ppsf,median_list_ppsf,homes_sold,pending_sales,new_listings,inventory,months_of_supply,median_dom,avg_sale_to_list,sold_above_list,price_drops,off_market_in_two_weeks,year,month,date,quarter,season,price_diff,ppsf_diff,sale_to_list_ratio,price_growth,owner_utility,buyer_utility,pending_sales_ratio,sales_success_rate,inventory_turnover,pct_sold_above_list,price_drop_rate,adjusted_months_supply,political_stance_encoded,supply_to_list_ratio,property_type_All Residential,property_type_Condo/Co-op,property_type_Multi-Family (2-4 Unit),property_type_Single Family Residential,property_type_Townhouse,state_avg_sale_price,metro_region_inventory_change,dom_per_home,inventory_to_pending_ratio,rolling_median_sale_price,rolling_median_list_price,price_momentum,supply_pressure,demand_pressure,price_elasticity,us_region_East North Central,us_region_East South Central,us_region_Mid-Atlantic,us_region_Mountain,us_region_New England,us_region_Pacific,us_region_South Atlantic,us_region_West North Central,us_region_West South Central
1196242,129525.0,155025.0,78.89,83.76,11.0,3.0,16.0,73.0,6.6,146.0,0.948417,0.181818,0.218731,0.0,2015,3,2015-03-01,1,1,25500.0,4.87,0.83551,-0.088654,0.16449,-0.196873,0.1875,0.6875,0.150685,0.016529,0.013671,0.6,2,4.5625,1,0,0,0,0,270850.334042,5.636364,13.272727,24.333333,96362.5,109075.0,0.074339,1.454545,0.041096,-5.380753,0,0,0,0,0,0,1,0,0
1842076,2432000.0,208499.57,536.04,114.62,1.0,20.98989,32.397088,11.0,11.0,202.0,0.805031,0.0,0.090909,0.258844,2016,9,2016-09-01,3,3,-2223500.43,-421.42,11.664293,-0.873016,-10.664293,0.914268,0.647894,0.030867,0.090909,0.0,0.002806,11.0,2,0.339537,0,0,0,1,0,270850.334042,-0.45,202.0,0.524062,974168.0,250982.783333,8.078014,32.397088,1.908172,0.43364,0,0,0,0,0,0,1,0,0
975939,120000.0,123675.0,67.25,74.26,19.0,6.0,18.0,82.0,4.3,113.0,0.971249,0.105263,0.046034,0.0,2014,9,2014-09-01,3,3,3675.0,7.01,0.970285,0.038961,0.029715,-0.030625,0.333333,1.055556,0.231707,0.00554,0.002557,0.226316,0,4.555556,1,0,0,0,0,161783.637946,12.666667,5.947368,13.666667,211000.0,187225.0,1.5,0.947368,0.073171,3.333333,0,1,0,0,0,0,0,0,0
30715,405150.0,583927.0,222.73,355.06,1.0,2.180929,1.0,11.0,11.0,158.0,0.928094,0.0,0.217871,0.160358,2012,2,2012-02-01,1,0,178777.0,132.33,0.693837,-0.339286,0.306163,-0.441261,2.180929,1.0,0.090909,0.0,0.217871,11.0,1,11.0,0,0,0,0,1,330640.499192,-0.3125,158.0,5.043722,471555.666667,468854.666667,2.967119,1.0,0.198266,0.030489,0,0,1,0,0,0,0,0,0
1034432,279375.0,269812.5,122.8,129.51,38.0,30.0,34.0,254.0,6.7,80.0,0.96745,0.078947,0.177165,0.0,2014,11,2014-11-01,4,3,-9562.5,6.71,1.035441,0.13871,-0.035441,0.034228,0.882353,1.117647,0.149606,0.002078,0.005211,0.176316,2,7.470588,0,0,0,1,0,356310.532386,-0.198738,2.105263,8.466667,180375.0,215537.5,1.55137,0.894737,0.11811,0.966887,0,0,0,1,0,0,0,0,0
2057617,55302.0,93522.0,52.79,65.7,2.0,5.039216,1.0,4.0,2.0,4.0,0.990818,0.0,0.245805,0.152818,2017,3,2017-03-01,1,1,38220.0,12.91,0.591326,-0.185057,0.408674,-0.691114,5.039216,2.0,0.5,0.0,0.245805,1.0,2,4.0,0,1,0,0,0,140394.243047,-0.882353,2.0,0.793774,88114.0,175878.3,-0.580473,0.5,1.259804,1.667787,1,0,0,0,0,0,0,0,0
3546408,422300.0,403645.0,130.18,137.78,1.0,1.0,1.0,6.0,6.0,28.0,1.03,1.0,0.333333,0.0,2020,5,2020-05-01,2,1,-18655.0,7.6,1.046216,3.224774,-0.046216,0.044175,1.0,1.0,0.166667,1.0,0.333333,6.0,1,6.0,0,0,0,0,1,207468.54421,-0.435299,28.0,6.0,358066.666667,388721.0,0.337662,1.0,0.166667,-2.591346,0,0,0,0,0,0,0,1,0
4915380,243470.0,302882.5,171.77,190.55,9.0,8.0,26.0,69.0,7.7,73.0,0.961224,0.222222,0.246377,0.25,2023,3,2023-03-01,1,1,59412.5,18.78,0.803843,0.110669,0.196157,-0.244024,0.307692,0.346154,0.130435,0.024691,0.009476,0.855556,0,2.653846,1,0,0,0,0,225078.737874,8.857143,8.111111,8.625,191736.666667,248417.276667,0.767606,2.888889,0.115942,0.0,0,0,0,0,0,0,0,0,1
3620690,35260.0,178675.95,16.21,98.7,1.0,7.597419,9.873379,20.839211,3.24415,97.0,0.861723,0.0,0.283947,0.309367,2020,7,2020-07-01,3,2,143415.95,82.49,0.19734,0.706349,0.80266,-4.067384,0.769485,0.101282,0.047986,0.0,0.028759,3.24415,2,2.110646,1,0,0,0,0,160228.8983,2.473202,97.0,2.742933,308593.333333,357463.283333,-0.888601,9.873379,0.364573,-1.438956,0,0,1,0,0,0,0,0,0
5184305,727500.0,360752.59,264.26,200.38,1.0,4.98427,6.815981,12.92043,2.434839,6.0,1.034483,1.0,0.224894,0.356683,2023,10,2023-10-01,4,3,-366747.41,-63.88,2.016618,0.363636,-1.016618,0.50412,0.731262,0.146714,0.077397,1.0,0.032995,2.434839,1,1.895608,0,0,1,0,0,359261.766343,5.460215,6.0,2.592241,360516.666667,291585.196667,4.769231,6.815981,0.385767,0.835411,0,0,0,0,1,0,0,0,0


In [53]:
df_copy.to_csv('checkpoint4.csv')