In [4]:
# Imports

import pandas as pd
import numpy as np
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.model_selection import train_test_split

from wrangle import acquire_zillow, get_zillow_data, wrangle_zillow, split_train_val_test

from explore import plot_variable_pairs, plot_categorical_and_continuous_vars

import os

In [9]:
df = get_zillow_data()
df = wrangle_zillow(df)
df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,taxamount,fips
4,4,2.0,3633,296425,2005,6941.39,6037
6,3,4.0,1620,847770,2011,10244.94,6037
7,3,2.0,2077,646760,1926,7924.68,6037
11,0,0.0,1200,5328,1972,91.6,6037
14,0,0.0,171,6920,1973,255.17,6037


In [11]:
# Assuming df is your original DataFrame
df['total_rooms'] = df['bedrooms'] + df['bathrooms']

current_year = datetime.datetime.now().year
df['property_age'] = current_year - df['year_built']

df['price_per_sqft'] = df['tax_value'] / df['area']

df['tax_rate'] = df['taxamount'] / df['tax_value']

df = pd.get_dummies(df, columns=['fips'], prefix='county')

df['size_per_bedroom'] = df['area'] / df['bedrooms']

df['bathroom_to_bedroom_ratio'] = df['bathrooms'] / df['bedrooms']

df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,taxamount,total_rooms,property_age,price_per_sqft,tax_rate,county_6037,county_6059,county_6111,size_per_bedroom,bathroom_to_bedroom_ratio
4,4,2.0,3633,296425,2005,6941.39,6.0,18,81.592348,0.023417,True,False,False,908.25,0.5
6,3,4.0,1620,847770,2011,10244.94,7.0,12,523.314815,0.012085,True,False,False,540.0,1.333333
7,3,2.0,2077,646760,1926,7924.68,5.0,97,311.39143,0.012253,True,False,False,692.333333,0.666667
11,0,0.0,1200,5328,1972,91.6,0.0,51,4.44,0.017192,True,False,False,inf,
14,0,0.0,171,6920,1973,255.17,0.0,50,40.467836,0.036874,True,False,False,inf,


In [12]:
property_age_mean = df['property_age'].mean()
property_age_std = df['property_age'].std()

# Add corrected age range information for each category
age_ranges = {
    'very_new': f'<= {int(property_age_mean - 2 * property_age_std)} years',
    'new': f'{int(property_age_mean - 2 * property_age_std) + 1} - {int(property_age_mean - property_age_std)} years',
    'mid-aged': f'{int(property_age_mean - property_age_std) + 1} - {int(property_age_mean + property_age_std)} years',
    'old': f'{int(property_age_mean + property_age_std) + 1} - {int(property_age_mean + 2 * property_age_std)} years',
    'very_old': f'>{int(property_age_mean + 2 * property_age_std)} years'
}
age_ranges

{'very_new': '<= 17 years',
 'new': '18 - 39 years',
 'mid-aged': '40 - 84 years',
 'old': '85 - 106 years',
 'very_old': '>106 years'}

In [13]:
bins = [-float('inf'), property_age_mean - 2 * property_age_std, property_age_mean - property_age_std, property_age_mean + property_age_std, property_age_mean + 2 * property_age_std, float('inf')]

labels = ['very_new', 'new', 'mid-aged', 'old', 'very_old']

df['property_age_group'] = pd.cut(df['property_age'], bins=bins, labels=labels)

df.head()

Unnamed: 0,bedrooms,bathrooms,area,tax_value,year_built,taxamount,total_rooms,property_age,price_per_sqft,tax_rate,county_6037,county_6059,county_6111,size_per_bedroom,bathroom_to_bedroom_ratio,property_age_group
4,4,2.0,3633,296425,2005,6941.39,6.0,18,81.592348,0.023417,True,False,False,908.25,0.5,new
6,3,4.0,1620,847770,2011,10244.94,7.0,12,523.314815,0.012085,True,False,False,540.0,1.333333,very_new
7,3,2.0,2077,646760,1926,7924.68,5.0,97,311.39143,0.012253,True,False,False,692.333333,0.666667,old
11,0,0.0,1200,5328,1972,91.6,0.0,51,4.44,0.017192,True,False,False,inf,,mid-aged
14,0,0.0,171,6920,1973,255.17,0.0,50,40.467836,0.036874,True,False,False,inf,,mid-aged
