### Preprocessing and Cleaning for hud_yearly.csv

#### Import Libraries

In [1]:
import warnings
import numpy as np
import pandas as pd


# Silence all warnings for the rest of the notebook session.
warnings.filterwarnings(action="ignore")

In [2]:
# Load the yearly HUD rent table from the project's data directory.
hud_df = pd.read_csv(filepath_or_buffer='../data/hud/hud_yearly.csv')

In [3]:
# Show the freshly loaded frame (bare last expression renders as the cell output).
hud_df

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Change,YoY,2Y Change,Yo2Y,3Y Change,Yo3Y,4Y Change,Yo4Y
0,Phoenix,AZ,studio,2019,847,103,13.84,,,,,,
1,Los Angeles,CA,studio,2019,1279,121,10.45,,,,,,
2,Sacramento,CA,studio,2019,952,99,11.61,,,,,,
3,San Francisco,CA,studio,2019,2197,375,20.58,,,,,,
4,Chicago,IL,studio,2019,956,41,4.48,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Boston,MA,4,2022,3540,287,8.82,372.0,11.74,409.0,13.06,969.0,37.69
196,New York,NY,4,2022,3316,143,4.51,532.0,19.11,673.0,25.46,841.0,33.98
197,Nashville,TN,4,2022,2173,198,10.03,243.0,12.59,351.0,19.26,435.0,25.03
198,Austin,TX,4,2022,2416,222,10.12,209.0,9.47,288.0,13.53,317.0,15.10


In [4]:
# Spot-check five rows; the fixed seed keeps the selection repeatable.
hud_df.sample(n=5, random_state=24)

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Change,YoY,2Y Change,Yo2Y,3Y Change,Yo3Y,4Y Change,Yo4Y
197,Nashville,TN,4,2022,2173,198,10.03,243.0,12.59,351.0,19.26,435.0,25.03
109,Seattle,WA,2,2021,2044,138,7.24,-55.0,-2.62,145.0,7.64,,
101,Los Angeles,CA,2,2021,2044,-14,-0.68,88.0,4.5,253.0,14.13,,
125,Boston,MA,3,2019,2880,497,20.86,,,,,,
48,Austin,TX,1,2019,1134,48,4.42,,,,,,


###### Reshape data from wide to long format

In [5]:
# Shorten the absolute-change column names to '1Y'..'4Y' so they can serve
# as category labels once the table is melted.
hud_df.rename(
    columns={
        'Change': '1Y',
        **{f'{span}Y Change': f'{span}Y' for span in (2, 3, 4)},
    },
    inplace=True,
)

In [6]:
# reshape data
# Two stacked melts: the first folds the absolute-change columns into
# ('Yearly Difference', 'Price Change'); the second folds the percentage
# columns into ('Growth', 'Percent Change'). Because the two melts are
# independent, every source row is expanded into each change/percent
# combination.
hud_df = (
    hud_df
    .melt(
        id_vars=['City', 'State', 'Bedrooms', 'Year', 'Avg Rent',
                 'YoY', 'Yo2Y', 'Yo3Y', 'Yo4Y'],
        value_vars=['1Y', '2Y', '3Y', '4Y'],
        var_name='Yearly Difference',
        value_name='Price Change',
    )
    .melt(
        id_vars=['City', 'State', 'Bedrooms', 'Year', 'Avg Rent',
                 'Yearly Difference', 'Price Change'],
        value_vars=['YoY', 'Yo2Y', 'Yo3Y', 'Yo4Y'],
        var_name='Growth',
        value_name='Percent Change',
    )
)

In [7]:
# Spot-check five rows of the long-format table (fixed seed for repeatability).
hud_df.sample(n=5, random_state=24)

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Yearly Difference,Price Change,Growth,Percent Change
1988,Austin,TX,4,2021,2194,2Y,66.0,Yo3Y,4.53
1623,San Francisco,CA,studio,2021,2115,1Y,-235.0,Yo3Y,16.08
2125,Boston,MA,3,2019,2880,3Y,,Yo3Y,
3121,Los Angeles,CA,3,2019,2614,4Y,,Yo4Y,
2661,Los Angeles,CA,1,2021,1604,2Y,87.0,Yo4Y,


###### Drop missing and duplicate values
1. Drop missing values
2. Identify duplicate values and remove them from dataset

In [8]:
# Drop missing and duplicate values
# Remove every row that has a NaN in any column, then keep only the first
# occurrence of each fully identical row. Both steps modify hud_df in place.
hud_df.dropna(axis=0, how='any', inplace=True)
hud_df.drop_duplicates(keep='first', inplace=True)

Create a new column **is_duplicate** to flag duplicate values in the **Yearly Difference** and **Growth** columns. 
Duplicate values are rows where **Yearly Difference** and **Growth** do not refer to the same period, e.g. **4Y** paired with **Yo2Y**; they are a by-product of melting the change and percent columns separately.

In [9]:
# Create "is_duplicate" column flagging duplicate values

# Absolute-change label -> percentage label describing the same period.
matching_growth = {'1Y': 'YoY', '2Y': 'Yo2Y', '3Y': 'Yo3Y', '4Y': 'Yo4Y'}

# One mask per change label: true where the row's Growth label belongs to a
# different period than its Yearly Difference label.
conditions = [
    (hud_df['Yearly Difference'] == change_label) & (hud_df['Growth'] != growth_label)
    for change_label, growth_label in matching_growth.items()
]

# Every mask maps to the same flag; rows matching no mask fall through to "No".
values = ['Yes'] * len(conditions)

hud_df['is_duplicate'] = np.select(conditions, values, default="No")

In [10]:
# Spot-check five rows together with their is_duplicate flag (fixed seed).
hud_df.sample(n=5, random_state=24)

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Yearly Difference,Price Change,Growth,Percent Change,is_duplicate
891,Los Angeles,CA,2,2020,2058,1Y,102.0,Yo2Y,14.91,Yes
1435,Boston,MA,studio,2022,2025,4Y,631.0,Yo2Y,16.25,Yes
420,Phoenix,AZ,studio,2021,1005,3Y,261.0,YoY,7.72,Yes
2871,Los Angeles,CA,1,2022,1747,3Y,230.0,Yo4Y,26.23,Yes
2957,Nashville,TN,3,2022,1758,3Y,274.0,Yo4Y,20.82,Yes


In [11]:
# Remove duplicate values from dataset
# Boolean indexing returns a frame that pandas still tracks as derived from
# the original. Take an explicit copy so the in-place edits and column
# assignments in later cells operate on an independent DataFrame instead of
# relying on chained-assignment behaviour (whose warning the global
# "ignore" filter would hide).
hud_df = hud_df[hud_df['is_duplicate'] == 'No'].copy()

In [12]:
# Show the frame after keeping only rows flagged is_duplicate == 'No'.
hud_df

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Yearly Difference,Price Change,Growth,Percent Change,is_duplicate
0,Phoenix,AZ,studio,2019,847,1Y,103.0,YoY,13.84,No
1,Los Angeles,CA,studio,2019,1279,1Y,121.0,YoY,10.45,No
2,Sacramento,CA,studio,2019,952,1Y,99.0,YoY,11.61,No
3,San Francisco,CA,studio,2019,2197,1Y,375.0,YoY,20.58,No
4,Chicago,IL,studio,2019,956,1Y,41.0,YoY,4.48,No
...,...,...,...,...,...,...,...,...,...,...
3195,Boston,MA,4,2022,3540,4Y,969.0,Yo4Y,37.69,No
3196,New York,NY,4,2022,3316,4Y,841.0,Yo4Y,33.98,No
3197,Nashville,TN,4,2022,2173,4Y,435.0,Yo4Y,25.03,No
3198,Austin,TX,4,2022,2416,4Y,317.0,Yo4Y,15.10,No


In [13]:
# Renumber rows from 0 after filtering; the old index is discarded, not kept as a column.
hud_df.reset_index(inplace=True, drop=True)

In [14]:
# Spot-check five rows after the index reset (fixed seed).
hud_df.sample(n=5, random_state=24)

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Yearly Difference,Price Change,Growth,Percent Change,is_duplicate
421,Los Angeles,CA,3,2022,2888,3Y,274.0,Yo3Y,10.48,No
105,Boston,MA,2,2021,2399,1Y,63.0,YoY,2.7,No
222,Sacramento,CA,studio,2022,1277,2Y,217.0,Yo2Y,20.47,No
119,Seattle,WA,2,2022,2199,1Y,155.0,YoY,7.58,No
347,Nashville,TN,4,2022,2173,2Y,243.0,Yo2Y,12.59,No


###### Preprocess dataset by:
1. Creating **Time Range** column
2. Converting numbers in **Price Change** from float to int
3. Removing columns that are unnecessary for analysis

In [15]:
# Identify time ranges and create "Time Range" column
#
# After the mismatched pairs were removed, each row's 'Yearly Difference'
# label ('1Y'..'4Y') and 'Growth' label describe the same period, so the
# comparison window is fully determined by 'Year' and the number of years in
# the label: it ends at 'Year' and starts that many years earlier
# (e.g. Year 2021 with '3Y' -> '2018 to 2021').
# Deriving the label arithmetically replaces the hand-maintained lookup that
# only covered 2019-2022 and labelled any other year "Unknown".

# Number of years spanned, taken from the digit(s) in front of the 'Y'.
span_years = hud_df['Yearly Difference'].str.rstrip('Y').astype(int)

end_year = hud_df['Year'].astype(int)
start_year = end_year - span_years

hud_df['Time Range'] = start_year.astype(str) + ' to ' + end_year.astype(str)

In [16]:
# Spot-check five rows with the new Time Range column (fixed seed).
hud_df.sample(n=5, random_state=24)

Unnamed: 0,City,State,Bedrooms,Year,Avg Rent,Yearly Difference,Price Change,Growth,Percent Change,is_duplicate,Time Range
421,Los Angeles,CA,3,2022,2888,3Y,274.0,Yo3Y,10.48,No,2019 to 2022
105,Boston,MA,2,2021,2399,1Y,63.0,YoY,2.7,No,2020 to 2021
222,Sacramento,CA,studio,2022,1277,2Y,217.0,Yo2Y,20.47,No,2020 to 2022
119,Seattle,WA,2,2022,2199,1Y,155.0,YoY,7.58,No,2021 to 2022
347,Nashville,TN,4,2022,2173,2Y,243.0,Yo2Y,12.59,No,2020 to 2022


In [17]:
# Convert numbers from float to int
# The missing values were dropped earlier, so the float column can be cast
# to whole numbers.
hud_df['Price Change'] = hud_df['Price Change'].astype(int)

In [18]:
# Remove unnecessary columns

# Columns kept for analysis, in output order; the helper columns
# 'Yearly Difference' and 'is_duplicate' are left out.
final_columns = [
    'City', 'State', 'Bedrooms', 'Year', 'Time Range',
    'Growth', 'Avg Rent', 'Price Change', 'Percent Change',
]
hud_df = hud_df[final_columns]

In [19]:
# Show the final cleaned table that is about to be exported.
hud_df

Unnamed: 0,City,State,Bedrooms,Year,Time Range,Growth,Avg Rent,Price Change,Percent Change
0,Phoenix,AZ,studio,2019,2018 to 2019,YoY,847,103,13.84
1,Los Angeles,CA,studio,2019,2018 to 2019,YoY,1279,121,10.45
2,Sacramento,CA,studio,2019,2018 to 2019,YoY,952,99,11.61
3,San Francisco,CA,studio,2019,2018 to 2019,YoY,2197,375,20.58
4,Chicago,IL,studio,2019,2018 to 2019,YoY,956,41,4.48
...,...,...,...,...,...,...,...,...,...
495,Boston,MA,4,2022,2018 to 2022,Yo4Y,3540,969,37.69
496,New York,NY,4,2022,2018 to 2022,Yo4Y,3316,841,33.98
497,Nashville,TN,4,2022,2018 to 2022,Yo4Y,2173,435,25.03
498,Austin,TX,4,2022,2018 to 2022,Yo4Y,2416,317,15.10


In [None]:
# Write the cleaned table without the index column.
# The path is relative, matching the '../data/hud/' location the input was
# read from; the previous '/data/hud/...' form pointed at the filesystem root.
# NOTE(review): assumed to be the intended output location — confirm.
hud_df.to_csv('../data/hud/hud_final.csv', index=False)