In [1]:
from pathlib import Path
import os

# Project root used as the working directory, so every relative "data/..." path
# below resolves against it. Set the CRIME_HOUSING_ROOT environment variable to
# run the notebook from another checkout; when it is unset, the original
# author's local path is used, so existing behaviour is unchanged.
PROJECT_ROOT = Path(
    os.environ.get(
        "CRIME_HOUSING_ROOT",
        r"C:\Users\joshu\OneDrive\Desktop\Projects and Analysis 2\crime-housing-affordability",
    )
)

# Fail with an actionable message instead of a bare OS error from chdir.
if not PROJECT_ROOT.is_dir():
    raise FileNotFoundError(
        f"Project root not found: {PROJECT_ROOT}. "
        "Set the CRIME_HOUSING_ROOT environment variable to the project directory."
    )

os.chdir(PROJECT_ROOT)


In [2]:
import pandas as pd
import numpy as np


In [3]:
# Read the two raw Zillow metro-level extracts (paths are relative to the
# current working directory): home values (ZHVI) first, then rents (ZORI).
zhvi, zori = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/raw/zillow/zillow_zhvi_sfr_midtier_metro.csv",
        "data/raw/zillow/zillow_zori_rent_metro.csv",
    )
)


In [4]:
# Report (rows, columns) for each raw table, ZHVI first and ZORI second.
for shape_label, raw_frame in (("ZHVI shape:", zhvi), ("ZORI shape:", zori)):
    print(shape_label, raw_frame.shape)


ZHVI shape: (895, 316)
ZORI shape: (556, 136)


In [5]:
# Display the ZHVI column labels to inspect how the raw table is laid out.
zhvi.columns


Index(['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName',
       '2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30', '2000-05-31',
       ...
       '2025-02-28', '2025-03-31', '2025-04-30', '2025-05-31', '2025-06-30',
       '2025-07-31', '2025-08-31', '2025-09-30', '2025-10-31', '2025-11-30'],
      dtype='object', length=316)

## Zillow ZHVI – Structure Notes

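- One row represents: One region — a metropolitan area (`RegionType` = `msa`); the file also contains a national United States row (`RegionType` = `country`)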
- Identifier columns (non-date): RegionID, RegionName, RegionType, StateName, SizeRank
- Date columns look like: YYYY-MM-DD (month-end)
- Data is currently: WIDE
- Eventual join key (not yet used): Metro (RegionID → CBSA later)


In [6]:
# Display the ZORI column labels to inspect how the raw table is laid out.
zori.columns


Index(['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName',
       '2015-01-31', '2015-02-28', '2015-03-31', '2015-04-30', '2015-05-31',
       ...
       '2025-02-28', '2025-03-31', '2025-04-30', '2025-05-31', '2025-06-30',
       '2025-07-31', '2025-08-31', '2025-09-30', '2025-10-31', '2025-11-30'],
      dtype='object', length=136)

## Zillow ZORI – Structure Notes

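- One row represents: One region — a metropolitan area (`RegionType` = `msa`); the file also contains a national United States row (`RegionType` = `country`)
- Identifier columns (non-date): RegionID, SizeRank, RegionName, RegionType, StateName
- Date columns look like: YYYY-MM-DD (month-end), from 2015-01-31 through 2025-11-30
- Data is currently: WIDE
- Structure compared to ZHVI: SAME identifier columns and wide monthly layout; ZORI has fewer rows (556 vs 895) and its date columns start in 2015 rather than 2000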


In [7]:
# ------------------------------------------------------------
# Step 1: Identifier (non-date) columns
# ------------------------------------------------------------

# These columns describe the region and are held fixed when the monthly
# date columns are reshaped from wide to long.
id_cols = ["RegionID", "RegionName", "RegionType", "StateName", "SizeRank"]

id_cols


['RegionID', 'RegionName', 'RegionType', 'StateName', 'SizeRank']

In [8]:
# ------------------------------------------------------------
# Step 2: Reshape ZHVI from wide to long
# ------------------------------------------------------------

# Every column not listed in id_cols is unpivoted: its label goes into
# "date" and its cell value into "home_value".
zhvi_long = pd.melt(
    zhvi,
    id_vars=id_cols,
    var_name="date",
    value_name="home_value",
)

print("ZHVI long shape:", zhvi_long.shape)
zhvi_long.head()


ZHVI long shape: (278345, 7)


Unnamed: 0,RegionID,RegionName,RegionType,StateName,SizeRank,date,home_value
0,102001,United States,country,,0,2000-01-31,121809.989633
1,394913,"New York, NY",msa,NY,1,2000-01-31,217228.516929
2,753899,"Los Angeles, CA",msa,CA,2,2000-01-31,229483.171834
3,394463,"Chicago, IL",msa,IL,3,2000-01-31,156920.770342
4,394514,"Dallas, TX",msa,TX,4,2000-01-31,128049.299682


In [9]:
# ------------------------------------------------------------
# Step 3: Reshape ZORI from wide to long
# ------------------------------------------------------------

# Every column not listed in id_cols is unpivoted: its label goes into
# "date" and its cell value into "rent".
zori_long = pd.melt(
    zori,
    id_vars=id_cols,
    var_name="date",
    value_name="rent",
)

print("ZORI long shape:", zori_long.shape)
zori_long.head()


ZORI long shape: (72836, 7)


Unnamed: 0,RegionID,RegionName,RegionType,StateName,SizeRank,date,rent
0,102001,United States,country,,0,2015-01-31,1230.692355
1,394913,"New York, NY",msa,NY,1,2015-01-31,2040.030108
2,753899,"Los Angeles, CA",msa,CA,2,2015-01-31,2451.728156
3,394463,"Chicago, IL",msa,IL,3,2015-01-31,1425.440274
4,394514,"Dallas, TX",msa,TX,4,2015-01-31,1365.612616


In [10]:
# ==============================
# Step 4: Convert date column to datetime
# ==============================

# The "date" values are the former column labels (e.g. "2000-01-31").
# Parsing with an explicit format avoids relying on format inference and makes
# any label that does not match YYYY-MM-DD raise instead of being guessed at.
# Re-running this cell is safe: an already-datetime column is returned as-is.
ZILLOW_DATE_FORMAT = "%Y-%m-%d"

zhvi_long["date"] = pd.to_datetime(zhvi_long["date"], format=ZILLOW_DATE_FORMAT)
zori_long["date"] = pd.to_datetime(zori_long["date"], format=ZILLOW_DATE_FORMAT)

# Confirm the resulting dtypes (only the ZHVI frame is displayed).
zhvi_long.dtypes


RegionID               int64
RegionName            object
RegionType            object
StateName             object
SizeRank               int64
date          datetime64[ns]
home_value           float64
dtype: object

In [12]:
# Latest date present in the long-format ZORI table.
latest_date_zori = zori_long.loc[:, "date"].max()
latest_date_zori


Timestamp('2025-11-30 00:00:00')

In [11]:
# Latest date present in the long-format ZHVI table.
latest_date_zhvi = zhvi_long.loc[:, "date"].max()
latest_date_zhvi


Timestamp('2025-11-30 00:00:00')

In [13]:
# Keep only the rows at each table's latest date; .copy() detaches the
# snapshots from the long frames so later edits do not touch the originals.
zhvi_latest = zhvi_long.loc[zhvi_long["date"].eq(latest_date_zhvi)].copy()
zori_latest = zori_long.loc[zori_long["date"].eq(latest_date_zori)].copy()

print("ZHVI latest shape:", zhvi_latest.shape)
print("ZORI latest shape:", zori_latest.shape)


ZHVI latest shape: (895, 7)
ZORI latest shape: (556, 7)
