In [148]:
import os
from datetime import datetime

import joblib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
import pandas as pd
import pgeocode
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [149]:
# Load both raw inputs in one go:
#   area_df  <- HouseTS.csv
#   house_df <- realtor-data.csv
area_df, house_df = map(
    pd.read_csv,
    ('../raw_data/HouseTS.csv', '../raw_data/realtor-data.csv'),
)

In [150]:
# Distinct zip codes that appear in area_df, in order of first appearance
unique_zipcodes_area_df = area_df['zipcode'].drop_duplicates().tolist()

# Keep only the listings whose zip_code is in unique_zipcodes_area_df
in_covered_zip = house_df['zip_code'].isin(unique_zipcodes_area_df)
house_df = house_df[in_covered_zip]

In [151]:
# Number of distinct zip codes found in area_df
len(unique_zipcodes_area_df)

6226

In [152]:
# # Create LTM_area_df
# LTM_area_df = area_df[area_df['date'] == '2023-12-31'].copy()
# LTM_area_df = LTM_area_df[['zipcode', 'Per Capita Income', 'Median Rent', 'Median Home Value', 'Median Age', 'park', 'school']]

# # Rename zipcode to zip_code in LTM_area_df
# LTM_area_df = LTM_area_df.rename(columns={'zipcode': 'zip_code', 'Per Capita Income': 'p_c_income', 'Median Rent': 'median_rent', 'Median Home Value': 'median_home_value', 'Median Age': 'median_age'})

# # Merge
# merged_df = house_df.merge(LTM_area_df, on='zip_code', how='left')

In [None]:
def clean_data(df):
    """
    Clean raw listings and attach a per-zip price-per-square-foot feature.

    Steps, in order:
      1. Drop 'brokered_by' and 'status', remove duplicate rows, then drop
         'street', 'city', 'state' and 'prev_sold_date'.
      2. Drop rows with a missing 'price' or a missing 'zip_code'.
      3. Drop rows where 'bed', 'bath' and 'house_size' are all missing
         (assumed to be land sales).
      4. Impute the remaining gaps: column median for 'bed', 'bath' and
         'house_size'; 0 for 'acre_lot'.
      5. Add 'ppsf_zipcode': the median of price / house_size within each
         zip code.
      6. Cast 'zip_code' to int and trim outliers with strict
         (exclusive) bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings containing the columns named above. The input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows with the dropped columns removed and 'ppsf_zipcode'
        appended. The index comes from the internal merge, so it does not
        correspond to the input index.
    """
    # Drop columns 'brokered_by', 'status' before de-duplicating, so rows that
    # differ only in those two columns collapse into one
    df = df.drop(columns=['brokered_by', 'status'])
    df = df.drop_duplicates()

    # Drop columns 'street', 'city', 'state' and 'prev_sold_date'
    df = df.drop(columns=['street', 'city', 'state', 'prev_sold_date'])

    # 'price' is the target and 'zip_code' is the grouping key: a row missing
    # either cannot get a ppsf_zipcode, and a missing zip_code would make the
    # int cast below raise
    df = df.dropna(subset=['price', 'zip_code'])

    # Rows with no bed, bath and house_size are assumed to be land sales.
    # A positional boolean mask is used (not index labels) so that valid rows
    # sharing an index label with a land sale are kept.
    is_land_sale = df['bed'].isna() & df['bath'].isna() & df['house_size'].isna()
    df = df[~is_land_sale].copy()

    # Impute missing data
    df['bed'] = df['bed'].fillna(df['bed'].median())
    df['bath'] = df['bath'].fillna(df['bath'].median())
    df['house_size'] = df['house_size'].fillna(df['house_size'].median())
    # NOTE(review): missing acre_lot becomes 0, but the strict lower bound
    # applied below removes every row at or under the 1st percentile. If that
    # percentile is 0, all imputed rows are discarded - confirm this is intended.
    df['acre_lot'] = df['acre_lot'].fillna(0)

    # Price per square foot for each row (temporary helper column)
    df['ppsf'] = df['price'] / df['house_size']

    # Median PPSF per zip_code, merged back onto every row of that zip_code
    ppsf_median = df.groupby('zip_code')['ppsf'].median().reset_index(name='ppsf_zipcode')
    df = df.merge(ppsf_median, on='zip_code', how='left')
    df = df.drop(columns=['ppsf'])

    # Convert zipcode to int
    df['zip_code'] = df['zip_code'].astype(int)

    # Calculate boundaries for 'price', 'acre_lot', 'house_size', 'ppsf_zipcode'
    lower_price = df['price'].quantile(0.03)
    upper_price = df['price'].quantile(0.97)
    upper_house_size = df['house_size'].quantile(0.99)
    lower_acre_lot = df['acre_lot'].quantile(0.01)
    upper_acre_lot = df['acre_lot'].quantile(0.99)
    lower_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.03)
    upper_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.97)

    # Apply boundaries to df (all comparisons are strict)
    within_bounds = (
        (df['price'] > lower_price) &
        (df['price'] < upper_price) &
        (df['bed'] < 14) &
        (df['bath'] < 12) &
        (df['house_size'] < upper_house_size) &
        (df['acre_lot'] > lower_acre_lot) &
        (df['acre_lot'] < upper_acre_lot) &
        (df['ppsf_zipcode'] > lower_ppsf_zipcode) &
        (df['ppsf_zipcode'] < upper_ppsf_zipcode)
    )

    return df[within_bounds]

In [154]:
# Run clean_data on the zip-filtered listings; house_df itself is left as is
cleaned_house_df = clean_data(house_df)


In [155]:
# Show which columns remain after cleaning
print(cleaned_house_df.columns.tolist())


['price', 'bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'ppsf_zipcode']


In [156]:
# Lookup of zip code -> ppsf_zipcode built from the cleaned listings.
# Named zip_ppsf_lookup rather than `dict` so the builtin dict type is not
# shadowed for the rest of the session.
# NOTE: create_zip_dict is defined in a later cell of this notebook; that cell
# must be run before this one on a fresh kernel.
zip_ppsf_lookup = create_zip_dict(cleaned_house_df)
zip_ppsf_lookup

{8016: 178.7231365218356,
 1431: 191.02209944751382,
 1432: 271.95027195027194,
 1749: 274.6042670337233,
 1776: 339.816991028691,
 1450: 252.58926758520667,
 1464: 226.7163830818121,
 1854: 254.4921119731245,
 1863: 280.910971293658,
 1886: 304.8907388137357,
 1752: 252.442996742671,
 1852: 262.77602523659306,
 1824: 293.3130699088146,
 1826: 269.00224215246635,
 1775: 296.92737430167597,
 1460: 291.8679549114332,
 1463: 223.3203716448727,
 1851: 259.1188968301776,
 1720: 302.8503562945368,
 1879: 237.66233766233765,
 1850: 226.34936773752565,
 1754: 263.498501420731,
 1827: 243.80576853045176,
 1742: 444.0348525469169,
 1741: 331.9831141370095,
 1469: 208.33333333333334,
 1719: 331.0477988568685,
 1474: 203.89369592089,
 1862: 302.7252081756245,
 1821: 312.9337107377648,
 2493: 513.8778470133218,
 2482: 602.0253164556962,
 1803: 375.7657716948165,
 3053: 270.13138594540334,
 1778: 381.12449799196787,
 1721: 288.9261744966443,
 1701: 324.64001782531193,
 1702: 301.47058823529414,
 174

In [159]:
# Hold-out fraction and seed used by both the split and the regressor
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Work on a copy to avoid SettingWithCopyWarning
df = cleaned_house_df.copy()
# zip_code is not used as a feature; ppsf_zipcode stays in the frame
df = df.drop(columns=['zip_code'])

# Define features and target
target = 'price'
if target not in df.columns:
    raise ValueError(f"'{target}' column not found. Available columns: {df.columns.tolist()}")
features = [col for col in df.columns if col != target]  # Exclude price

# Verify columns
print("\nFeatures:", features)

# Create X and y
X = df[features]
y = df[target]

# Preprocess with ColumnTransformer (every feature is scaled)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), features)
    ]
)

# Split data
# NOTE(review): ppsf_zipcode is derived from price over ALL rows in clean_data,
# i.e. before this split, so test-set prices influence a training feature.
# The metrics below are likely optimistic - consider computing the per-zip
# median on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Create pipeline (Pipeline is imported in the imports cell at the top)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE, n_jobs=-1))
])

# Train model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)

# Calculate metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print("\nModel Results:")
print(f"XGBoost RMSE: ${rmse:,.2f}")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")

# Feature importance; the single transformer keeps the columns in `features` order
feature_names = features
print("\nFeature Importance:")
for name, importance in zip(feature_names, pipeline.named_steps['regressor'].feature_importances_):
    print(f"{name}: {importance:.4f}")

# Sample of actual vs. predicted prices
results_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred,
    'Difference': y_test - y_pred
})
print("\nSample of Actual vs. Predicted Prices:")
print(results_df.head(10))



Features: ['bed', 'bath', 'acre_lot', 'house_size', 'ppsf_zipcode']

Model Results:
XGBoost RMSE: $163,254.46
Mean Absolute Error (MAE): $98,556.45
R² Score: 0.8156

Feature Importance:
bed: 0.0160
bath: 0.0968
acre_lot: 0.0202
house_size: 0.3750
ppsf_zipcode: 0.4920

Sample of Actual vs. Predicted Prices:
        Actual Price  Predicted Price    Difference
524430      375000.0     471724.31250  -96724.31250
711473      342000.0     402451.31250  -60451.31250
730429      510000.0     460737.18750   49262.81250
149804     1675000.0     766226.25000  908773.75000
74707       475000.0     508209.68750  -33209.68750
625130      289500.0     373066.68750  -83566.68750
79687       699999.0     990039.37500 -290040.37500
532139      770000.0     718331.18750   51668.81250
391375      549000.0     500012.56250   48987.43750
271014      190000.0     185125.15625    4874.84375


In [160]:
# Define model save path.
# Set the MODEL_DIR environment variable to override it; otherwise the path is
# resolved relative to the notebook's working directory instead of being tied
# to one user's machine.
# NOTE(review): the default assumes the notebook runs from a folder that sits
# next to `zillow/`, the same layout the '../raw_data' reads rely on - confirm.
model_dir = os.path.abspath(
    os.environ.get("MODEL_DIR", os.path.join("..", "zillow", "model", "models"))
)
os.makedirs(model_dir, exist_ok=True)

# Timestamp for versioning
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model_path = os.path.join(model_dir, f"{timestamp}_xgboost_pipeline.joblib")

# Save pipeline (preprocessor + regressor together)
joblib.dump(pipeline, model_path)

print(f"\n✅ Model saved to: {model_path}")


✅ Model saved to: /Users/carlokrups/code/JensKlug/zillows_real_estate/zillow/model/models/20250612-131146_xgboost_pipeline.joblib


In [145]:
def load_data(raw_data_dir=None):
    """
    Read the raw CSVs, keep listings in known zip codes and return them cleaned.

    Parameters
    ----------
    raw_data_dir : str or None, optional
        Directory containing 'HouseTS.csv' and 'realtor-data.csv'. When None,
        the directory is resolved two levels above this file's folder; if
        `__file__` is not defined (as in a notebook), '../raw_data' relative
        to the current working directory is used instead.

    Returns
    -------
    pandas.DataFrame
        The result of clean_data() applied to the listings whose 'zip_code'
        appears in the 'zipcode' column of HouseTS.csv.
    """
    if raw_data_dir is None:
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))  # .../zillow/ml_logic
            raw_data_dir = os.path.abspath(os.path.join(base_dir, '..', '..', 'raw_data'))
        except NameError:
            # No __file__ in an interactive session: fall back to the same
            # '../raw_data' location the notebook reads the CSVs from directly
            raw_data_dir = os.path.abspath(os.path.join(os.getcwd(), '..', 'raw_data'))

    area_path = os.path.join(raw_data_dir, 'HouseTS.csv')
    house_path = os.path.join(raw_data_dir, 'realtor-data.csv')

    area_df = pd.read_csv(area_path)
    house_df = pd.read_csv(house_path)

    # Keep only listings located in a zip code that area_df knows about
    unique_zipcodes_area_df = area_df['zipcode'].unique().tolist()
    house_df = house_df[house_df['zip_code'].isin(unique_zipcodes_area_df)]

    return clean_data(house_df)



def clean_data(df):
    """
    Clean raw listings and attach a per-zip price-per-square-foot feature.

    This definition is kept consistent with the clean_data cell earlier in the
    notebook: 'zip_code' is returned as int and the per-row price per square
    foot is not rounded before the per-zip median is taken.

    Steps, in order:
      1. Drop 'brokered_by' and 'status', remove duplicate rows, then drop
         'street', 'city', 'state' and 'prev_sold_date'.
      2. Drop rows with a missing 'price' or a missing 'zip_code'.
      3. Drop rows where 'bed', 'bath' and 'house_size' are all missing
         (assumed to be land sales).
      4. Impute the remaining gaps: column median for 'bed', 'bath' and
         'house_size'; 0 for 'acre_lot'.
      5. Add 'ppsf_zipcode': the median of price / house_size within each
         zip code.
      6. Cast 'zip_code' to int and trim outliers with strict
         (exclusive) bounds.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings containing the columns named above. The input frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows with the dropped columns removed and 'ppsf_zipcode'
        appended. The index comes from the internal merge, so it does not
        correspond to the input index.
    """
    # 'brokered_by' and 'status' go first so that rows differing only in those
    # two columns are treated as duplicates
    df = df.drop(columns=['brokered_by', 'status'])
    df = df.drop_duplicates()

    # Drop columns 'street', 'city', 'state' and 'prev_sold_date'
    df = df.drop(columns=['street', 'city', 'state', 'prev_sold_date'])

    # A row without a price or a zip_code cannot get a ppsf_zipcode, and a
    # missing zip_code would make the int cast below raise
    df = df.dropna(subset=['price', 'zip_code'])

    # Rows with no bed, bath and house_size are assumed to be land sales.
    # The mask is positional, so valid rows that happen to share an index label
    # with a land sale are not removed along with it.
    is_land_sale = df['bed'].isna() & df['bath'].isna() & df['house_size'].isna()
    df = df[~is_land_sale].copy()

    # Impute missing data
    df['bed'] = df['bed'].fillna(df['bed'].median())
    df['bath'] = df['bath'].fillna(df['bath'].median())
    df['house_size'] = df['house_size'].fillna(df['house_size'].median())
    # NOTE(review): missing acre_lot becomes 0, but the strict lower bound
    # applied below removes every row at or under the 1st percentile. If that
    # percentile is 0, all imputed rows are discarded - confirm this is intended.
    df['acre_lot'] = df['acre_lot'].fillna(0)

    # Price per square foot for each row (temporary helper column, unrounded)
    df['ppsf'] = df['price'] / df['house_size']

    # Median PPSF per zip_code, merged back onto every row of that zip_code
    ppsf_median = df.groupby('zip_code')['ppsf'].median().reset_index(name='ppsf_zipcode')
    df = df.merge(ppsf_median, on='zip_code', how='left')
    df = df.drop(columns=['ppsf'])

    # Integer zip codes, so lookups built from this frame use int keys
    df['zip_code'] = df['zip_code'].astype(int)

    # Calculate boundaries for 'price', 'acre_lot', 'house_size', 'ppsf_zipcode'
    lower_price = df['price'].quantile(0.03)
    upper_price = df['price'].quantile(0.97)
    upper_house_size = df['house_size'].quantile(0.99)
    lower_acre_lot = df['acre_lot'].quantile(0.01)
    upper_acre_lot = df['acre_lot'].quantile(0.99)
    lower_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.03)
    upper_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.97)

    # Apply boundaries to df (all comparisons are strict)
    within_bounds = (
        (df['price'] > lower_price) &
        (df['price'] < upper_price) &
        (df['bed'] < 14) &
        (df['bath'] < 12) &
        (df['house_size'] < upper_house_size) &
        (df['acre_lot'] > lower_acre_lot) &
        (df['acre_lot'] < upper_acre_lot) &
        (df['ppsf_zipcode'] > lower_ppsf_zipcode) &
        (df['ppsf_zipcode'] < upper_ppsf_zipcode)
    )

    return df[within_bounds]

def create_zip_dict(df):
    """
    Build a lookup from zip code to its ppsf_zipcode value.

    When a zip code occurs on several rows, the value from its first row is
    the one that ends up in the lookup.
    Example: {12345: 210.5, 67890: 198.3}
    """
    # One row per zip code, keeping the first occurrence
    first_row_per_zip = df.drop_duplicates(subset="zip_code")

    # Series indexed by zip code -> plain {zip_code: ppsf_zipcode} dict
    ppsf_by_zip = first_row_per_zip.set_index("zip_code")["ppsf_zipcode"]
    return ppsf_by_zip.to_dict()
