In [372]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import pgeocode
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib
import os
from datetime import datetime

In [373]:
# Read HouseTS.csv into area_df
area_df = pd.read_csv('../raw_data/HouseTS.csv')

# Read realtor-data.csv into house_df
house_df = pd.read_csv('../raw_data/realtor-data.csv')

In [374]:
# Create list of unique zipcodes in area_df
unique_zipcodes_area_df = area_df['zipcode'].unique().tolist()

# Filter house_df by unique_zipcoes_area_df
house_df = house_df[house_df['zip_code'].isin(unique_zipcodes_area_df)]

In [375]:
def clean_data(df):
    # Drop columns 'brokered_by', 'status'
    df = df.drop(columns=['brokered_by', 'status'])

     # Drop duplicates
    df = df.drop_duplicates()

    # Drop columns 'street', 'city', 'state' and 'prev_sold_date'
    df = df.drop(columns=['street', 'city', 'state', 'prev_sold_date'])

    # Drop rows with NaN values from 'price'
    df = df.dropna(subset=['price'])

    # Create list where 'bed' & 'bath' & 'house_size' are NaN
    nan_values = df[
        (pd.isna(df['bed'])) &
        (pd.isna(df['bath'])) &
        (pd.isna(df['house_size']))
    ]

    # Filter out rows that are in nan_values because we assume they are land sales
    df = df[~df.index.isin(nan_values.index)]

    # Impute missing data
    df['bed'] = df['bed'].fillna(df['bed'].median())
    df['bath'] = df['bath'].fillna(df['bath'].median())
    df['house_size'] = df['house_size'].fillna(df['house_size'].median())
    df['acre_lot'] = df['acre_lot'].fillna(0)

    # Step 2: Calculate PPSF for each row
    df['ppsf'] = df['price'] / df['house_size']

    # Step 3: Calculate median PPSF per zip_code
    ppsf_median = df.groupby('zip_code')['ppsf'].median().reset_index(name='ppsf_zipcode')

    # Step 4: Merge median PPSF back to df
    df = df.merge(ppsf_median, on='zip_code', how='left')

    # Drop temporary ppsf column
    df = df.drop(columns=['ppsf'])

    # Calculate boundaries for 'price', 'acre_lot', 'house_size', 'ppsf_zipcode'
    lower_price = df['price'].quantile(0.03)
    upper_price = df['price'].quantile(0.97)
    upper_house_size = df['house_size'].quantile(0.99)
    lower_acre_lot = df['acre_lot'].quantile(0.01)
    upper_acre_lot = df['acre_lot'].quantile(0.99)
    lower_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.03)
    upper_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.97)

    # Apply boundaries to df
    df = df[
        (df['price'] > lower_price) &
        (df['price'] < upper_price) &
        (df['bed'] < 14) &
        (df['bath'] < 12) &
        (df['house_size'] < upper_house_size) &
        (df['acre_lot'] > lower_acre_lot) &
        (df['acre_lot'] < upper_acre_lot) &
        (df['ppsf_zipcode'] > lower_ppsf_zipcode) &
        (df['ppsf_zipcode'] < upper_ppsf_zipcode)
        ]

    return df

In [376]:
# Clean df
cleaned_house_df = clean_data(house_df)

In [377]:
cleaned_house_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 608684 entries, 0 to 854410
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         608684 non-null  float64
 1   bed           608684 non-null  float64
 2   bath          608684 non-null  float64
 3   acre_lot      608684 non-null  float64
 4   zip_code      608684 non-null  float64
 5   house_size    608684 non-null  float64
 6   ppsf_zipcode  608684 non-null  float64
dtypes: float64(7)
memory usage: 37.2 MB


In [363]:
def convert_zipcode(df):
    # Convert zip_code column to 5-digit string
    df['zip_code'] = df['zip_code'].astype(str).str.replace('\.0$', '', regex=True).str.zfill(5)

    # Get unique zip codes
    unique_zips = df['zip_code'].unique()

    # Initialize pgeocode for US
    nomi = pgeocode.Nominatim('us')

    # Function to get coordinates
    def get_coordinates(zip_code):
        try:
            result = nomi.query_postal_code(zip_code)
            if result.empty or pd.isna(result.latitude):
                return pd.Series([None, None])
            return pd.Series([result.latitude, result.longitude])
        except:
            return pd.Series([None, None])

    # Create DataFrame for unique zip codes
    zip_coords = pd.DataFrame(unique_zips, columns=['zip_code'])
    zip_coords[['latitude', 'longitude']] = zip_coords.apply(lambda row: get_coordinates(row['zip_code']), axis=1)

    # Map coordinates back to filtered_house_df
    coords_dict = zip_coords.set_index('zip_code')[['latitude', 'longitude']].to_dict('index')
    df['latitude'] = df['zip_code'].map(lambda x: coords_dict.get(x, {}).get('latitude'))
    df['longitude'] = df['zip_code'].map(lambda x: coords_dict.get(x, {}).get('longitude'))

    # Drop 'zip_code' column
    df = df.drop(columns=['zip_code'])

    return df

In [378]:
# Convert zipcodes to coordinates
#cleaned_house_df = convert_zipcode(cleaned_house_df)
#cleaned_house_df

In [379]:
cleaned_house_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 608684 entries, 0 to 854410
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   price         608684 non-null  float64
 1   bed           608684 non-null  float64
 2   bath          608684 non-null  float64
 3   acre_lot      608684 non-null  float64
 4   zip_code      608684 non-null  float64
 5   house_size    608684 non-null  float64
 6   ppsf_zipcode  608684 non-null  float64
dtypes: float64(7)
memory usage: 37.2 MB


In [383]:
# Work on a copy to avoid SettingWithCopyWarning
df = cleaned_house_df.copy()

# Define features and target
target = 'price'
features = [col for col in df.columns if col != target]  # Exclude price
numeric_features = [col for col in features if col != 'zip_code']  # Exclude zip_code

# Verify columns
print("\nFeatures:", features)
print("Numeric features for scaling:", numeric_features)
if target not in df.columns:
    raise ValueError(f"'{target}' column not found. Available columns: {df.columns.tolist()}")

# Create X and y
X = df[features]
y = df[target]

# Preprocess with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ],
    remainder='passthrough'  # Keep zip_code unscaled
)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create pipeline
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1))
])

# Train model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)

# Calculate metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print("\nModel Results:")
print(f"XGBoost RMSE: ${rmse:,.2f}")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")

# Feature importance
feature_names = numeric_features + ['zip_code']
print("\nFeature Importance:")
for name, importance in zip(feature_names, pipeline.named_steps['regressor'].feature_importances_):
    print(f"{name}: {importance:.4f}")

# Sample of actual vs. predicted prices
results_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred,
    'Difference': y_test - y_pred
})
print("\nSample of Actual vs. Predicted Prices:")
print(results_df.head(10))



Features: ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size', 'ppsf_zipcode']
Numeric features for scaling: ['bed', 'bath', 'acre_lot', 'house_size', 'ppsf_zipcode']

Model Results:
XGBoost RMSE: $157,633.43
Mean Absolute Error (MAE): $95,198.92
R² Score: 0.8281

Feature Importance:
bed: 0.0175
bath: 0.1021
acre_lot: 0.0223
house_size: 0.3833
ppsf_zipcode: 0.4621
zip_code: 0.0126

Sample of Actual vs. Predicted Prices:
        Actual Price  Predicted Price     Difference
524430      375000.0    463914.781250  -88914.781250
711473      342000.0    399033.906250  -57033.906250
730429      510000.0    475309.968750   34690.031250
149804     1675000.0    800231.500000  874768.500000
74707       475000.0    494328.593750  -19328.593750
625130      289500.0    380936.031250  -91436.031250
79687       699999.0    893581.250000 -193582.250000
532139      770000.0    717525.125000   52474.875000
391375      549000.0    471772.656250   77227.343750
271014      190000.0    172621.078125   1737

In [382]:
# Define model save path
model_dir = "/Users/carlokrups/code/JensKlug/zillows_real_estate/zillow/model/models"
os.makedirs(model_dir, exist_ok=True)

# Timestamp for versioning
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
model_path = os.path.join(model_dir, f"{timestamp}_xgboost_pipeline.joblib")

# Save pipeline
joblib.dump(pipeline, model_path)

print(f"\n✅ Model saved to: {model_path}")


✅ Model saved to: /Users/carlokrups/code/JensKlug/zillows_real_estate/zillow/model/models/20250611-141552_xgboost_pipeline.joblib
