In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import pgeocode
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

In [2]:
# Read HouseTS.csv into area_df
area_df = pd.read_csv('../raw_data/HouseTS.csv')

# Read realtor-data.csv into house_df
house_df = pd.read_csv('../raw_data/realtor-data.csv')

In [3]:
# Create list of unique zipcodes in area_df
unique_zipcodes_area_df = area_df['zipcode'].unique().tolist()

# Filter house_df by unique_zipcoes_area_df
house_df = house_df[house_df['zip_code'].isin(unique_zipcodes_area_df)]

In [4]:
def clean_data(df):
    # Drop columns 'brokered_by', 'status'
    df = df.drop(columns=['brokered_by', 'status'])

     # Drop duplicates
    df = df.drop_duplicates()

    # Drop columns 'street', 'city', 'state' and 'prev_sold_date'
    df = df.drop(columns=['street', 'city', 'state', 'prev_sold_date'])

    # Drop rows with NaN values from 'price'
    df = df.dropna(subset=['price'])

    # Create list where 'bed' & 'bath' & 'house_size' are NaN
    nan_values = df[
        (pd.isna(df['bed'])) &
        (pd.isna(df['bath'])) &
        (pd.isna(df['house_size']))
    ]

    # Filter out rows that are in nan_values because we assume they are land sales
    df = df[~df.index.isin(nan_values.index)]

    # Impute missing data
    df['bed'] = df['bed'].fillna(df['bed'].median())
    df['bath'] = df['bath'].fillna(df['bath'].median())
    df['house_size'] = df['house_size'].fillna(df['house_size'].median())
    df['acre_lot'] = df['acre_lot'].fillna(0)

    # Step 2: Calculate PPSF for each row
    df['ppsf'] = df['price'] / df['house_size']

    # Step 3: Calculate median PPSF per zip_code
    ppsf_median = df.groupby('zip_code')['ppsf'].median().reset_index(name='ppsf_zipcode')

    # Step 4: Merge median PPSF back to df
    df = df.merge(ppsf_median, on='zip_code', how='left')

    # Drop temporary ppsf column
    df = df.drop(columns=['ppsf'])

    # Calculate boundaries for 'price', 'acre_lot', 'house_size', 'ppsf_zipcode'
    lower_price = df['price'].quantile(0.03)
    upper_price = df['price'].quantile(0.97)
    upper_house_size = df['house_size'].quantile(0.99)
    lower_acre_lot = df['acre_lot'].quantile(0.01)
    upper_acre_lot = df['acre_lot'].quantile(0.99)
    lower_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.03)
    upper_ppsf_zipcode = df['ppsf_zipcode'].quantile(0.97)

    # Apply boundaries to df
    df = df[
        (df['price'] > lower_price) &
        (df['price'] < upper_price) &
        (df['bed'] < 14) &
        (df['bath'] < 12) &
        (df['house_size'] < upper_house_size) &
        (df['acre_lot'] > lower_acre_lot) &
        (df['acre_lot'] < upper_acre_lot) &
        (df['ppsf_zipcode'] > lower_ppsf_zipcode) &
        (df['ppsf_zipcode'] < upper_ppsf_zipcode)
        ]

    return df

In [5]:
# Clean df
cleaned_house_df = clean_data(house_df)

In [6]:
def convert_zipcode(df):
    # Convert zip_code column to 5-digit string
    df['zip_code'] = df['zip_code'].astype(str).str.replace('\.0$', '', regex=True).str.zfill(5)

    # Get unique zip codes
    unique_zips = df['zip_code'].unique()

    # Initialize pgeocode for US
    nomi = pgeocode.Nominatim('us')

    # Function to get coordinates
    def get_coordinates(zip_code):
        try:
            result = nomi.query_postal_code(zip_code)
            if result.empty or pd.isna(result.latitude):
                return pd.Series([None, None])
            return pd.Series([result.latitude, result.longitude])
        except:
            return pd.Series([None, None])

    # Create DataFrame for unique zip codes
    zip_coords = pd.DataFrame(unique_zips, columns=['zip_code'])
    zip_coords[['latitude', 'longitude']] = zip_coords.apply(lambda row: get_coordinates(row['zip_code']), axis=1)

    # Map coordinates back to filtered_house_df
    coords_dict = zip_coords.set_index('zip_code')[['latitude', 'longitude']].to_dict('index')
    df['latitude'] = df['zip_code'].map(lambda x: coords_dict.get(x, {}).get('latitude'))
    df['longitude'] = df['zip_code'].map(lambda x: coords_dict.get(x, {}).get('longitude'))

    # Drop 'zip_code' column
    df = df.drop(columns=['zip_code'])

    return df

In [7]:
# Convert zipcodes to coordinates
cleaned_house_df = convert_zipcode(cleaned_house_df)
#cleaned_house_df

In [8]:
cleaned_house_df.describe()

Unnamed: 0,price,bed,bath,acre_lot,house_size,ppsf_zipcode,latitude,longitude
count,608684.0,608684.0,608684.0,608684.0,608684.0,608684.0,608684.0,608684.0
mean,581195.0,3.423869,2.621621,0.418884,2059.583033,284.782134,36.332211,-95.276863
std,380775.3,1.067719,1.029695,0.914755,916.177339,147.29842,5.403365,17.147978
min,115200.0,1.0,1.0,0.01,100.0,125.323069,25.2846,-123.6335
25%,329497.5,3.0,2.0,0.12,1436.0,180.838951,32.9247,-115.2801
50%,475000.0,3.0,2.0,0.17,1809.0,235.627284,34.749,-93.2886
75%,699999.0,4.0,3.0,0.29,2476.0,330.76075,40.40405,-80.4146
max,2499999.0,13.0,11.0,9.99,6571.0,952.941176,48.2395,-70.6194


In [9]:
# # Work on a copy to avoid SettingWithCopyWarning
# df = cleaned_house_df.copy()
# df = df.drop(columns=['latitude', 'longitude'])

# # Define features and target
# target = 'price'
# features = [col for col in df.columns if col != target]  # Exclude price
# numeric_features = [col for col in features if col != 'zip_code']  # Exclude zip_code

# # Verify columns
# print("\nFeatures:", features)
# print("Numeric features for scaling:", numeric_features)
# if target not in df.columns:
#     raise ValueError(f"'{target}' column not found. Available columns: {df.columns.tolist()}")

# # Create X and y
# X = df[features]
# y = df[target]

# # Preprocess with ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_features)
#     ],
#     remainder='passthrough'  # Keep zip_code unscaled
# )

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Create pipeline
# from sklearn.pipeline import Pipeline
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('regressor', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1))
# ])

# # Train model
# pipeline.fit(X_train, y_train)

# # Predict and evaluate
# y_pred = pipeline.predict(X_test)

# # Calculate metrics
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Print results
# print("\nModel Results:")
# print(f"XGBoost RMSE: ${rmse:,.2f}")
# print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
# print(f"R² Score: {r2:.4f}")

# # Feature importance
# feature_names = numeric_features + ['zip_code']
# print("\nFeature Importance:")
# for name, importance in zip(feature_names, pipeline.named_steps['regressor'].feature_importances_):
#     print(f"{name}: {importance:.4f}")

# # Sample of actual vs. predicted prices
# results_df = pd.DataFrame({
#     'Actual Price': y_test,
#     'Predicted Price': y_pred,
#     'Difference': y_test - y_pred
# })
# print("\nSample of Actual vs. Predicted Prices:")
# print(results_df.head(10))


In [10]:
# import pandas as pd
# import numpy as np
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.pipeline import Pipeline
# from xgboost import XGBRegressor

# # 1. Work on a copy to avoid SettingWithCopyWarning
# df = cleaned_house_df.copy()
# df = df.drop(columns=['latitude', 'longitude'])

# # 2. Define features and target
# target = 'price'
# features = [col for col in df.columns if col != target]  # Exclude price
# numeric_features = [col for col in features if col != 'zip_code']  # Exclude zip_code

# # Verify columns
# print("\nFeatures:", features)
# print("Numeric features for scaling:", numeric_features)
# if target not in df.columns:
#     raise ValueError(f"'{target}' column not found. Available columns: {df.columns.tolist()}")

# # 3. Create X and y
# X = df[features]
# y = df[target]

# # 4. Preprocess with ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numeric_features)  # Scale numeric features
#     ],
#     remainder='passthrough'  # Keep zip_code unscaled
# )

# # 5. Split data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # 6. Create pipeline with XGBoost
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('regressor', XGBRegressor(random_state=42, n_jobs=-1))
# ])

# # 7. Define hyperparameter grid for GridSearchCV
# param_grid = {
#     'regressor__n_estimators': [100, 200],  # Number of trees
#     'regressor__learning_rate': [0.01, 0.1],  # Step size for boosting
#     'regressor__max_depth': [3, 5],  # Depth of trees
#     'regressor__min_child_weight': [1, 3]  # Minimum sum of instance weight needed in a child
# }

# # 8. Perform GridSearchCV
# grid_search = GridSearchCV(
#     pipeline,
#     param_grid,
#     cv=3,  # 3-fold cross-validation
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,  # Use all cores
#     verbose=1
# )

# # 9. Fit the model
# grid_search.fit(X_train, y_train)

# # 10. Evaluate
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)

# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# # Print results
# print("\nModel Results:")
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best cross-validated RMSE: ${np.sqrt(-grid_search.best_score_):,.2f}")
# print(f"Test RMSE: ${rmse:,.2f}")
# print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
# print(f"R² Score: {r2:.4f}")

# # 11. Feature importance
# feature_names = numeric_features + ['zip_code']
# regressor = best_model.named_steps['regressor']
# print("\nFeature Importance:")
# for name, importance in zip(feature_names, regressor.feature_importances_):
#     print(f"{name}: {importance:.4f}")

# # 12. Sample of actual vs. predicted prices
# results_df = pd.DataFrame({
#     'Actual Price': y_test,
#     'Predicted Price': y_pred,
#     'Difference': y_test - y_pred
# })
# print("\nSample of Actual vs. Predicted Prices:")
# print(results_df.head(10))

In [11]:
print("test")

test


In [12]:

# 1. Work on a copy to avoid SettingWithCopyWarning
df = cleaned_house_df.copy()

# 2. Define features and target
target = 'price'
features = [col for col in df.columns if col != target]  # Exclude price
numeric_features = [col for col in features if col != 'zip_code']  # Exclude zip_code

# Verify columns
print("\nFeatures:", features)
print("Numeric features for scaling:", numeric_features)
if target not in df.columns:
    raise ValueError(f"'{target}' column not found. Available columns: {df.columns.tolist()}")

# 3. Create X and y
X = df[features]
y = df[target]

# 4. Preprocess with ColumnTransformer (using RobustScaler)
preprocessor = ColumnTransformer(
    transformers=[
        ('std', StandardScaler(), ['latitude', 'longitude']),
        ('rob', RobustScaler(), ['bed', 'bath', 'acre_lot', 'house_size', 'ppsf_zipcode'])
    ],
    remainder='passthrough'
)
# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,)

# 6. Create pipeline with XGBoost
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42, n_jobs=-1))
])

# 7. Define hyperparameter grid for GridSearchCV
param_grid = {
    'regressor__n_estimators': [100, 200],  # Number of trees
    'regressor__learning_rate': [0.01, 0.1],  # Step size for boosting
    'regressor__max_depth': [3, 5],  # Depth of trees
    'regressor__min_child_weight': [1, 3]  # Minimum sum of instance weight needed in a child
}

# 8. Perform GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,  # 3-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all cores
    verbose=1
)

# 9. Fit the model
grid_search.fit(X_train, y_train)

# 10. Evaluate
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print("\nModel Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validated RMSE: ${np.sqrt(-grid_search.best_score_):,.2f}")
print(f"Test RMSE: ${rmse:,.2f}")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"R² Score: {r2:.4f}")

# 11. Feature importance
feature_names = numeric_features + ['zip_code']
regressor = best_model.named_steps['regressor']
print("\nFeature Importance:")
for name, importance in zip(feature_names, regressor.feature_importances_):
    print(f"{name}: {importance:.4f}")

# 12. Sample of actual vs. predicted prices
results_df = pd.DataFrame({
    'Actual Price': y_test,
    'Predicted Price': y_pred,
    'Difference': y_test - y_pred
})
print("\nSample of Actual vs. Predicted Prices:")
print(results_df.head(10))


Features: ['bed', 'bath', 'acre_lot', 'house_size', 'ppsf_zipcode', 'latitude', 'longitude']
Numeric features for scaling: ['bed', 'bath', 'acre_lot', 'house_size', 'ppsf_zipcode', 'latitude', 'longitude']


NameError: name 'RobustScaler' is not defined