In [3]:
#importing libraries
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [4]:
df = pd.read_csv('train.csv')

In [5]:
print(list(df.columns))

['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15']


In [6]:
# 1. Feature Engineering 
df['date'] = pd.to_datetime(df['date'])
df['sale_month'] = df['date'].dt.month
df['sale_year'] = df['date'].dt.year
df['house_age'] = df['sale_year'] - df['yr_built']
df['is_renovated'] = df['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

# 2. Creating Log Target ( for more accuracy)
df['price_log'] = np.log1p(df['price'])

# 3. droping columns
X = df.drop(columns= ['id', 'date', 'price', 'yr_built', 'yr_renovated', 'zipcode','price_log'])

# 4. Seting the Target
y = df['price_log'] 

# 6. Spliting Data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training Shape: {X_train.shape}")
print(f"Validation Shape: {X_val.shape}")
print("Final Features:")
print(list(X.columns))

Training Shape: (12967, 19)
Validation Shape: (3242, 19)
Final Features:
['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sale_month', 'sale_year', 'house_age', 'is_renovated']


### training model using only tabular data

In [45]:
print("Corrected Features:")
print(list(X_train.columns))

# 1. training XGBoost
model = xgb.XGBRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    early_stopping_rounds=50,
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

# 2. metrics calc
preds_log = model.predict(X_val)
preds_actual = np.expm1(preds_log) 
y_val_actual = np.expm1(y_val)

rmse = np.sqrt(mean_squared_error(y_val_actual, preds_actual))
r2 = r2_score(y_val_actual, preds_actual)
mae = mean_absolute_error(y_val_actual, preds_actual)

print("Results")
print(f"Mean Absolute Error (MAE): ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

Corrected Features:
['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sale_month', 'sale_year', 'house_age', 'is_renovated']
[0]	validation_0-rmse:0.52193
[100]	validation_0-rmse:0.28252
[200]	validation_0-rmse:0.20724
[300]	validation_0-rmse:0.18299
[400]	validation_0-rmse:0.17384
[500]	validation_0-rmse:0.16979
[600]	validation_0-rmse:0.16768
[700]	validation_0-rmse:0.16631
[800]	validation_0-rmse:0.16530
[900]	validation_0-rmse:0.16462
[1000]	validation_0-rmse:0.16401
[1100]	validation_0-rmse:0.16358
[1200]	validation_0-rmse:0.16309
[1300]	validation_0-rmse:0.16272
[1400]	validation_0-rmse:0.16235
[1500]	validation_0-rmse:0.16218
[1600]	validation_0-rmse:0.16194
[1700]	validation_0-rmse:0.16174
[1800]	validation_0-rmse:0.16150
[1900]	validation_0-rmse:0.16127
[1999]	validation_0-rmse:0.16123
Results
Mean Absolute Error (MAE): $64,324.95
RMSE: $115,35

In [None]:
#saving model prediction of test.csv
df_test = pd.read_csv('test.csv')

df_test['date'] = pd.to_datetime(df_test['date'])
df_test['sale_month'] = df_test['date'].dt.month
df_test['sale_year'] = df_test['date'].dt.year
df_test['house_age'] = df_test['sale_year'] - df_test['yr_built']
df_test['is_renovated'] = df_test['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

cols_to_drop = ['id', 'date', 'price', 'yr_built', 'yr_renovated', 'zipcode']
X_test = df_test.drop(columns=cols_to_drop, errors='ignore')

X_test = X_test[X_train.columns]

test_preds_log = model.predict(X_test)
test_preds_actual = np.expm1(test_preds_log)

submission = pd.DataFrame({'id': df_test['id'], 'predicted_price': test_preds_actual})
submission.to_csv('final_predictions.csv', index=False)
print("Saved predictions to 'final_predictions.csv'")

###  MULTIMODAL REGRESSION MODEL

In [None]:
# 1. LOADING DATA
df_tabular = df.drop(columns =['date','price','yr_built','yr_renovated','zipcode','sale_month','sale_year',])
df_visual = pd.read_csv('image_features.csv')

print(f"Tabular Data: {df_tabular.shape}")
print(f"Visual Data:  {df_visual.shape} ")

# 2. merging data 
df_hybrid = pd.merge(df_tabular, df_visual, on='id', how='inner')
print(f"Merged Dataset Shape: {df_hybrid.shape}")

# 3. TRAINING HYBRID MODEL
X = df_hybrid.drop(columns=['id', 'price_log'])
y = df_hybrid['price_log']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=4, n_jobs=-1)
model.fit(X_train, y_train)

# 4. metrics calc
preds_log = model.predict(X_val)
score = r2_score(y_val, preds_log)
mae = mean_absolute_error(np.expm1(y_val), np.expm1(preds_log))

print(f"Hybrid Model Results (on {len(y_val)} test houses):")
print(f"R² Score: {score:.4f}")
print(f"MAE Error: ${mae:,.2f}")

Tabular Data: (16209, 19)
Visual Data:  (9963, 2049) 
Merged Dataset Shape: (10054, 2067)
Hybrid Model Results (on 2011 test houses):
R² Score: 0.8658
MAE Error: $73,490.27


The architecture of this model.
```text
       [ Tabular Data ]                   [ Satellite Image ]
             |                                     |
    ( 20 numbers: age, bed... )           ( 200x200 pixel grid )
             |                                     |
   +---------v---------+                 +---------v---------+
   |   Dense Network   |                 |    CNN (VGG16)    |
   | (Math Processing) |                 | (Visual Processing)|
   +---------+---------+                 +---------+---------+
             |                                     |
      [ Feature Vector A ]                [ Feature Vector B ]
      ( e.g., 32 numbers )                ( e.g., 32 numbers )
             |                                     |
             +------------------+------------------+
                                |
                        [ CONCATENATE ]
                  ( Glue them together -> 64 numbers )
                                |
                      +---------v---------+
                      |   Final Layers    |
                      |  (Interpretation) |
                      +---------+---------+
                                |
                        [ PREDICTED PRICE ]

```