<a href="https://colab.research.google.com/github/MangeshVR1546/Satellite-Imagery-Based-Property-Valuation-Project/blob/main/multimodel_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Baseline model training using only tabular data**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# 1. Load your training data (tabular only)
df = pd.read_csv("/content/train(1)(train(1)).csv")

# 2. Basic preprocessing / feature selection
#    Adjust this list to match the columns you want to use
feature_cols = [
    "bedrooms", "bathrooms", "sqft_living", "sqft_lot",
    "floors", "waterfront", "view", "condition", "grade",
    "sqft_above", "sqft_basement", "lat", "long",
    "sqft_living15", "sqft_lot15"
]

# Drop rows with missing target
df = df.dropna(subset=["price"])

X = df[feature_cols].fillna(0)
y = df["price"].values

# Optional: log-transform target for numeric stability
y_log = np.log1p(y)

# 3. Train/validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

# 4. Scale features (helps tree models a bit; essential for linear models)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)

# 5. Baseline model: XGBoost regressor (tabular only)
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist"
)
model.fit(X_train_s, y_train)

# 6. Evaluation: R² and price RMSE
y_val_pred_log = model.predict(X_val_s)

# Convert back to price space
y_val_true_price = np.expm1(y_val)
y_val_pred_price = np.expm1(y_val_pred_log)

r2 = r2_score(y_val_true_price, y_val_pred_price)          # [web:275][web:285]
rmse = np.sqrt(mean_squared_error(y_val_true_price, y_val_pred_price))  # [web:283][web:281]

print("=== Baseline Tabular Model ===")
print(f"R² score:       {r2:.4f}")
print(f"Price RMSE:     ${rmse:,.0f}")




=== Baseline Tabular Model ===
R² score:       0.8859
Price RMSE:     $119,645


**Multimodel Training**



In [None]:

"""
Multimodal House Price Prediction Pipeline
Tabular + Satellite Image Features → XGBoost → Price Prediction
"""
%pip install category_encoders optuna
import pandas as pd
import numpy as np
import optuna
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from category_encoders import TargetEncoder

def train_multimodal_pipeline(tabular_path, img_features_path, img_ids_path):
    # Load data
    tabular_data = pd.read_csv(tabular_path)
    image_features = np.load(img_features_path)
    img_house_ids = pd.read_csv(img_ids_path)['id'].values

    print(f"Tabular: {len(tabular_data)} | Images: {len(image_features)}")

    # Align
    common_ids = np.intersect1d(tabular_data['id'].values, img_house_ids)
    tab_aligned = tabular_data[tabular_data['id'].isin(common_ids)].reset_index(drop=True)
    img_idx = [np.where(img_house_ids == id_)[0][0] for id_ in tab_aligned['id']]
    img_aligned = image_features[img_idx]

    # Zipcode encoding
    zip_encoder = TargetEncoder(cols=['zipcode'])
    tab_aligned['zipcode_encoded'] = zip_encoder.fit_transform(
        tab_aligned[['zipcode']], np.log1p(tab_aligned['price'])
    )['zipcode']

    # Features (21 tabular)
    tab_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                    'waterfront', 'view', 'condition', 'grade', 'sqft_above',
                    'sqft_basement', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
                    'log_dist_city', 'log_dist_water', 'log_dist_tech',
                    'location_cluster', 'is_renovated', 'zipcode_encoded']

    X_tab = tab_aligned[tab_features].fillna(0).values
    X_img = img_aligned
    y = np.log1p(tab_aligned['price'].values)

    # Split
    X_tab_train, X_tab_val, X_img_train, X_img_val, y_train, y_val = train_test_split(
        X_tab, X_img, y, test_size=0.2, random_state=12345
    )

    # Scale tabular
    tab_scaler = RobustScaler()
    X_tab_train_s = tab_scaler.fit_transform(X_tab_train)
    X_tab_val_s = tab_scaler.transform(X_tab_val)

    # Scale + PCA images
    img_scaler = RobustScaler()
    X_img_train_s = img_scaler.fit_transform(X_img_train)
    X_img_val_s = img_scaler.transform(X_img_val)

    pca = PCA(n_components=64, random_state=42)
    X_img_train_pca = pca.fit_transform(X_img_train_s)
    X_img_val_pca = pca.transform(X_img_val_s)

    # Multimodal
    X_train_mm = np.hstack([X_tab_train_s, X_img_train_pca])
    X_val_mm = np.hstack([X_tab_val_s, X_img_val_pca])

    # Optuna
    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 800, 2000),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
            'max_depth': trial.suggest_int('max_depth', 5, 9),
            'subsample': trial.suggest_float('subsample', 0.8, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 5.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 5.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'random_state': 42, 'tree_method': 'hist'
        }
        model = XGBRegressor(**params)
        model.fit(X_train_mm, y_train)
        return np.sqrt(mean_squared_error(y_val, model.predict(X_val_mm)))

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50)

    # Best model
    best_params = study.best_params.copy()
    best_params.update({'random_state': 42, 'tree_method': 'hist'})
    final_model = XGBRegressor(**best_params)
    final_model.fit(X_train_mm, y_train)

    # Pipeline
    pipeline = {
        'model': final_model, 'tab_scaler': tab_scaler, 'img_scaler': img_scaler,
        'pca': pca, 'zip_encoder': zip_encoder, 'tab_features': tab_features,
        'img_dim': 512, 'study': study
    }
    joblib.dump(pipeline, 'multimodal_pipeline.pkl')

    # Metrics
    y_pred = final_model.predict(X_val_mm)
    print(f"R²: {r2_score(y_val, y_pred):.4f}")
    print(f"Price RMSE: ${np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(y_pred))):,.0f}")

    return pipeline

# Run
pipeline = train_multimodal_pipeline(
    '/content/preprocessed_data.csv',
    '/content/image_features_training.npy',
    '/content/image_feature_ids_training.csv'
)

Collecting category_encoders
  Downloading category_encoders-2.9.0-py3-none-any.whl.metadata (7.9 kB)
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading category_encoders-2.9.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna, category_encoders
Successfully installed category_encoders-2.9.0 colorlog-6.10.1 optuna-4.6.0
Tabular: 16209 | Images: 3241


[I 2026-01-04 16:29:21,249] A new study created in memory with name: no-name-0a81d888-1b7e-4464-8017-906954a89ae9
[I 2026-01-04 16:30:08,072] Trial 0 finished with value: 0.20503230726484747 and parameters: {'n_estimators': 1674, 'learning_rate': 0.013953586037052972, 'max_depth': 9, 'subsample': 0.944475261915234, 'colsample_bytree': 0.9101016803225842, 'reg_alpha': 1.654595085160443, 'reg_lambda': 0.04392755508293196, 'min_child_weight': 6}. Best is trial 0 with value: 0.20503230726484747.
[I 2026-01-04 16:30:31,565] Trial 1 finished with value: 0.19925702970746612 and parameters: {'n_estimators': 1178, 'learning_rate': 0.03364255673576441, 'max_depth': 7, 'subsample': 0.9381116126585299, 'colsample_bytree': 0.8142704981064148, 'reg_alpha': 0.022224158188512935, 'reg_lambda': 1.5202857519788304, 'min_child_weight': 5}. Best is trial 1 with value: 0.19925702970746612.
[I 2026-01-04 16:30:57,228] Trial 2 finished with value: 0.19847260530031605 and parameters: {'n_estimators': 1204, 'l

**Price prediction using multimodel on Test_data**

In [None]:
"""
Live Satellite Prediction for Test Set
House lat/lon → Mapbox → ResNet50 → Features → Model
"""

import pandas as pd
import numpy as np
import joblib
import requests
from PIL import Image
import io
import torch
import torchvision.transforms as T
import torchvision.models as models

def predict_test_set_live_satellite(test_path, pipeline_path='multimodal_pipeline.pkl', mapbox_token=None):
    # Load data
    test_df = pd.read_csv(test_path)
    pipeline = joblib.load(pipeline_path)

    print(f"Predicting {len(test_df)} test houses with LIVE satellite")

    # Dedupe
    test_df = test_df.drop_duplicates(subset=['id'], keep='first').reset_index(drop=True)

    # Zipcode encoding
    test_df['zipcode_encoded'] = pipeline['zip_encoder'].transform(test_df[['zipcode']])['zipcode']

    # ResNet50 feature extractor
    model_feat = models.resnet50(pretrained=True)
    model_feat.fc = torch.nn.Identity()
    model_feat.eval()
    preprocess = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    predictions = []

    for idx, house in test_df.iterrows():
        print(f"Processing house {house['id']} ({idx+1}/{len(test_df)})")

        # Tabular features
        X_tab = house[pipeline['tab_features']].values.reshape(1, -1)
        X_tab_s = pipeline['tab_scaler'].transform(X_tab)

        # LIVE SATELLITE IMAGE
        url = f"https://api.mapbox.com/styles/v1/mapbox/satellite-v9/static/{house['long']},{house['lat']},18/512x512@2x?access_token={mapbox_token}"
        img_data = requests.get(url, timeout=10).content
        img = Image.open(io.BytesIO(img_data)).convert('RGB')

        # Extract SPECIFIC house features
        img_t = preprocess(img).unsqueeze(0)
        with torch.no_grad():
            img_feat = model_feat(img_t).numpy().flatten()[:512]  # Match training dim

        # Transform image features
        X_img_s = pipeline['img_scaler'].transform(img_feat.reshape(1, -1))
        X_img_pca = pipeline['pca'].transform(X_img_s)

        # Multimodal prediction
        X_mm = np.hstack([X_tab_s, X_img_pca])
        pred_log = pipeline['model'].predict(X_mm)[0]
        pred_price = np.expm1(pred_log).round(0)
        predictions.append(pred_price)

    # Submission
    submission = pd.DataFrame({
        'id': test_df['id'].astype(int),
        'predicted_price': predictions
    })
    submission.to_csv('24117072_final.csv', index=False)

    print(f"✅ 24117072_final.csv saved: {len(submission)} predictions")
    return submission

# Usage
MAPBOX_TOKEN = "pk.eyJ1IjoibWFuZ2VzaDExMTExIiwiYSI6ImNtanlyc2ZqZTBmcmYzZnNjc2Z6bzl0MTAifQ.62QGA4F-FifDpt1b5k9L8A"
submission = predict_test_set_live_satellite(
    '/content/preprocessed_data.csv',
    mapbox_token=MAPBOX_TOKEN
)
