In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.ensemble import HistGradientBoostingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Load your enriched dataset from the previous step
df = pd.read_csv("transformed_tabular_v2.csv")
X = df.drop(columns=['log_price', 'lat', 'long'])
y = df['log_price']

def train_tabular_baseline(X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=101)
    oof_preds = np.zeros(len(X))
    
    print("üöÄ Training Advanced Tabular Ensemble...")
    
    for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
        
        # HistGradientBoosting: Very fast, handles large datasets well
        m1 = HistGradientBoostingRegressor(max_iter=1000, learning_rate=0.04, max_depth=10, random_state=101)
        # LightGBM: Leaf-wise growth for higher precision
        m2 = LGBMRegressor(n_estimators=1000, learning_rate=0.04, num_leaves=70, verbose=-1, random_state=101)
        # CatBoost: Excellent at handling numerical and categorical relationships
        m3 = CatBoostRegressor(iterations=1000, learning_rate=0.04, depth=8, verbose=0, random_seed=101)
        
        m1.fit(X_tr, y_tr)
        m2.fit(X_tr, y_tr)
        m3.fit(X_tr, y_tr)
        
        # Blended Prediction
        fold_preds = (0.3 * m1.predict(X_va)) + (0.35 * m2.predict(X_va)) + (0.35 * m3.predict(X_va))
        oof_preds[va_idx] = fold_preds
        print(f"‚úÖ Fold {fold} Complete")

    score = r2_score(y, oof_preds)
    return score, (m1, m2, m3)

tabular_r2, base_models = train_tabular_baseline(X, y)
print(f"\n--- Final Tabular R2: {tabular_r2:.5f} ---")


üöÄ Training Advanced Tabular Ensemble...
‚úÖ Fold 1 Complete
‚úÖ Fold 2 Complete
‚úÖ Fold 3 Complete
‚úÖ Fold 4 Complete
‚úÖ Fold 5 Complete

--- Final Tabular R2: 0.82320 ---


In [2]:
import os
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
import numpy as np

def extract_efficientnet_features(image_folder, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    print(f"üñ•Ô∏è  Using device: {device}")

    # Load Pre-trained EfficientNet-B0
    # Weights.DEFAULT ensures we use the best available ImageNet weights
    model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
    
    # Remove the final classification head to get the 1280-dimensional feature vector
    model.classifier = nn.Identity()
    model = model.to(device)
    model.eval()

    # Image transformations: EfficientNet-B0 expects 224x224
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    features = []
    # Sort files to ensure they match the order of your tabular dataframe
    # Assuming files are named 'tile_id_0.jpg', 'tile_id_1.jpg', etc.
    image_files = sorted([f for f in os.listdir(image_folder) if f.endswith('.jpg')],
                        key=lambda x: int(x.split('_')[-1].split('.')[0]))

    print(f"üñºÔ∏è  Extracting features from {len(image_files)} images...")
    
    with torch.no_grad():
        for img_name in tqdm(image_files):
            img_path = os.path.join(image_folder, img_name)
            try:
                img = Image.open(img_path).convert('RGB')
                input_tensor = preprocess(img).unsqueeze(0).to(device)
                
                # Extract the 1280-dimensional embedding
                feat = model(input_tensor)
                features.append(feat.cpu().numpy().flatten())
            except Exception as e:
                print(f"Error processing {img_name}: {e}")
                # Append zeros if image is corrupted to keep array alignment
                features.append(np.zeros(1280))

    return np.array(features)

visual_features_train = extract_efficientnet_features("property_visuals")
np.save("train_visual_embeddings.npy", visual_features_train)

üñ•Ô∏è  Using device: cpu
üñºÔ∏è  Extracting features from 16209 images...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16209/16209 [13:39<00:00, 19.77it/s]  


In [5]:
import pandas as pd


In [1]:
import numpy as np
import sklearn
print(np.__version__)
print(sklearn.__version__)


2.3.5
1.8.0


In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# 1. Prepare Data
# Using the enriched features from your preprocessing step
X_tab = pd.read_csv("transformed_tabular_v2.csv").drop(columns=['log_price', 'lat', 'long'])
y = pd.read_csv("transformed_tabular_v2.csv")['log_price']

X_train, X_test, y_train, y_test = train_test_split(X_tab, y, test_size=0.2, random_state=42)

# 2. Initialize Models
rf = RandomForestRegressor(n_estimators=200, max_depth=12, random_state=42)
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.03, max_depth=7, random_state=42)
cat = CatBoostRegressor(iterations=1000, learning_rate=0.03, depth=7, verbose=0, random_seed=42)

# 3. Create the Ensemble (Weighted Voting)
# We give higher weight to CatBoost as it's typically the strongest on real estate data
tabular_ensemble = VotingRegressor(
    estimators=[('rf', rf), ('xgb', xgb), ('cat', cat)],
    weights=[1, 2, 2]
)

print("üöÄ Training Tabular Ensemble...")
tabular_ensemble.fit(X_train, y_train)

# 4. Evaluation
tabular_preds = tabular_ensemble.predict(X_test)
r2_tabular = r2_score(y_test, tabular_preds)
print(f"üìä Tabular Ensemble R2 Score: {r2_tabular:.5f}")

üöÄ Training Tabular Ensemble...
üìä Tabular Ensemble R2 Score: 0.82018


In [2]:
from sklearn.decomposition import PCA

# 1. Load pre-extracted visual features
# (Assuming visual_features_train was saved from the EfficientNet step)
vis_embeddings = np.load("train_visual_embeddings.npy")

# 2. Dimensionality Reduction
pca = PCA(n_components=50, random_state=42)
vis_reduced = pca.fit_transform(vis_embeddings)

# 3. Combine Tabular + Visual for the Hybrid dataset
X_hybrid = np.hstack([X_tab.values, vis_reduced])
X_h_train, X_h_test, y_h_train, y_h_test = train_test_split(X_hybrid, y, test_size=0.2, random_state=42)

# 4. Hybrid Regressor
# We use a high-depth CatBoost to handle the high-dimensional fused space
hybrid_model = CatBoostRegressor(iterations=1500, learning_rate=0.03, depth=8, verbose=0, random_seed=42)

print("üß¨ Training Hybrid Multimodal Model...")
hybrid_model.fit(X_h_train, y_h_train)

# 5. Evaluation
hybrid_preds = hybrid_model.predict(X_h_test)
r2_hybrid = r2_score(y_h_test, hybrid_preds)
print(f"üñºÔ∏è Hybrid Multimodal R2 Score: {r2_hybrid:.5f}")

üß¨ Training Hybrid Multimodal Model...
üñºÔ∏è Hybrid Multimodal R2 Score: 0.83545


In [3]:
print("\n" + "="*40)
print(f"üèÜ FINAL BATTLE RESULTS")
print(f"Standard Tabular Ensemble: {r2_tabular:.5f}")
print(f"Hybrid Visual + Tabular:   {r2_hybrid:.5f}")
print("="*40)

if r2_hybrid > r2_tabular:
    print("‚ú® SUCCESS: Satellite imagery improves prediction accuracy.")
    print("Action: Using Hybrid Pipeline for Test Data...")
    
    # Process test imagery from 'property_visuals_test'
    # vis_test = extract_efficientnet_features("property_visuals_test")
    # vis_test_pca = pca.transform(vis_test)
    # Combine and predict...
else:
    print("‚ö†Ô∏è  NOTICE: Tabular data alone is more efficient for this split.")
    print("Action: Using Tabular Ensemble for Test Data...")


üèÜ FINAL BATTLE RESULTS
Standard Tabular Ensemble: 0.82018
Hybrid Visual + Tabular:   0.83545
‚ú® SUCCESS: Satellite imagery improves prediction accuracy.
Action: Using Hybrid Pipeline for Test Data...


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Load original raw data to ensure no useful columns are missing
train_raw = pd.read_csv("train(1)(train(1)).csv")

def fallback_preprocessing(df):
    X = df.copy()
    
    # 1. DO NOT DROP LAT/LONG - Location is 90% of real estate value
    # 2. Convert 'date' to 'house_age_at_sale'
    X['date'] = pd.to_datetime(X['date'])
    X['age_at_sale'] = X['date'].dt.year - X['yr_built']
    
    # 3. Create a 'renovated' binary flag
    X['is_renovated'] = (X['yr_renovated'] > 0).astype(int)
    
    # 4. Log Transform area columns (reduces outlier impact)
    area_cols = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
    for col in area_cols:
        X[f'log_{col}'] = np.log1p(X[col])
    
    # 5. Drop the original versions of transformed columns + IDs
    cols_to_drop = ['id', 'date', 'yr_built', 'yr_renovated'] + area_cols
    X = X.drop(columns=cols_to_drop)
    
    return X

X_final = fallback_preprocessing(train_raw.drop(columns=['price']))
y_log = np.log1p(train_raw['price'])

X_train, X_test, y_train, y_test = train_test_split(X_final, y_log, test_size=0.2, random_state=42)

In [5]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

# Initialize with "Safe" parameters
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, subsample=0.8, random_state=42)
cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, verbose=0, random_seed=42)
rf = RandomForestRegressor(n_estimators=300, max_depth=15, random_state=42)

print("üöÄ Training Reliable Ensemble...")
xgb.fit(X_train, y_train)
cat.fit(X_train, y_train)
rf.fit(X_train, y_train)

# Blend: Giving more weight to CatBoost and XGBoost
y_pred = (0.4 * xgb.predict(X_test)) + (0.4 * cat.predict(X_test)) + (0.2 * rf.predict(X_test))

r2_tabular = r2_score(y_test, y_pred)
print(f"‚úÖ Fallback Tabular R2: {r2_tabular:.5f}")

üöÄ Training Reliable Ensemble...
‚úÖ Fallback Tabular R2: 0.90561


In [6]:
# Assuming you have EfficientNet features extracted as 'vis_embeddings'
from sklearn.decomposition import PCA

pca = PCA(n_components=50, random_state=42)
vis_reduced = pca.fit_transform(vis_embeddings)

# Combine with our 'Restored' Tabular features
X_hybrid = np.hstack([X_final.values, vis_reduced])
X_h_train, X_h_test, y_h_train, y_h_test = train_test_split(X_hybrid, y_log, test_size=0.2, random_state=42)

# Hybrid Model
h_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, verbose=0)
h_model.fit(X_h_train, y_h_train)

r2_hybrid = r2_score(y_h_test, h_model.predict(X_h_test))
print(f"üñºÔ∏è  Fallback Hybrid R2: {r2_hybrid:.5f}")

# Final Comparison
if r2_hybrid > r2_tabular:
    print("üèÜ Hybrid is superior. Predicting on 'property_visuals_test'...")
else:
    print("üèÜ Tabular is superior. Using restored columns for prediction.")

üñºÔ∏è  Fallback Hybrid R2: 0.90332
üèÜ Tabular is superior. Using restored columns for prediction.


In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# 1. LOAD & CLEAN DATA
def quick_preprocess(df, is_train=True):
    X = df.copy()
    X.columns = X.columns.str.strip()
    
    # Feature Engineering (The "Safe" Set)
    X['date'] = pd.to_datetime(X['date'])
    X['age_at_sale'] = X['date'].dt.year - X['yr_built']
    X['is_renovated'] = (X['yr_renovated'] > 0).astype(int)
    
    # Log area columns
    area_cols = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
    for col in area_cols:
        X[f'log_{col}'] = np.log1p(X[col])
    
    y = None
    if is_train:
        y = np.log1p(X['price'])
        X = X.drop(columns=['price'])
        
    X = X.drop(columns=['id', 'date', 'yr_built', 'yr_renovated'] + area_cols)
    return X, y

# Load datasets
train_raw = pd.read_csv("train(1)(train(1)).csv")
test_raw = pd.read_csv("test2(test(1)).csv")

X_final, y_log = quick_preprocess(train_raw, is_train=True)
X_test_final, _ = quick_preprocess(test_raw, is_train=False)

# 2. TRAIN THE WINNING ENSEMBLE
X_train, X_val, y_train, y_val = train_test_split(X_final, y_log, test_size=0.15, random_state=42)

print("‚ö° Training Tabular Ensemble (Fast Mode)...")
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=6, random_state=42).fit(X_train, y_train)
cat = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, verbose=0, random_seed=42).fit(X_train, y_train)
rf = RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42).fit(X_train, y_train)

# 3. EVALUATE & PREDICT
def get_blend(models, data):
    m1, m2, m3 = models
    return (0.4 * m1.predict(data)) + (0.4 * m2.predict(data)) + (0.2 * m3.predict(data))

val_preds = get_blend((xgb, cat, rf), X_val)
print(f"üìä Validation R2 Score: {r2_score(y_val, val_preds):.5f}")

# 4. FINAL SUBMISSION
final_log_preds = get_blend((xgb, cat, rf), X_test_final)
final_prices = np.expm1(final_log_preds)

submission = pd.DataFrame({'id': test_raw['id'], 'predicted_price': final_prices})
submission.to_csv("tabular_final_submission.csv", index=False)
print("üíæ Done! Final predictions saved: 'tabular_final_submission.csv'")

‚ö° Training Tabular Ensemble (Fast Mode)...
üìä Validation R2 Score: 0.90624
üíæ Done! Final predictions saved: 'tabular_final_submission.csv'
