# House Prices: Advanced Regression Techniques

This notebook covers the end-to-end process for the House Prices Kaggle competition.

## Steps:
1. **EDA**: Analyze target variable and missing values.
2. **Feature Engineering**: Create new features and encode categoricals.
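3. **Modeling**: Train Linear Regression, Ridge, Lasso, Random Forest, and XGBoost models and compare them by validation RMSE on log-transformed prices.
4. **Submission**: Retrain the best model on the full training set, generate `submission.csv`, and save the model.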

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Configuration
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_style('whitegrid')

## 1. Load Data

In [None]:
# Read both CSVs in a single tuple assignment: the right-hand side is fully
# evaluated before either name is bound, so a missing test.csv can no longer
# leave train_df defined on its own (the later cells guard only on train_df
# but also read test_df).
try:
    train_df, test_df = (
        pd.read_csv('../data/train.csv'),
        pd.read_csv('../data/test.csv'),
    )
except FileNotFoundError:
    # The message names the directory that is actually read above.
    print("Error: Files not found. Please ensure 'train.csv' and 'test.csv' are in the '../data/' directory.")
else:
    print("Data loaded successfully!")
    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")

## 2. Exploratory Data Analysis (EDA)

In [None]:
if 'train_df' in locals():
    # Histogram of the target with a KDE overlay, drawn on an explicit axes.
    _, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(data=train_df, x='SalePrice', kde=True, ax=ax)
    ax.set_title('Distribution of SalePrice')
    plt.show()

    # Report how asymmetric the target distribution is.
    print("Skewness: %f" % train_df['SalePrice'].skew())

### Missing Values

In [None]:
if 'train_df' in locals():
    # NaN count per column, keeping only columns with at least one NaN,
    # ordered from most to least affected.
    missing = (
        train_df.isnull()
        .sum()
        .loc[lambda counts: counts > 0]
        .sort_values(ascending=False)
    )
    if missing.empty:
        print("No missing values found.")
    else:
        plt.figure(figsize=(12, 6))
        missing.plot.bar()
        plt.title('Missing Values by Feature')
        plt.show()

## 3. Feature Engineering

In [None]:
def feature_engineering(df):
    """Return an imputed copy of ``df`` with engineered area and age features.

    The input frame is not modified. Steps, in order:

    1. Missing values in ``object`` columns become the string ``'None'``.
    2. Missing values in ``int64``/``float64`` columns become the column
       median (simple baseline imputation).
    3. Gaps in ``GarageYrBlt`` are instead filled with the already-imputed
       ``YearBuilt`` of the same row.
    4. ``TotalSF``, ``TotalPorchSF``, ``HouseAge`` and ``GarageAge`` are added.

    Args:
        df: DataFrame containing at least ``TotalBsmtSF``, ``1stFlrSF``,
            ``2ndFlrSF``, ``OpenPorchSF``, ``3SsnPorch``, ``EnclosedPorch``,
            ``ScreenPorch``, ``WoodDeckSF``, ``YrSold``, ``YearBuilt`` and
            ``GarageYrBlt``.

    Returns:
        A new DataFrame holding the imputed original columns plus the four
        engineered columns.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    df_eng = df.copy()

    # Categorical: a missing value becomes the literal category 'None'.
    cat_cols = df_eng.select_dtypes(include=['object']).columns
    for col in cat_cols:
        df_eng[col] = df_eng[col].fillna('None')

    # Numerical: fill with the column median.
    num_cols = df_eng.select_dtypes(include=['int64', 'float64']).columns
    for col in num_cols:
        df_eng[col] = df_eng[col].fillna(df_eng[col].median())

    # GarageYrBlt: where it is missing, use the year the house was built
    # rather than the median garage year. The fill values come from the
    # *imputed* YearBuilt so a row missing both years cannot keep a NaN.
    df_eng['GarageYrBlt'] = df['GarageYrBlt'].fillna(df_eng['YearBuilt'])

    # 1. Total Square Footage
    df_eng['TotalSF'] = df_eng['TotalBsmtSF'] + df_eng['1stFlrSF'] + df_eng['2ndFlrSF']

    # 2. Total Porch Area
    df_eng['TotalPorchSF'] = (df_eng['OpenPorchSF'] + df_eng['3SsnPorch'] +
                              df_eng['EnclosedPorch'] + df_eng['ScreenPorch'] +
                              df_eng['WoodDeckSF'])

    # 3. House Age and Garage Age
    # A build year after the sale year is treated as a data error: the age is
    # floored at 0. `where` keeps strictly positive ages and writes 0
    # everywhere else (including NaN), matching max(0, x) applied per row.
    house_age = df_eng['YrSold'] - df_eng['YearBuilt']
    df_eng['HouseAge'] = house_age.where(house_age > 0, 0)

    garage_age = df_eng['YrSold'] - df_eng['GarageYrBlt']
    df_eng['GarageAge'] = garage_age.where(garage_age > 0, 0)

    return df_eng

if 'train_df' in locals():
    # The target is modelled on the log1p scale.
    train_df['SalePrice'] = np.log1p(train_df['SalePrice'])

    # Keep the test Ids for the submission file before the column is removed.
    test_ids = test_df['Id']

    # Id is a row identifier, not a feature: remove it from both frames in place.
    train_df.drop(columns=['Id'], inplace=True)
    test_df.drop(columns=['Id'], inplace=True)

    # Row counts are needed to split the combined frame back apart.
    ntrain = len(train_df)
    ntest = len(test_df)
    y_train_full = train_df['SalePrice'].to_numpy()

    # Stack train (without the target) on top of test so that imputation and
    # one-hot encoding yield the same columns for both.
    all_data = pd.concat(
        [train_df.drop(columns=['SalePrice']), test_df],
        ignore_index=True,
    )

    # Engineer features, then one-hot encode the categorical columns.
    all_data = pd.get_dummies(feature_engineering(all_data))

    # The first ntrain rows are the training set, the remainder the test set.
    X_train_full = all_data.iloc[:ntrain]
    X_test_full = all_data.iloc[ntrain:]

    print("Feature Engineering Complete.")
    print(f"New shape: {X_train_full.shape}")

## 4. Model Training & Evaluation

In [None]:
if 'X_train_full' in locals():
    # Hold out 20% of the training rows for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full,
        y_train_full,
        test_size=0.2,
        random_state=42,
    )

    # Standardised copies of the features for Linear/Ridge/Lasso; the scaler
    # is fitted on the training split only and then applied to validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    models = {
        'Linear Regression': LinearRegression(),
        'Ridge': Ridge(alpha=1.0),
        'Lasso': Lasso(alpha=0.001),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'XGBoost': xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=1000,
            learning_rate=0.05,
            n_jobs=-1,
        ),
    }

    # Validation RMSE per model name, plus the running best.
    results = {}
    best_model_name = ""
    best_rmse = float('inf')

    print("Model Evaluation (RMSE on Log Prices):")
    print("-" * 40)

    for name, model in models.items():
        if name not in ['Linear Regression', 'Ridge', 'Lasso']:
            # Tree-based models are trained on the unscaled features.
            preds = model.fit(X_train, y_train).predict(X_val)
        else:
            # Linear models are trained on the standardised features.
            preds = model.fit(X_train_scaled, y_train).predict(X_val_scaled)

        rmse = np.sqrt(mean_squared_error(y_val, preds))
        results[name] = rmse
        print(f"{name}: {rmse:.4f}")

        # A strictly lower RMSE replaces the current best; ties keep the earlier model.
        if rmse < best_rmse:
            best_model_name, best_rmse = name, rmse

    print("-" * 40)
    print(f"Best Model: {best_model_name} with RMSE: {best_rmse:.4f}")

## 5. Train on Full Data and Submission

In [None]:
if 'best_model_name' in locals():
    import os

    final_model = models[best_model_name]

    print(f"Retraining {best_model_name} on full dataset...")

    # The linear models were validated on standardised features, so they are
    # refitted on standardised features too; the scaler is fitted on the full
    # training set. The other models use the features as they are.
    if best_model_name in ['Linear Regression', 'Ridge', 'Lasso']:
        scaler_full = StandardScaler()
        X_train_final = scaler_full.fit_transform(X_train_full)
        X_test_final = scaler_full.transform(X_test_full)
    else:
        scaler_full = None
        X_train_final = X_train_full
        X_test_final = X_test_full

    final_model.fit(X_train_final, y_train_full)
    final_predictions_log = final_model.predict(X_test_final)

    # The target was log1p-transformed, so expm1 recovers actual prices.
    final_predictions = np.expm1(final_predictions_log)

    # Submission CSV
    submission = pd.DataFrame({
        'Id': test_ids,
        'SalePrice': final_predictions
    })

    # exist_ok=True creates the directory if needed without a separate
    # existence check.
    os.makedirs('../outputs', exist_ok=True)

    submission_path = '../outputs/submission.csv'
    submission.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    # Save Model
    os.makedirs('../models', exist_ok=True)

    model_path = '../models/model.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(final_model, f)
    print(f"Model saved to {model_path}")

    # A linear model expects standardised inputs, so the pickled model alone
    # cannot reproduce these predictions. Save its fitted scaler next to it.
    if scaler_full is not None:
        scaler_path = '../models/scaler.pkl'
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler_full, f)
        print(f"Scaler saved to {scaler_path}")