# Machine Learning Regression Comparison

This notebook implements and compares different regression algorithms on housing price data:
1. Random Forest Regressor
2. XGBoost
3. CatBoost
4. Artificial Neural Network (Keras)
5. Linear Regression

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings('ignore')

# Plot styling applied to every figure created after this cell:
# seaborn's "whitegrid" theme and a 12x6 default figure size.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

# Reaching this line means every import in this cell succeeded.
print("✓ All libraries imported successfully!")

ModuleNotFoundError: No module named 'tensorflow'

## 2. Load and Prepare Data

In [None]:
# Load the datasets (paths are relative to the notebook's working directory)
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

# Separate target variable
y = train_df['SalePrice']

# Select numerical features excluding the row identifier ('Id') and the target ('SalePrice')
numerical_features = [
    feature
    for feature in train_df.select_dtypes(include=np.number).columns
    if feature not in ('Id', 'SalePrice')
]

# Prepare feature sets; .copy() keeps the imputation below from touching the raw frames
X = train_df[numerical_features].copy()
X_test_full = test_df[numerical_features].copy()

# Handle missing values - fill every column with its training-set mean.
# The test frame is filled for ALL columns, not only those that have gaps in
# the training data: a column can be complete in train.csv yet contain NaNs
# in test.csv, and those would otherwise survive into prediction.
# NOTE: the means are computed over all training rows before the
# train/validation split performed later in the notebook, so validation rows
# contribute to the imputation values.
train_means = X.mean()
X = X.fillna(train_means)
X_test_full = X_test_full.fillna(train_means)

print(f"\nFeatures selected: {len(numerical_features)}")

## 3. Split and Scale Data

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both the training and the validation features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")

## 4. Train Models

### 4.1 Random Forest Regressor

In [None]:
print("Training Random Forest Regressor...")

# 200 depth-limited trees; n_jobs=-1 uses all available cores and
# random_state pins the bootstrap/feature sampling for repeatable results.
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

# Fit on the unscaled training features
rf_model.fit(X_train, y_train)

# Predictions on both splits so train and validation error can be compared
rf_pred_train = rf_model.predict(X_train)
rf_pred_val = rf_model.predict(X_val)

# Calculate metrics (RMSE is in the target's units; R² is unitless)
rf_train_rmse = np.sqrt(mean_squared_error(y_train, rf_pred_train))
rf_val_rmse = np.sqrt(mean_squared_error(y_val, rf_pred_val))
rf_train_r2 = r2_score(y_train, rf_pred_train)
rf_val_r2 = r2_score(y_val, rf_pred_val)

print("\n✓ Random Forest Results:")
print(f"  Train RMSE: ${rf_train_rmse:,.2f}")
print(f"  Val RMSE: ${rf_val_rmse:,.2f}")
print(f"  Train R²: {rf_train_r2:.4f}")
print(f"  Val R²: {rf_val_r2:.4f}")

### 4.2 XGBoost

In [None]:
print("Training XGBoost...")

# Gradient-boosted trees with row and column subsampling (80% each);
# random_state pins that sampling so repeated runs agree.
xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=1
)

# The validation split is passed as eval_set; no early stopping is configured
# in this call, so all 200 rounds are trained. verbose=False silences the
# per-round evaluation log.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Predictions on both splits so train and validation error can be compared
xgb_pred_train = xgb_model.predict(X_train)
xgb_pred_val = xgb_model.predict(X_val)

# Calculate metrics (RMSE is in the target's units; R² is unitless)
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, xgb_pred_train))
xgb_val_rmse = np.sqrt(mean_squared_error(y_val, xgb_pred_val))
xgb_train_r2 = r2_score(y_train, xgb_pred_train)
xgb_val_r2 = r2_score(y_val, xgb_pred_val)

print("\n✓ XGBoost Results:")
print(f"  Train RMSE: ${xgb_train_rmse:,.2f}")
print(f"  Val RMSE: ${xgb_val_rmse:,.2f}")
print(f"  Train R²: {xgb_train_r2:.4f}")
print(f"  Val R²: {xgb_val_r2:.4f}")

### 4.3 CatBoost

In [None]:
print("Training CatBoost...")

# Up to 500 boosting iterations at a small learning rate; RMSE is the metric
# tracked on the evaluation set. Logging is silenced in fit() below.
cb_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric='RMSE',
    random_seed=42
)

# Training stops once the eval-set metric has not improved for 50 rounds.
# NOTE(review): the same validation split drives early stopping here and is
# used for the reported metrics below, so this model's validation scores may
# be slightly optimistic compared with the models that do not early-stop.
cb_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
    verbose=False
)

# Predictions on both splits so train and validation error can be compared
cb_pred_train = cb_model.predict(X_train)
cb_pred_val = cb_model.predict(X_val)

# Calculate metrics (RMSE is in the target's units; R² is unitless)
cb_train_rmse = np.sqrt(mean_squared_error(y_train, cb_pred_train))
cb_val_rmse = np.sqrt(mean_squared_error(y_val, cb_pred_val))
cb_train_r2 = r2_score(y_train, cb_pred_train)
cb_val_r2 = r2_score(y_val, cb_pred_val)

print("\n✓ CatBoost Results:")
print(f"  Train RMSE: ${cb_train_rmse:,.2f}")
print(f"  Val RMSE: ${cb_val_rmse:,.2f}")
print(f"  Train R²: {cb_train_r2:.4f}")
print(f"  Val R²: {cb_val_r2:.4f}")

### 4.4 Artificial Neural Network (Keras)

In [None]:
print("Training Keras ANN...")

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation, dropout masks and batch shuffling repeat between runs,
# matching the fixed seeds used by the other models in this notebook.
tf.keras.utils.set_random_seed(42)

# Build model: three ReLU hidden layers (128 -> 64 -> 32) with dropout after
# the first two, and a single linear output unit for the regression target
ann_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

ann_model.compile(optimizer=Adam(learning_rate=0.01), loss='mean_squared_error')

# Train on the standardized features; the target is left in its original units
history = ann_model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    verbose=0
)

# Predictions: predict() returns shape (n, 1), so flatten to 1-D for the
# metric functions. verbose=0 keeps prediction as quiet as training.
ann_pred_train = ann_model.predict(X_train_scaled, verbose=0).flatten()
ann_pred_val = ann_model.predict(X_val_scaled, verbose=0).flatten()

# Calculate metrics (RMSE is in the target's units; R² is unitless)
ann_train_rmse = np.sqrt(mean_squared_error(y_train, ann_pred_train))
ann_val_rmse = np.sqrt(mean_squared_error(y_val, ann_pred_val))
ann_train_r2 = r2_score(y_train, ann_pred_train)
ann_val_r2 = r2_score(y_val, ann_pred_val)

print("\n✓ Keras ANN Results:")
print(f"  Train RMSE: ${ann_train_rmse:,.2f}")
print(f"  Val RMSE: ${ann_val_rmse:,.2f}")
print(f"  Train R²: {ann_train_r2:.4f}")
print(f"  Val R²: {ann_val_r2:.4f}")

### 4.5 Linear Regression

In [None]:
print("Training Linear Regression...")

# Ordinary least squares with default settings, fit on the unscaled features
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predictions on both splits so train and validation error can be compared
lr_pred_train = lr_model.predict(X_train)
lr_pred_val = lr_model.predict(X_val)

# Calculate metrics (RMSE is in the target's units; R² is unitless)
lr_train_rmse = np.sqrt(mean_squared_error(y_train, lr_pred_train))
lr_val_rmse = np.sqrt(mean_squared_error(y_val, lr_pred_val))
lr_train_r2 = r2_score(y_train, lr_pred_train)
lr_val_r2 = r2_score(y_val, lr_pred_val)

print("\n✓ Linear Regression Results:")
print(f"  Train RMSE: ${lr_train_rmse:,.2f}")
print(f"  Val RMSE: ${lr_val_rmse:,.2f}")
print(f"  Train R²: {lr_train_r2:.4f}")
print(f"  Val R²: {lr_val_r2:.4f}")

## 5. Model Comparison

In [None]:
# Create comparison DataFrame.
# One tuple per model: display name, train/val predictions (needed for MAE),
# and the RMSE / R² values already computed in the training cells.
model_results = [
    ('Random Forest', rf_pred_train, rf_pred_val,
     rf_train_rmse, rf_val_rmse, rf_train_r2, rf_val_r2),
    ('XGBoost', xgb_pred_train, xgb_pred_val,
     xgb_train_rmse, xgb_val_rmse, xgb_train_r2, xgb_val_r2),
    ('CatBoost', cb_pred_train, cb_pred_val,
     cb_train_rmse, cb_val_rmse, cb_train_r2, cb_val_r2),
    ('Keras ANN', ann_pred_train, ann_pred_val,
     ann_train_rmse, ann_val_rmse, ann_train_r2, ann_val_r2),
    ('Linear Regression', lr_pred_train, lr_pred_val,
     lr_train_rmse, lr_val_rmse, lr_train_r2, lr_val_r2),
]

# Build every row the same way so the columns cannot drift apart between models
comparison_data = [
    {
        'Model': name,
        'Train RMSE': train_rmse,
        'Val RMSE': val_rmse,
        'Train MAE': mean_absolute_error(y_train, pred_train),
        'Val MAE': mean_absolute_error(y_val, pred_val),
        'Train R2': train_r2,
        'Val R2': val_r2,
    }
    for name, pred_train, pred_val, train_rmse, val_rmse, train_r2, val_r2
    in model_results
]

# Best (lowest validation RMSE) model first
df_comparison = pd.DataFrame(comparison_data)
df_comparison = df_comparison.sort_values('Val RMSE')

print("\n" + "="*80)
print("MODEL COMPARISON RESULTS")
print("="*80)
print(df_comparison.to_string(index=False))

# Find best model. idxmin() returns an index *label*, so the lookups below
# use .loc; this stays correct even though sorting reordered the rows.
best_model_idx = df_comparison['Val RMSE'].idxmin()
best_model = df_comparison.loc[best_model_idx, 'Model']
best_rmse = df_comparison.loc[best_model_idx, 'Val RMSE']
best_r2 = df_comparison.loc[best_model_idx, 'Val R2']

print("\n" + "="*80)
print(f"🏆 BEST MODEL: {best_model}")
print(f"   Validation RMSE: ${best_rmse:,.2f}")
print(f"   Validation R² Score: {best_r2:.4f}")
print("="*80)

## 6. Visualize Model Performance

In [None]:
# Plot 1: one bar per model showing its validation RMSE
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Model', y='Val RMSE', data=df_comparison, palette='viridis', ax=ax)
ax.set_title('Validation RMSE (Lower is Better)')
ax.set_xlabel('Model')
ax.set_ylabel('RMSE ($)')
# Tilt the model names so longer labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

## 7. Feature Importance Analysis

In [None]:
# Report the ten highest-ranked features for each tree-based model
print("\n" + "="*80)
print("TOP 10 MOST IMPORTANT FEATURES")
print("="*80)

# Only the models that expose `feature_importances_` are listed here
models_dict = {
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'CatBoost': cb_model
}

for model_name, model in models_dict.items():
    # Pair each feature name with its score, then keep the ten largest
    importance_df = (
        pd.DataFrame({
            'Feature': numerical_features,
            'Importance': model.feature_importances_
        })
        .sort_values('Importance', ascending=False)
        .head(10)
    )

    print(f"\n{model_name}:")
    for feature, score in zip(importance_df['Feature'], importance_df['Importance']):
        print(f"  {feature:30s}: {score:.4f}")

## 8. Summary Table

In [None]:
# Human-readable copy of the comparison table: error columns rendered as
# dollar amounts, R2 columns rounded to four decimals. The numeric
# df_comparison frame itself is left untouched.
summary_df = df_comparison.copy()

for col in ('Val RMSE', 'Train RMSE', 'Val MAE', 'Train MAE'):
    summary_df[col] = summary_df[col].map(lambda value: f"${value:,.2f}")

for col in ('Val R2', 'Train R2'):
    summary_df[col] = summary_df[col].map(lambda value: f"{value:.4f}")

print(summary_df)
