In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, 
                           roc_auc_score, roc_curve, accuracy_score,
                           precision_recall_curve, f1_score)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: start from matplotlib's stock style, enlarge the default
# figure size, and switch seaborn to the "husl" colour palette.
plt.style.use('default')
plt.rc('figure', figsize=(12, 8))
sns.set_palette("husl")


In [2]:
def setup_project_structure():
    """Create the folder layout used by this ML project.

    All folders are created relative to the current working directory.
    Folders that already exist are left untouched, so the function can be
    called repeatedly.
    """
    project_dirs = (
        'data', 'data/raw', 'data/processed', 'data/external',
        'notebooks', 'src', 'models', 'reports', 'reports/figures', 'results',
    )

    for folder in map(Path, project_dirs):
        folder.mkdir(parents=True, exist_ok=True)

    print("✅ Project structure created successfully!")

def safe_save_file(data, filepath, **kwargs):
    """Save ``data`` to ``filepath``, creating parent directories first.

    Parameters
    ----------
    data : object
        Object to persist. Only objects exposing a ``to_csv`` method
        (e.g. pandas DataFrame / Series) are currently supported.
    filepath : str or Path
        Destination path.
    **kwargs
        Forwarded unchanged to ``data.to_csv``.

    Notes
    -----
    Unsupported objects are not written. In that case a warning is printed
    instead of the success message, so the caller is never told that a file
    was saved when nothing was written.
    """
    file_path = Path(filepath)
    file_path.parent.mkdir(parents=True, exist_ok=True)

    if not hasattr(data, 'to_csv'):
        # No writer is implemented for this type yet; report it honestly.
        print(f"⚠️  Not saved: {filepath} (unsupported data type: {type(data).__name__})")
        return

    data.to_csv(file_path, **kwargs)
    print(f"✅ Saved: {filepath}")

# Setup project structure
setup_project_structure()

# Load the housing dataset.
# The first existing path wins; if none exists, a reproducible synthetic
# dataset (seeded with 42) is generated so the notebook can still run.
possible_paths = [
    'data/raw/train.csv',
    'train.csv',
    '../data/train.csv'
]

train_data = None
try:
    for path in possible_paths:
        if os.path.exists(path):
            train_data = pd.read_csv(path)
            print(f"✅ Dataset loaded from: {path}")
            break

    if train_data is None:
        print("⚠️  Dataset not found. Creating sample data for demonstration...")
        # Create sample housing data
        np.random.seed(42)
        n_samples = 1000

        train_data = pd.DataFrame({
            'Id': range(1, n_samples + 1),
            'OverallQual': np.random.randint(1, 11, n_samples),
            'GrLivArea': np.random.randint(800, 3000, n_samples),
            'GarageCars': np.random.randint(0, 4, n_samples),
            'TotalBsmtSF': np.random.randint(0, 2000, n_samples),
            'YearBuilt': np.random.randint(1950, 2021, n_samples),
            'Neighborhood': np.random.choice(['A', 'B', 'C', 'D'], n_samples),
            'HouseStyle': np.random.choice(['1Story', '2Story', 'Split'], n_samples),
            'SalePrice': np.random.randint(50000, 500000, n_samples)
        })
        print("✅ Sample dataset created!")

    print(f"📊 Dataset shape: {train_data.shape}")
    print(f"📋 Columns: {list(train_data.columns)}")

except Exception as e:
    print(f" Error loading dataset: {e}")
    print("Please ensure you have the housing dataset in the correct location.")
    # Re-raise: continuing with train_data unset would only make the next
    # cell fail with a less helpful error.
    raise


✅ Project structure created successfully!
⚠️  Dataset not found. Creating sample data for demonstration...
✅ Sample dataset created!
📊 Dataset shape: (1000, 9)
📋 Columns: ['Id', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt', 'Neighborhood', 'HouseStyle', 'SalePrice']


In [3]:
# Section banner for the classification analysis.
print(" HOUSING PRICE CLASSIFICATION ANALYSIS", "=" * 60, sep="\n")

def create_price_categories(df, target_col='SalePrice'):
    """Convert the regression target into four quartile-based price classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    target_col : str, default 'SalePrice'
        Name of the numeric column to bin.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an extra integer column ``PriceCategory``:
        0 (<= Q1), 1 (Q1, Q2], 2 (Q2, Q3], 3 (> Q3).

    Raises
    ------
    ValueError
        If ``target_col`` contains missing values. A missing price compares
        false against every quartile and would otherwise be silently
        labelled as class 3 (High).
    """
    prices = df[target_col]
    if prices.isna().any():
        raise ValueError(
            f"Column '{target_col}' contains missing values; "
            "drop or impute them before creating price categories."
        )

    df_copy = df.copy()

    # Calculate quartiles for price categories
    q1, q2, q3 = prices.quantile([0.25, 0.50, 0.75]).tolist()

    # side='left' returns the first boundary index i with price <= bounds[i],
    # i.e. exactly the "<= q1 / <= q2 / <= q3 / else" ladder, vectorised.
    df_copy['PriceCategory'] = np.searchsorted(
        [q1, q2, q3], prices.to_numpy(), side='left'
    ).astype('int64')

    print("💰 Price Categories Created:")
    print(f"   🔵 Low (0): <= ${q1:,.0f}")
    print(f"   🟢 Medium-Low (1): ${q1:,.0f} - ${q2:,.0f}")
    print(f"   🟡 Medium-High (2): ${q2:,.0f} - ${q3:,.0f}")
    print(f"   🔴 High (3): > ${q3:,.0f}")

    return df_copy

# Create classification dataset
classification_data = create_price_categories(train_data)

# Check class distribution.
# Names are looked up by class label (the value_counts index), not by loop
# position, so the report stays correct even if a class has no samples.
print(f"\n📈 Class Distribution:")
categories = ['Low', 'Medium-Low', 'Medium-High', 'High']
class_counts = classification_data['PriceCategory'].value_counts().sort_index()
for class_label, count in class_counts.items():
    percentage = (count / len(classification_data)) * 100
    print(f"   {categories[class_label]}: {count} samples ({percentage:.1f}%)")

def preprocess_classification_data(df):
    """Impute missing values and integer-encode object columns.

    Steps, applied to a copy of ``df``:

    1. Numeric columns with missing values are filled with the column median.
    2. Object-dtype columns with missing values are filled with the column
       mode (first mode if several).
    3. Every object-dtype column is cast to ``str`` and encoded with a
       ``LabelEncoder`` (integer codes in sorted-label order).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.

    Returns
    -------
    (pandas.DataFrame, dict)
        The processed copy and a ``{column name: fitted LabelEncoder}`` map.
    """
    df_processed = df.copy()

    print(f"\n🔧 Preprocessing Data...")
    print(f"   Initial shape: {df_processed.shape}")

    # Handle missing values
    numerical_cols = df_processed.select_dtypes(include=[np.number]).columns
    categorical_cols = df_processed.select_dtypes(include=['object']).columns

    # Fill numerical missing values with median.
    # Assign the result back instead of fillna(inplace=True) on a column
    # selection: that chained in-place form is deprecated and has no effect
    # when pandas copy-on-write is active.
    for col in numerical_cols:
        if df_processed[col].isnull().any():
            median_val = df_processed[col].median()
            df_processed[col] = df_processed[col].fillna(median_val)
            print(f"   ✅ Filled {col} missing values with median: {median_val}")

    # Fill categorical missing values with mode
    for col in categorical_cols:
        if df_processed[col].isnull().any():
            modes = df_processed[col].mode()
            if modes.empty:
                # Column has no observed value at all: nothing to impute from.
                print(f"   ⚠️  {col} has no observed values; missing entries left as-is")
                continue
            mode_val = modes.iloc[0]
            df_processed[col] = df_processed[col].fillna(mode_val)
            print(f"   ✅ Filled {col} missing values with mode: {mode_val}")

    # Encode categorical variables
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df_processed[col] = le.fit_transform(df_processed[col].astype(str))
        label_encoders[col] = le
        print(f"   ✅ Encoded {col}: {len(le.classes_)} categories")

    print(f"   Final shape: {df_processed.shape}")
    return df_processed, label_encoders

# Run imputation + encoding on the labelled data
processed_data, encoders = preprocess_classification_data(classification_data)

# Features = everything except the raw price, the derived label and the row id
# (columns that are absent are simply skipped).
exclude_cols = ['SalePrice', 'PriceCategory', 'Id']
X = processed_data.drop(columns=exclude_cols, errors='ignore')
y = processed_data['PriceCategory']

print(
    f"\n📊 Model Input Summary:",
    f"   Features shape: {X.shape}",
    f"   Target shape: {y.shape}",
    f"   Feature columns: {list(X.columns)}",
    sep="\n",
)

 HOUSING PRICE CLASSIFICATION ANALYSIS
💰 Price Categories Created:
   🔵 Low (0): <= $167,382
   🟢 Medium-Low (1): $167,382 - $276,102
   🟡 Medium-High (2): $276,102 - $386,782
   🔴 High (3): > $386,782

📈 Class Distribution:
   Low: 250 samples (25.0%)
   Medium-Low: 250 samples (25.0%)
   Medium-High: 250 samples (25.0%)
   High: 250 samples (25.0%)

🔧 Preprocessing Data...
   Initial shape: (1000, 10)
   ✅ Encoded Neighborhood: 4 categories
   ✅ Encoded HouseStyle: 3 categories
   Final shape: (1000, 10)

📊 Model Input Summary:
   Features shape: (1000, 7)
   Target shape: (1000,)
   Feature columns: ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt', 'Neighborhood', 'HouseStyle']


In [4]:
from sklearn.utils.class_weight import compute_class_weight

# Split the data with stratification so each split keeps the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("📊 Data Split Summary:")
print(f"   Training set: {X_train.shape[0]} samples")
print(f"   Testing set: {X_test.shape[0]} samples")

# Check class distribution in splits.
# Names are looked up by class label (the value_counts index), not by loop
# position, so a missing class cannot shift the labels.
print(f"\n📈 Training Set Class Distribution:")
categories = ['Low', 'Medium-Low', 'Medium-High', 'High']
train_dist = y_train.value_counts().sort_index()
for class_label, count in train_dist.items():
    percentage = (count / len(y_train)) * 100
    print(f"   {categories[class_label]}: {count} ({percentage:.1f}%)")

# Standardise features: statistics are learned on the training split only
# and then applied to both splits.
# NOTE(review): scaler and selector are fitted on the whole training split
# before any later cross-validation sees it; wrapping them in a Pipeline
# would keep CV folds independent — confirm whether that matters here.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n🔧 Feature Scaling Applied:")
first_feature_scaled = X_train_scaled[:, 0]
print(f"   Mean of first feature after scaling: {first_feature_scaled.mean():.3f}")
print(f"   Std of first feature after scaling: {first_feature_scaled.std():.3f}")

# Univariate feature selection (ANOVA F-test), keeping at most 15 features
print(f"\n🎯 Feature Selection:")
feature_selector = SelectKBest(score_func=f_classif, k=min(15, X_train.shape[1]))
feature_selector.fit(X_train_scaled, y_train)
X_train_selected = feature_selector.transform(X_train_scaled)
X_test_selected = feature_selector.transform(X_test_scaled)

# Boolean mask of the kept columns, reused for both names and scores
support_mask = feature_selector.get_support()
selected_features = X.columns[support_mask]
print(f"   Selected {X_train_selected.shape[1]} best features:")
for feature, score in zip(selected_features, feature_selector.scores_[support_mask]):
    print(f"   • {feature}: {score:.2f}")

# "Balanced" class weights for the training labels, keyed by class label
class_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
class_weight_dict = dict(zip(class_labels, class_weights))

print(f"\n⚖️  Class Weights (for balanced training):")
categories = ['Low', 'Medium-Low', 'Medium-High', 'High']
for class_label, weight in class_weight_dict.items():
    print(f"   {categories[class_label]}: {weight:.3f}")


📊 Data Split Summary:
   Training set: 800 samples
   Testing set: 200 samples

📈 Training Set Class Distribution:
   Low: 200 (25.0%)
   Medium-Low: 200 (25.0%)
   Medium-High: 200 (25.0%)
   High: 200 (25.0%)

🔧 Feature Scaling Applied:
   Mean of first feature after scaling: 0.000
   Std of first feature after scaling: 1.000

🎯 Feature Selection:
   Selected 7 best features:
   • OverallQual: 1.37
   • GrLivArea: 0.96
   • GarageCars: 0.19
   • TotalBsmtSF: 1.96
   • YearBuilt: 1.30
   • Neighborhood: 3.02
   • HouseStyle: 1.01

⚖️  Class Weights (for balanced training):
   Low: 1.000
   Medium-Low: 1.000
   Medium-High: 1.000
   High: 1.000


In [5]:
print(f"\n{'=' * 60}")
print("🧠 BAYESIAN LOGISTIC REGRESSION")
print("=" * 60)

# "Bayesian" here means L2-regularised logistic regression: the L2 penalty
# corresponds to a Gaussian prior on the coefficients (a point estimate,
# not a full posterior).

print(f"\n🔍 Hyperparameter Tuning for Logistic Regression...")

# Search space: only C and class_weight actually vary; the other entries
# are single-valued and just pin the configuration.
log_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # inverse regularization strength
    'penalty': ['l2'],  # Gaussian-prior-like penalty
    'solver': ['lbfgs'],
    'max_iter': [1000],  # generous cap so the solver converges
    'class_weight': [None, 'balanced'],
}

# 5-fold grid search scored with macro-F1.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# verify against the installed version.
log_grid_search = GridSearchCV(
    estimator=LogisticRegression(random_state=42, multi_class='ovr'),
    param_grid=log_param_grid,
    cv=5,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=0,
)

print("   🏃‍♂️ Training models with different hyperparameters...")
log_grid_search.fit(X_train_selected, y_train)

# The search refits the winning configuration on the full training split
best_bayesian_lr = log_grid_search.best_estimator_

print(f"✅ Best Parameters Found:")
for param, value in log_grid_search.best_params_.items():
    print(f"   • {param}: {value}")

print(f"📊 Best Cross-Validation Score: {log_grid_search.best_score_:.4f}")

# Held-out predictions (hard labels and class probabilities)
y_pred_lr = best_bayesian_lr.predict(X_test_selected)
y_pred_proba_lr = best_bayesian_lr.predict_proba(X_test_selected)

# Metrics. Note that lr_cv_score is cross-validated *accuracy*, whereas the
# grid search's best_score_ above is macro-F1.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr, average='macro')
lr_cv_scores = cross_val_score(
    best_bayesian_lr, X_train_selected, y_train, cv=5, scoring='accuracy'
)
lr_cv_score = lr_cv_scores.mean()

print(f"\n📈 Bayesian Logistic Regression Results:")
print(f"   • Test Accuracy: {lr_accuracy:.4f}")
print(f"   • F1-Score (Macro): {lr_f1:.4f}")
print(f"   • Cross-Validation Score: {lr_cv_score:.4f}")

print(f"\n📋 Detailed Classification Report:")
lr_report = classification_report(
    y_test, y_pred_lr,
    target_names=['Low', 'Medium-Low', 'Medium-High', 'High'],
)
print(lr_report)

# Feature importance: mean absolute coefficient across the per-class models
feature_importance_lr = np.abs(best_bayesian_lr.coef_).mean(axis=0)
feature_importance_df = pd.DataFrame(
    {'Feature': selected_features, 'Importance': feature_importance_lr}
).sort_values('Importance', ascending=False)

print(f"\n🎯 Top 5 Most Important Features:")
top_features = feature_importance_df.head()
for feature_name, importance in zip(top_features['Feature'], top_features['Importance']):
    print(f"   • {feature_name}: {importance:.4f}")



🧠 BAYESIAN LOGISTIC REGRESSION

🔍 Hyperparameter Tuning for Logistic Regression...
   🏃‍♂️ Training models with different hyperparameters...
✅ Best Parameters Found:
   • C: 0.001
   • class_weight: balanced
   • max_iter: 1000
   • penalty: l2
   • solver: lbfgs
📊 Best Cross-Validation Score: 0.2618

📈 Bayesian Logistic Regression Results:
   • Test Accuracy: 0.2850
   • F1-Score (Macro): 0.2848
   • Cross-Validation Score: 0.2637

📋 Detailed Classification Report:
              precision    recall  f1-score   support

         Low       0.21      0.26      0.23        50
  Medium-Low       0.37      0.40      0.38        50
 Medium-High       0.31      0.26      0.28        50
        High       0.27      0.22      0.24        50

    accuracy                           0.28       200
   macro avg       0.29      0.29      0.28       200
weighted avg       0.29      0.28      0.28       200


🎯 Top 5 Most Important Features:
   • Neighborhood: 0.0196
   • TotalBsmtSF: 0.0173
   • Yea

In [None]:
print("\n" + "="*60)
print("🤖 SUPPORT VECTOR MACHINE CLASSIFICATION")
print("="*60)

print(f"\n🔍 Hyperparameter Tuning for SVM...")

# Define parameter grid for SVM.
# A list of grids is used so that `gamma` is only varied for kernels that
# use it: the linear kernel ignores gamma, so crossing it with six gamma
# values would just repeat identical fits.
svm_param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],  # Regularization parameter
        'class_weight': [None, 'balanced'],  # Handle class imbalance
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [0.1, 1, 10, 100],  # Regularization parameter
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Kernel coefficient
        'class_weight': [None, 'balanced'],  # Handle class imbalance
    },
]

# Grid search with cross-validation (reduced CV for faster execution).
# probability is left off during the search: model selection only needs
# hard predictions, and probability=True adds an extra internal
# cross-validation to every single fit.
svm_grid_search = GridSearchCV(
    SVC(random_state=42),
    svm_param_grid,
    cv=3,  # Reduced CV folds for faster execution
    scoring='f1_macro',
    n_jobs=-1,
    verbose=0
)

print("   🏃‍♂️ Training SVM models (this may take a few minutes)...")
svm_grid_search.fit(X_train_selected, y_train)

# Best SVM model: refit the winning configuration once with probability
# estimates enabled so predict_proba is available (e.g. for ROC curves).
best_svm = SVC(**svm_grid_search.best_params_, probability=True, random_state=42)
best_svm.fit(X_train_selected, y_train)

print(f"✅ Best SVM Parameters Found:")
for param, value in svm_grid_search.best_params_.items():
    print(f"   • {param}: {value}")

print(f"📊 Best Cross-Validation Score: {svm_grid_search.best_score_:.4f}")

# Evaluate the best SVM model on the held-out split
y_pred_svm = best_svm.predict(X_test_selected)
y_pred_proba_svm = best_svm.predict_proba(X_test_selected)

# Calculate metrics.
# The accuracy CV uses the probability-free estimator from the search:
# it has the same hyperparameters and accuracy only needs predict().
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_f1 = f1_score(y_test, y_pred_svm, average='macro')
svm_cv_score = cross_val_score(
    svm_grid_search.best_estimator_, X_train_selected, y_train,
    cv=5, scoring='accuracy'
).mean()

print(f"\n📈 Support Vector Machine Results:")
print(f"   • Test Accuracy: {svm_accuracy:.4f}")
print(f"   • F1-Score (Macro): {svm_f1:.4f}")
print(f"   • Cross-Validation Score: {svm_cv_score:.4f}")

print(f"\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_pred_svm,
                          target_names=['Low', 'Medium-Low', 'Medium-High', 'High']))

print(f"\n🔧 SVM Model Details:")
print(f"   • Kernel: {best_svm.kernel}")
print(f"   • Number of Support Vectors: {best_svm.n_support_}")
print(f"   • Support Vector Ratio: {best_svm.n_support_.sum() / len(X_train_selected):.3f}")



🤖 SUPPORT VECTOR MACHINE CLASSIFICATION

🔍 Hyperparameter Tuning for SVM...
   🏃‍♂️ Training SVM models (this may take a few minutes)...


In [None]:
print("\n" + "="*60)
print("📊 MODEL COMPARISON SUMMARY")
print("="*60)

# Compile results: one row per model, using the metrics computed in the
# logistic-regression and SVM cells.
models_results = {
    'Model': ['Bayesian Logistic Regression', 'Support Vector Machine'],
    'Test_Accuracy': [lr_accuracy, svm_accuracy],
    'F1_Score_Macro': [lr_f1, svm_f1],
    'CV_Score': [lr_cv_score, svm_cv_score],
    'Best_Parameters': [
        str(log_grid_search.best_params_),
        str(svm_grid_search.best_params_)
    ]
}

comparison_df = pd.DataFrame(models_results)

print(f"🏆 Performance Comparison:")
print(comparison_df[['Model', 'Test_Accuracy', 'F1_Score_Macro', 'CV_Score']].to_string(index=False))

# Find best model by test macro-F1.
# idxmax() returns an index *label*, so the row is fetched with .loc
# (label-based) rather than .iloc (position-based).
best_model_idx = comparison_df['F1_Score_Macro'].idxmax()
best_model_row = comparison_df.loc[best_model_idx]
best_model_name = best_model_row['Model']
best_f1_score = best_model_row['F1_Score_Macro']

print(f"\n🥇 Best Performing Model: {best_model_name}")
print(f"   📈 F1-Score: {best_f1_score:.4f}")

# Save results
safe_save_file(comparison_df, 'data/processed/classification_model_comparison.csv', index=False)

# Create detailed results dictionary for plotting
results_dict = {
    'Bayesian_LR': {
        'model': best_bayesian_lr,
        'predictions': y_pred_lr,
        'probabilities': y_pred_proba_lr,
        'accuracy': lr_accuracy,
        'f1': lr_f1
    },
    'SVM': {
        'model': best_svm,
        'predictions': y_pred_svm,
        'probabilities': y_pred_proba_svm,
        'accuracy': svm_accuracy,
        'f1': svm_f1
    }
}

print(f"\n✅ Results saved to: data/processed/classification_model_comparison.csv")
print(f"📁 Models trained and ready for visualization!")