In [None]:
# OOP Based EDA & ML Pipeline for Housing Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lazypredict.Supervised import LazyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestRegressor

# ======================== #
# 1. Data Loading & Overview
# ======================== #
class DataLoader:
    """Load a CSV file into a DataFrame and expose quick-look helpers.

    ``df`` is ``None`` until :meth:`load_data` has been called; the other
    methods read ``self.df`` directly and therefore need it to be loaded.
    """

    def __init__(self, file_path):
        self.df = None
        self.file_path = file_path

    def load_data(self):
        """Read ``self.file_path`` with ``pd.read_csv``, keep it on ``self.df`` and return it."""
        frame = pd.read_csv(self.file_path)
        self.df = frame
        print("Data Loaded Successfully!\n")
        return frame

    def show_info(self):
        """Print a header followed by ``DataFrame.info()``; returns that call's result."""
        print("\nDataset Info:")
        report = self.df.info()
        return report

    def show_head(self, n=5):
        """Print a header and return the first ``n`` rows."""
        print(f"\nFirst {n} Rows:")
        leading_rows = self.df.head(n)
        return leading_rows

    def show_tail(self, n=5):
        """Print a header and return the last ``n`` rows."""
        print(f"\nLast {n} Rows:")
        trailing_rows = self.df.tail(n)
        return trailing_rows

    def describe_data(self):
        """Print a header and return ``DataFrame.describe()``."""
        print("\nStatistical Summary:")
        summary = self.df.describe()
        return summary


In [None]:
# ======================== #
# 2. Generalized Data Cleaner
# ======================== #

class DataCleaner:
    """Generic cleaning steps that operate on ``self.df``.

    Every step prints a short progress report and returns the current
    ``self.df`` so calls can be written as ``df = cleaner.step()``.

    Aliasing notes:
      * ``fill_missing``, ``remove_duplicates`` and ``smooth_noisy_data``
        modify the frame currently held in ``self.df`` (initially the very
        object passed to the constructor).
      * ``remove_outliers`` rebinds ``self.df`` to a new, filtered copy.
    """

    def __init__(self, df: pd.DataFrame):
        self.df = df

    # Check Missing Values
    def check_missing(self):
        """Print the columns that contain missing values and return the
        per-column missing count (a Series covering *all* columns)."""
        print("\nMissing Values in Each Column:")
        missing = self.df.isnull().sum()
        print(missing[missing > 0] if missing.sum() > 0 else "No missing values found.")
        return missing

    # Fill Missing Values (auto handle both numeric + categorical)
    def fill_missing(self):
        """Fill gaps column by column: median for numeric columns, most
        frequent value (first mode) for everything else.

        Columns with no observed value at all have nothing to derive a fill
        value from; they are reported and left untouched instead of raising.

        Returns:
            The (modified) ``self.df``.
        """
        print("\nHandling missing values automatically...")

        unfilled = []
        for col in self.df.columns:
            series = self.df[col]
            if not series.isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(series):
                label = "median"
                fill_value = series.median()
            else:
                label = "mode"
                modes = series.mode()
                fill_value = modes.iloc[0] if not modes.empty else None

            if fill_value is None or pd.isna(fill_value):
                unfilled.append(col)
                print(f"   • '{col}' → skipped (no non-missing values to compute a {label} from)")
                continue

            # Assign the filled column back instead of calling
            # ``self.df[col].fillna(..., inplace=True)``: the latter works on
            # a temporary selection and does not reach the frame under
            # pandas Copy-on-Write.
            self.df[col] = series.fillna(fill_value)
            if label == "median":
                print(f"   • '{col}' → filled with median ({fill_value:.2f})")
            else:
                print(f"   • '{col}' → filled with mode ('{fill_value}')")

        if unfilled:
            print(f"Columns still containing missing values: {unfilled}")
        else:
            print("All missing values handled successfully.")
        return self.df

    # Remove Duplicate Rows
    def remove_duplicates(self):
        """Drop fully duplicated rows in place and report how many were removed."""
        before = len(self.df)
        # Kept in-place so a caller holding a reference to the same frame
        # continues to see the de-duplicated data.
        self.df.drop_duplicates(inplace=True)
        after = len(self.df)
        print(f"\nRemoved {before - after} duplicate rows.")
        return self.df

    # Remove Outliers (IQR Method)
    def remove_outliers(self):
        """Drop rows lying outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

        The filter is applied to each numeric column in turn, and the
        quartiles of a column are computed on the rows that survived the
        previous columns. Rows whose value is missing in a column are kept:
        a missing value is not an outlier, and comparing NaN against the
        bounds would otherwise silently discard the row (or the entire frame
        for an all-NaN column).

        Returns:
            The filtered frame, stored on ``self.df`` as an independent copy.
        """
        print("\nRemoving outliers using IQR method...")
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns

        frame = self.df
        for col in numeric_cols:
            values = frame[col]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr

            keep = ((values >= lower) & (values <= upper)) | values.isna()

            before = len(frame)
            frame = frame[keep]
            after = len(frame)
            if before != after:
                print(f"   • '{col}' → Removed {before - after} outliers.")

        if frame is not self.df:
            # Detach from the source frame so later column assignments
            # (e.g. smoothing) act on a real copy rather than a slice.
            self.df = frame.copy()

        print("Outlier removal complete.")
        return self.df

    # Smooth Noisy Data (Rolling Mean)
    def smooth_noisy_data(self, window=3):
        """Replace every numeric column by its rolling mean over ``window``
        consecutive rows (current row order), using ``min_periods=1``.

        NOTE(review): this treats neighbouring rows as related observations
        and also rewrites the prediction target if it is numeric — confirm
        that is intended for the dataset at hand.
        """
        print(f"\nApplying rolling mean smoothing (window={window})...")
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            self.df[col] = self.df[col].rolling(window=window, min_periods=1).mean()
        print("Noise reduction applied successfully.")
        return self.df


In [None]:
# ======================== #
# 3. Data Analysis
# ======================== #
class DataAnalyzer:
    """Correlation-based summaries of the numeric columns of ``df``."""

    def __init__(self, df):
        self.df = df

    def correlation_matrix(self):
        """Print and return the pairwise correlation matrix of the numeric columns."""
        print("\nCorrelation Matrix:")
        numeric_df = self.df.select_dtypes(include=[np.number])
        corr = numeric_df.corr()
        print(corr)
        return corr

    def highest_correlations(self, target, top_n=5):
        """Rank the numeric features by absolute correlation with ``target``.

        Args:
            target: Name of a numeric column to correlate against.
            top_n: Number of features to report.

        Returns:
            A Series of absolute correlations (strongest first) with at most
            ``top_n`` entries, or ``None`` when ``target`` is not a numeric
            column. The target itself is excluded: its self-correlation is
            always 1 and would otherwise take the first slot of the ranking.
        """
        numeric_df = self.df.select_dtypes(include=[np.number])
        if target not in numeric_df.columns:
            print(f"'{target}' is not numeric! Convert or encode it before correlation.")
            return None

        strength = (
            numeric_df.corr()[target]
            .drop(labels=target)
            .abs()
            .sort_values(ascending=False)
        )
        top = strength.head(top_n)
        print(f"\nTop {top_n} correlated features with '{target}':")
        print(top)
        return top



In [None]:
# ======================== #
# 4. Data Visualization
# ======================== #
class DataVisualizer:
    """Matplotlib / seaborn charts for the frame held in ``self.df``.

    Several methods address columns by fixed name (``ocean_proximity``,
    ``median_income``, ``median_house_value``, ``longitude``, ``latitude``)
    and therefore need those columns to be present.
    """

    def __init__(self, df):
        self.df = df

    @staticmethod
    def _label_and_show(title, xlabel=None, ylabel=None):
        """Title the current axes, optionally label both axes, then display the figure."""
        plt.title(title)
        if xlabel is not None:
            plt.xlabel(xlabel)
        if ylabel is not None:
            plt.ylabel(ylabel)
        plt.show()

    def plot_distributions(self):
        """Histogram grid produced by ``DataFrame.hist`` with 20 bins."""
        self.df.hist(bins=20, figsize=(12, 8))
        plt.suptitle("Feature Distributions", fontsize=16)
        plt.show()

    def correlation_heatmap(self):
        """Annotated heatmap of the numeric columns' correlation matrix."""
        plt.figure(figsize=(10, 6))
        corr = self.df.select_dtypes(include='number').corr()
        sns.heatmap(corr, annot=True, cmap="coolwarm")
        self._label_and_show("Correlation Heatmap")

    def ocean_proximity_count(self):
        """Row count per ``ocean_proximity`` value."""
        plt.figure(figsize=(7, 5))
        sns.countplot(data=self.df, x='ocean_proximity')
        self._label_and_show("Ocean Proximity Count")

    def income_vs_value(self):
        """Scatter of ``median_income`` against ``median_house_value``."""
        plt.figure(figsize=(8,6))
        sns.scatterplot(data=self.df, x='median_income', y='median_house_value', alpha=0.5)
        self._label_and_show(
            "Income vs House Value",
            xlabel="Median Income",
            ylabel="Median House Value",
        )

    def boxplot_features(self):
        """Side-by-side boxplots of every numeric column."""
        numeric_part = self.df.select_dtypes(include='number')
        plt.figure(figsize=(15,8))
        numeric_part.boxplot()
        plt.title("Boxplot of Numeric Features (Outlier Detection)")
        plt.xticks(rotation=45)
        plt.show()

    def geo_distribution(self):
        """Longitude/latitude scatter coloured by ``median_house_value``."""
        plt.figure(figsize=(8,6))
        sns.scatterplot(
            data=self.df,
            x='longitude',
            y='latitude',
            hue='median_house_value',
            palette='coolwarm',
            alpha=0.6,
        )
        self._label_and_show("Geographical Distribution of House Prices")

    def avg_price_per_ocean(self):
        """Bar chart of mean ``median_house_value`` per ``ocean_proximity``, ascending."""
        per_group = self.df.groupby('ocean_proximity')['median_house_value']
        avg_prices = per_group.mean().sort_values()
        plt.figure(figsize=(7,5))
        sns.barplot(x=avg_prices.index, y=avg_prices.values, palette='viridis')
        self._label_and_show(
            "Average House Value by Ocean Proximity",
            xlabel="Ocean Proximity",
            ylabel="Average House Value",
        )


In [None]:
# ======================== #
# 5. Data Preprocessing
# ======================== #
class DataPreprocessor:
    """Encode text/categorical features and split into train and test sets.

    The constructor works on a private copy of ``df``, so encoding never
    alters the caller's frame.
    """

    def __init__(self, df):
        self.df = df.copy()
        # Fitted LabelEncoder per encoded column, kept so the integer codes
        # can be mapped back to the original labels later.
        self.encoders = {}

    def _categorical_columns(self, target):
        """Names of the feature columns (everything but ``target``) whose
        dtype is object, string or categorical."""

        def is_categorical(series):
            dtype = series.dtype
            return (
                isinstance(dtype, pd.CategoricalDtype)
                or pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
            )

        return [
            col for col in self.df.columns
            if col != target and is_categorical(self.df[col])
        ]

    def encode_and_split(self, target, test_size=0.2, random_state=42):
        """Label-encode every text/categorical feature, then split.

        Previously only the column literally named ``ocean_proximity`` was
        encoded, which raised ``KeyError`` on frames without it and left any
        other text column unencoded. For a frame whose only text feature is
        ``ocean_proximity`` the result is unchanged.

        Args:
            target: Name of the column to predict; never encoded.
            test_size: Passed to ``train_test_split`` (default 0.2, as before).
            random_state: Seed passed to ``train_test_split`` (default 42, as before).

        Returns:
            ``(X_train, X_test, y_train, y_test)``.

        Raises:
            KeyError: If ``target`` is not a column of the frame.
        """
        print("\nEncoding categorical features...")
        if target not in self.df.columns:
            raise KeyError(f"Target column '{target}' not found in the DataFrame.")

        for col in self._categorical_columns(target):
            encoder = LabelEncoder()
            self.df[col] = encoder.fit_transform(self.df[col])
            self.encoders[col] = encoder

        X = self.df.drop(columns=[target])
        y = self.df[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        print("Data split into train and test sets.")
        return X_train, X_test, y_train, y_test


In [None]:
# ======================== #
# 6. Model Selection
# ======================== #
class ModelSelector:
    """Benchmark a range of regressors on one train/test split via LazyPredict."""

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test

    def compare_models(self):
        """Fit a ``LazyRegressor`` on the stored split, print the comparison
        table and return it.

        ``fit`` hands back two objects; only the first (the per-model
        results) is used, the second is discarded.
        """
        print("\nComparing Models with LazyPredict...")
        benchmark = LazyRegressor(verbose=0, ignore_warnings=True)
        leaderboard, _ = benchmark.fit(
            self.X_train, self.X_test, self.y_train, self.y_test
        )
        print(leaderboard)
        return leaderboard


In [None]:
# ======================== #
# 7. Model Training
# ======================== #
class ModelTrainer:
    """Fit the final regressor on the training split and predict the test split."""

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test

    def train_best_model(self):
        """Train a ``RandomForestRegressor`` (seed 42) and predict ``X_test``.

        The model choice is fixed in code; it is not derived from any
        comparison step.

        Returns:
            ``(fitted_model, predictions_for_X_test)``.
        """
        print("\nTraining RandomForestRegressor as best model...")
        forest = RandomForestRegressor(random_state=42)
        forest.fit(self.X_train, self.y_train)
        predictions = forest.predict(self.X_test)
        print("Model trained successfully.")
        return forest, predictions


In [None]:
# ======================== #
# 8. Model Evaluation
# ======================== #
class ModelEvaluator:
    """Report regression metrics and a binned confusion matrix for a set of
    true values and predictions."""

    def __init__(self, y_test, y_pred):
        self.y_test = y_test
        self.y_pred = y_pred

    @staticmethod
    def _to_bins(values, bin_width):
        """Map continuous values to integer bin indices: ``round(value / bin_width)``."""
        return np.round(np.asarray(values, dtype=float) / bin_width).astype(int)

    def evaluate(self):
        """Print MAE, MSE and R² and return them.

        Returns:
            ``{"mae": ..., "mse": ..., "r2": ...}`` so the numbers can be
            logged or compared programmatically (the printed output is
            unchanged).
        """
        metrics = {
            "mae": mean_absolute_error(self.y_test, self.y_pred),
            "mse": mean_squared_error(self.y_test, self.y_pred),
            "r2": r2_score(self.y_test, self.y_pred),
        }
        print("\nModel Performance:")
        print(f"MAE: {metrics['mae']:.2f}")
        print(f"MSE: {metrics['mse']:.2f}")
        print(f"R² Score: {metrics['r2']:.2f}")
        return metrics

    def plot_confusion_matrix(self, bin_width=50000):
        """Plot a confusion matrix after binning the continuous target.

        True and predicted values are both converted to bin indices with
        ``round(value / bin_width)``; the matrix then compares those bins.

        Args:
            bin_width: Width of one bin in target units (default 50000, the
                value that used to be hard-coded).

        Raises:
            ValueError: If ``bin_width`` is not positive.
        """
        if bin_width <= 0:
            raise ValueError("bin_width must be a positive number")

        y_true = self._to_bins(self.y_test, bin_width)
        y_pred = self._to_bins(self.y_pred, bin_width)

        # Pass the actual bin indices explicitly. Without ``display_labels``
        # the plot numbers the axes 0..k-1, which mislabels the bins whenever
        # they do not form a contiguous range starting at 0.
        labels = np.union1d(y_true, y_pred)
        cm = confusion_matrix(y_true, y_pred, labels=labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap="Blues")
        plt.title("Confusion Matrix (Binned Target)")
        plt.show()




In [None]:
# ======================== #
#  Execution Pipeline
# ======================== #

if __name__ == "__main__":
    # End-to-end run: load -> clean -> analyse -> plot -> preprocess ->
    # compare models -> train -> evaluate. Every step depends on the names
    # bound by the step before it, so the order below matters.
    file_path = "housing.csv"   # <-- put the path to your dataset here
    target_col = "median_house_value"

    # 1. Data Loading
    loader = DataLoader(file_path)
    df = loader.load_data()
    loader.show_info()
    print(loader.show_head())

    # 2. Data Cleaning
    # `df` is rebound to the cleaner's current frame after every step.
    cleaner = DataCleaner(df)
    cleaner.check_missing()
    df = cleaner.fill_missing()
    df = cleaner.remove_duplicates()
    # Filters on every numeric column, which includes `target_col`.
    df = cleaner.remove_outliers()
    # NOTE(review): the rolling mean is applied to every numeric column in row
    # order, so the target and the coordinate columns are smoothed too —
    # confirm this is intended before trusting the scores below.
    df = cleaner.smooth_noisy_data(window=3)

    # 3. Data Analysis
    analyzer = DataAnalyzer(df)
    analyzer.correlation_matrix()
    analyzer.highest_correlations(target_col, top_n=5)

    # 4. Data Visualization
    viz = DataVisualizer(df)
    viz.plot_distributions()
    viz.correlation_heatmap()
    viz.ocean_proximity_count()
    viz.income_vs_value()
    viz.boxplot_features()
    viz.geo_distribution()
    viz.avg_price_per_ocean()

    # 5. Data Preprocessing
    # DataPreprocessor copies `df`, so encoding does not alter the frame used above.
    pre = DataPreprocessor(df)
    X_train, X_test, y_train, y_test = pre.encode_and_split(target_col)

    # 6. Model Selection
    selector = ModelSelector(X_train, X_test, y_train, y_test)
    models = selector.compare_models()

    # 7. Model Training
    # The trainer always fits a RandomForestRegressor; `models` from the
    # comparison step is not consulted.
    trainer = ModelTrainer(X_train, X_test, y_train, y_test)
    model, y_pred = trainer.train_best_model()

    # 8. Model Evaluation
    evaluator = ModelEvaluator(y_test, y_pred)
    evaluator.evaluate()
    # The target is continuous; the confusion matrix is built on binned values.
    evaluator.plot_confusion_matrix()

    print("\nFull EDA & ML Pipeline executed successfully!")
