# Setup

In [None]:
from typing import Optional
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# Creating the classes

### Data Explorer

In [None]:
class DataExplorer:
    """
    Utility class for basic data exploration and visualization.

    Every method is a static helper that takes the DataFrame to inspect;
    the class itself holds no state.
    """

    @staticmethod
    def explore_data(data: pd.DataFrame) -> None:
        """
        Print the first rows, summary statistics, and column info of the dataset.

        Parameters:
            data (pd.DataFrame): The dataset to explore.
        """
        # Both tables are transposed so each original column is printed as a row.
        print(data.head().T)
        print(data.describe().T)
        # info() prints its report itself, so it is not wrapped in print().
        data.info()

    @staticmethod
    def plot_histograms(data: pd.DataFrame) -> None:
        """
        Plot histograms for all numeric columns in the dataset.

        Parameters:
            data (pd.DataFrame): The dataset to visualize.
        """
        data.hist(bins=15, figsize=(15, 10))
        plt.show()

    @staticmethod
    def plot_correlation_matrix(data: pd.DataFrame) -> None:
        """
        Plot a heatmap showing the pairwise correlation between numeric variables.

        Non-numeric columns (e.g. string labels) are left out of the heatmap,
        because a correlation coefficient cannot be computed for them.

        Parameters:
            data (pd.DataFrame): The dataset to analyze.

        Raises:
            ValueError: If the dataset has no numeric columns.
        """
        # Select numeric columns explicitly: recent pandas versions no longer
        # drop non-numeric columns inside corr() and raise on string data.
        numeric = data.select_dtypes(include=["number", "bool"])
        if numeric.shape[1] == 0:
            raise ValueError("No numeric columns available to compute correlations.")

        plt.figure(figsize=(12, 8))
        sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap='coolwarm')
        plt.title("Feature Correlation Heatmap")
        plt.show()

### Wine Quality Model

In [None]:
class WineQualityModel:
    """
    A class to encapsulate the machine learning workflow for predicting wine quality.
    Includes data loading, preprocessing, training, evaluation, and cross-validation.

    The workflow methods return ``self`` so they can be chained; they are meant to be
    called in the order load_data -> preprocess_data -> train_model ->
    evaluate_model / cross_validate_model (see run_all).
    """

    def __init__(self, filepath: str) -> None:
        """
        Initialize the WineQualityModel with the path to the dataset.

        No file access happens here; the CSV is read by load_data().

        Parameters:
            filepath (str): Path to the CSV dataset file.
        """
        self.filepath: str = filepath
        self.data: Optional[pd.DataFrame] = None
        self.X_train: Optional[pd.DataFrame] = None
        self.X_test: Optional[pd.DataFrame] = None
        self.y_train: Optional[pd.Series] = None
        self.y_test: Optional[pd.Series] = None
        # Features are standardized before being fed to the logistic regression.
        self.model_pipeline: Pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegression(max_iter=1000))
        ])

    def _require_data(self) -> pd.DataFrame:
        """Return the loaded dataset, or raise if load_data() has not run yet."""
        if self.data is None:
            raise RuntimeError("No data loaded; call load_data() first.")
        return self.data

    def _require_split(self) -> None:
        """Raise if preprocess_data() has not produced the train/test split yet."""
        parts = (self.X_train, self.X_test, self.y_train, self.y_test)
        if any(part is None for part in parts):
            raise RuntimeError("Data has not been split; call preprocess_data() first.")

    def load_data(self) -> 'WineQualityModel':
        """
        Load the dataset and perform basic validation.

        ``self.data`` is only assigned once the file has passed validation, so a
        failed load never leaves an unusable frame on the instance.

        Returns:
            WineQualityModel: self

        Raises:
            FileNotFoundError: If the CSV file does not exist (also reported on stdout).
            ValueError: If the file has no rows or lacks a 'quality' column.
        """
        try:
            data = pd.read_csv(self.filepath)
        except FileNotFoundError as e:
            print(f"File not found: {e}")
            raise

        if data.empty:
            raise ValueError("Loaded data is empty.")

        if 'quality' not in data.columns:
            raise ValueError("Column 'quality' not found in dataset.")

        self.data = data
        # Optionally perform data exploration here or externally
        return self

    def preprocess_data(self) -> 'WineQualityModel':
        """
        Split the dataset into train/test sets.

        The 'quality' column is the target; every other column is a feature.
        20% of the rows are held out for testing, with a fixed random seed so the
        split is reproducible.

        Returns:
            WineQualityModel: self

        Raises:
            RuntimeError: If load_data() has not been called.
        """
        data = self._require_data()
        X = data.drop('quality', axis=1)
        y = data['quality']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        return self

    def train_model(self) -> 'WineQualityModel':
        """
        Train the pipeline on training data.

        Returns:
            WineQualityModel: self

        Raises:
            RuntimeError: If preprocess_data() has not been called.
        """
        self._require_split()
        self.model_pipeline.fit(self.X_train, self.y_train)
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """
        Predict the target for given features using the trained pipeline.

        Parameters:
            X (pd.DataFrame): Feature matrix.

        Returns:
            np.ndarray: Predicted labels.
        """
        return self.model_pipeline.predict(X)

    def evaluate_model(self) -> 'WineQualityModel':
        """
        Evaluate the trained model using the test data.

        Shows a confusion matrix plot and prints a classification report.

        Returns:
            WineQualityModel: self

        Raises:
            RuntimeError: If preprocess_data() has not been called.
        """
        self._require_split()
        y_pred: np.ndarray = self.predict(self.X_test)

        # Confusion Matrix
        # The matrix axes cover every label seen in either the true or the
        # predicted values, so the tick labels must be built from that same
        # union; using only the labels present in y_test can mismatch the matrix.
        labels: np.ndarray = np.union1d(self.y_test, y_pred)
        cm: np.ndarray = confusion_matrix(self.y_test, y_pred, labels=labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
        disp.plot(cmap='Blues')
        plt.title("Confusion Matrix")
        plt.show()

        # Classification Report
        print("Classification Report:")
        print(classification_report(self.y_test, y_pred))

        return self

    def cross_validate_model(self, cv: int = 5) -> 'WineQualityModel':
        """
        Perform k-fold cross-validation on training data and visualize accuracy.

        Parameters:
            cv (int): Number of folds passed to cross_val_score. Defaults to 5.

        Returns:
            WineQualityModel: self

        Raises:
            RuntimeError: If preprocess_data() has not been called.
        """
        self._require_split()
        scores: np.ndarray = cross_val_score(self.model_pipeline, self.X_train, self.y_train, cv=cv)
        print("Cross-validation scores:", scores)
        print("Average Accuracy:", np.mean(scores))

        # Plot results
        # Fold numbers are derived from the scores so the x-axis always matches
        # the number of folds actually evaluated.
        fold_numbers = list(range(1, len(scores) + 1))
        plt.figure(figsize=(6, 4))
        sns.barplot(x=fold_numbers, y=scores)
        plt.title("Cross-validation Accuracy per Fold")
        plt.xlabel("Fold")
        plt.ylabel("Accuracy")
        plt.ylim(0, 1)
        plt.show()

        return self

    def run_all(self) -> 'WineQualityModel':
        """
        Run the full pipeline: load data, preprocess, train, evaluate, and validate.

        Returns:
            WineQualityModel: self
        """
        return self.load_data().preprocess_data().train_model().evaluate_model().cross_validate_model()


# Executing the code

In [None]:
# Relative path to the raw wine-quality CSV (resolved against the current working directory).
path = '../../data/raw/wine_quality_df.csv'
# Build the workflow wrapper; the file is not read until run_all() calls load_data().
model = WineQualityModel(path)
# Load, split, train, evaluate and cross-validate in one chained call.
model.run_all()
