In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Load the dataset
print("Loading wine quality dataset...")
try:
    import kagglehub
    path = kagglehub.dataset_download("yasserh/wine-quality-dataset")
    df = pd.read_csv(f"{path}/WineQT.csv")
    print("✓ Dataset loaded from Kaggle")
except:
    try:
        df = pd.read_csv("WineQT.csv")
        print("✓ Dataset loaded from local file")
    except:
        print("⚠ Could not load dataset. Please ensure WineQT.csv is in the current directory")
        print("  or install kagglehub: pip install kagglehub")
        exit(1)

In [15]:
class KNNFromScratch:
    """
    K-Nearest Neighbors classifier implemented from scratch
    """
    def __init__(self, k=5):
        """
        Initialize KNN classifier
        
        Parameters:
        k: number of neighbors to consider
        """
        self.k = k
        self.X_train = None
        self.y_train = None
        
    def fit(self, X_train, y_train):
        """
        Store training data
        
        Parameters:
        X_train: Training features (n_samples, n_features)
        y_train: Training labels (n_samples,)
        """
        self.X_train = X_train
        self.y_train = y_train
        
    def euclidean_distance(self, x1, x2):
        """
        Calculate Euclidean distance between two points
        
        Parameters:
        x1: First point
        x2: Second point
        
        Returns:
        distance: Euclidean distance
        """
        return np.sqrt(np.sum((x1 - x2) ** 2))
    
    def manhattan_distance(self, x1, x2):
        """
        Calculate Manhattan distance between two points
        
        Parameters:
        x1: First point
        x2: Second point
        
        Returns:
        distance: Manhattan distance
        """
        return np.sum(np.abs(x1 - x2))
    
    def predict_single(self, x_test):
        """
        Predict class for a single test point
        
        Parameters:
        x_test: Test point (n_features,)
        
        Returns:
        prediction: Predicted class
        """
        # Calculate distances to all training points
        distances = []
        for i, x_train in enumerate(self.X_train):
            dist = self.euclidean_distance(x_test, x_train)
            distances.append((dist, self.y_train[i]))
        
        # Sort by distance and get k nearest neighbors
        distances.sort(key=lambda x: x[0])
        k_nearest = distances[:self.k]
        
        # Get the labels of k nearest neighbors
        k_nearest_labels = [label for _, label in k_nearest]
        
        # Vote for the most common class
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]
    
    def predict(self, X_test):
        """
        Predict classes for multiple test points
        
        Parameters:
        X_test: Test features (n_samples, n_features)
        
        Returns:
        predictions: Array of predictions
        """
        predictions = []
        for x_test in X_test:
            predictions.append(self.predict_single(x_test))
        return np.array(predictions)
    
    def predict_proba(self, X_test):
        """
        Predict class probabilities for test points
        
        Parameters:
        X_test: Test features (n_samples, n_features)
        
        Returns:
        probabilities: Array of class probabilities
        """
        unique_classes = np.unique(self.y_train)
        probabilities = []
        
        for x_test in X_test:
            # Calculate distances to all training points
            distances = []
            for i, x_train in enumerate(self.X_train):
                dist = self.euclidean_distance(x_test, x_train)
                distances.append((dist, self.y_train[i]))
            
            # Sort by distance and get k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]
            
            # Count occurrences of each class
            k_nearest_labels = [label for _, label in k_nearest]
            class_counts = Counter(k_nearest_labels)
            
            # Calculate probabilities
            probs = []
            for cls in unique_classes:
                probs.append(class_counts.get(cls, 0) / self.k)
            probabilities.append(probs)
            
        return np.array(probabilities)


class LogisticRegressionFromScratch:
    """
    Logistic Regression classifier implemented from scratch using gradient descent
    For multi-class classification using One-vs-Rest approach
    """
    def __init__(self, learning_rate=0.01, n_iterations=1000, regularization=None, lambda_reg=0.01):
        """
        Initialize Logistic Regression classifier
        
        Parameters:
        learning_rate: Step size for gradient descent
        n_iterations: Number of training iterations
        regularization: Type of regularization ('l2', 'l1', or None)
        lambda_reg: Regularization strength
        """
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.regularization = regularization
        self.lambda_reg = lambda_reg
        self.weights = None
        self.bias = None
        self.classes = None
        self.classifiers = {}
        self.cost_history = []
        
    def sigmoid(self, z):
        """
        Sigmoid activation function
        
        Parameters:
        z: Input value or array
        
        Returns:
        sigmoid(z): Value between 0 and 1
        """
        # Clip values to prevent overflow
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))
    
    def cost_function(self, X, y, weights, bias):
        """
        Calculate binary cross-entropy cost
        
        Parameters:
        X: Features (n_samples, n_features)
        y: Binary labels (n_samples,)
        weights: Model weights
        bias: Model bias
        
        Returns:
        cost: Cross-entropy cost
        """
        n_samples = X.shape[0]
        
        # Calculate predictions
        z = np.dot(X, weights) + bias
        predictions = self.sigmoid(z)
        
        # Clip predictions to prevent log(0)
        epsilon = 1e-7
        predictions = np.clip(predictions, epsilon, 1 - epsilon)
        
        # Binary cross-entropy cost
        cost = -(1/n_samples) * np.sum(
            y * np.log(predictions) + (1 - y) * np.log(1 - predictions)
        )
        
        # Add regularization term
        if self.regularization == 'l2':
            cost += (self.lambda_reg / (2 * n_samples)) * np.sum(weights ** 2)
        elif self.regularization == 'l1':
            cost += (self.lambda_reg / n_samples) * np.sum(np.abs(weights))
            
        return cost
    
    def gradient_descent(self, X, y, weights, bias):
        """
        Perform one step of gradient descent
        
        Parameters:
        X: Features (n_samples, n_features)
        y: Binary labels (n_samples,)
        weights: Current weights
        bias: Current bias
        
        Returns:
        weights: Updated weights
        bias: Updated bias
        """
        n_samples = X.shape[0]
        
        # Forward propagation
        z = np.dot(X, weights) + bias
        predictions = self.sigmoid(z)
        
        # Calculate gradients
        dw = (1/n_samples) * np.dot(X.T, (predictions - y))
        db = (1/n_samples) * np.sum(predictions - y)
        
        # Add regularization to weight gradient
        if self.regularization == 'l2':
            dw += (self.lambda_reg / n_samples) * weights
        elif self.regularization == 'l1':
            dw += (self.lambda_reg / n_samples) * np.sign(weights)
        
        # Update parameters
        weights = weights - self.learning_rate * dw
        bias = bias - self.learning_rate * db
        
        return weights, bias
    
    def fit_binary(self, X, y):
        """
        Train binary logistic regression
        
        Parameters:
        X: Features (n_samples, n_features)
        y: Binary labels (n_samples,)
        
        Returns:
        weights: Trained weights
        bias: Trained bias
        """
        n_samples, n_features = X.shape
        
        # Initialize parameters
        weights = np.zeros(n_features)
        bias = 0
        
        # Training loop
        for i in range(self.n_iterations):
            # Calculate cost
            cost = self.cost_function(X, y, weights, bias)
            
            # Gradient descent
            weights, bias = self.gradient_descent(X, y, weights, bias)
            
            # Store cost for plotting
            if i % 100 == 0:
                self.cost_history.append(cost)
                
        return weights, bias
    
    def fit(self, X_train, y_train):
        """
        Train multi-class logistic regression using One-vs-Rest
        
        Parameters:
        X_train: Training features (n_samples, n_features)
        y_train: Training labels (n_samples,)
        """
        self.classes = np.unique(y_train)
        n_classes = len(self.classes)
        
        print(f"Training {n_classes} binary classifiers (One-vs-Rest)...")
        
        # Train a binary classifier for each class
        for class_label in self.classes:
            print(f"  Training classifier for class {class_label}...", end="")
            
            # Create binary labels (1 for current class, 0 for others)
            binary_labels = (y_train == class_label).astype(int)
            
            # Train binary classifier
            weights, bias = self.fit_binary(X_train, binary_labels)
            
            # Store classifier
            self.classifiers[class_label] = {'weights': weights, 'bias': bias}
            print(" Done")
    
    def predict_proba(self, X_test):
        """
        Predict class probabilities
        
        Parameters:
        X_test: Test features (n_samples, n_features)
        
        Returns:
        probabilities: Array of class probabilities (n_samples, n_classes)
        """
        n_samples = X_test.shape[0]
        n_classes = len(self.classes)
        
        # Calculate scores for each class
        scores = np.zeros((n_samples, n_classes))
        
        for i, class_label in enumerate(self.classes):
            classifier = self.classifiers[class_label]
            z = np.dot(X_test, classifier['weights']) + classifier['bias']
            scores[:, i] = self.sigmoid(z)
        
        # Normalize scores to get probabilities (softmax-like)
        probabilities = scores / np.sum(scores, axis=1, keepdims=True)
        
        return probabilities
    
    def predict(self, X_test):
        """
        Predict classes
        
        Parameters:
        X_test: Test features (n_samples, n_features)
        
        Returns:
        predictions: Array of predicted classes
        """
        probabilities = self.predict_proba(X_test)
        
        # Choose class with highest probability
        predicted_indices = np.argmax(probabilities, axis=1)
        predictions = self.classes[predicted_indices]
        
        return predictions


class DataPreprocessor:
    """
    Handle data preprocessing tasks
    """
    def __init__(self):
        self.mean = None
        self.std = None
        
    def fit(self, X):
        """
        Calculate mean and standard deviation
        
        Parameters:
        X: Features to fit
        """
        self.mean = np.mean(X, axis=0)
        self.std = np.std(X, axis=0)
        # Prevent division by zero
        self.std[self.std == 0] = 1
        
    def transform(self, X):
        """
        Standardize features
        
        Parameters:
        X: Features to transform
        
        Returns:
        X_scaled: Standardized features
        """
        return (X - self.mean) / self.std
    
    def fit_transform(self, X):
        """
        Fit and transform in one step
        
        Parameters:
        X: Features to fit and transform
        
        Returns:
        X_scaled: Standardized features
        """
        self.fit(X)
        return self.transform(X)


def train_test_split(X, y, test_size=0.2, random_state=None):
    """
    Split data into training and testing sets
    
    Parameters:
    X: Features
    y: Labels
    test_size: Proportion of data for testing
    random_state: Random seed for reproducibility
    
    Returns:
    X_train, X_test, y_train, y_test: Split datasets
    """
    if random_state:
        np.random.seed(random_state)
    
    n_samples = X.shape[0]
    n_test = int(n_samples * test_size)
    
    # Create shuffled indices
    indices = np.random.permutation(n_samples)
    
    # Split indices
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    
    # Split data
    X_train = X[train_indices]
    X_test = X[test_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]
    
    return X_train, X_test, y_train, y_test


def accuracy_score(y_true, y_pred):
    """
    Calculate accuracy
    
    Parameters:
    y_true: True labels
    y_pred: Predicted labels
    
    Returns:
    accuracy: Proportion of correct predictions
    """
    return np.mean(y_true == y_pred)


def confusion_matrix(y_true, y_pred):
    """
    Create confusion matrix
    
    Parameters:
    y_true: True labels
    y_pred: Predicted labels
    
    Returns:
    matrix: Confusion matrix
    """
    classes = np.unique(np.concatenate([y_true, y_pred]))
    n_classes = len(classes)
    
    matrix = np.zeros((n_classes, n_classes), dtype=int)
    
    for i, true_class in enumerate(classes):
        for j, pred_class in enumerate(classes):
            matrix[i, j] = np.sum((y_true == true_class) & (y_pred == pred_class))
    
    return matrix


def classification_report(y_true, y_pred):
    """
    Generate classification report
    
    Parameters:
    y_true: True labels
    y_pred: Predicted labels
    
    Returns:
    report: Dictionary with metrics for each class
    """
    classes = np.unique(np.concatenate([y_true, y_pred]))
    report = {}
    
    for cls in classes:
        # True positives, false positives, false negatives
        tp = np.sum((y_true == cls) & (y_pred == cls))
        fp = np.sum((y_true != cls) & (y_pred == cls))
        fn = np.sum((y_true == cls) & (y_pred != cls))
        
        # Calculate metrics
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        support = np.sum(y_true == cls)
        
        report[cls] = {
            'precision': precision,
            'recall': recall,
            'f1-score': f1,
            'support': support
        }
    
    # Calculate weighted average
    total_support = len(y_true)
    avg_precision = sum(report[cls]['precision'] * report[cls]['support'] for cls in classes) / total_support
    avg_recall = sum(report[cls]['recall'] * report[cls]['support'] for cls in classes) / total_support
    avg_f1 = sum(report[cls]['f1-score'] * report[cls]['support'] for cls in classes) / total_support
    
    report['weighted avg'] = {
        'precision': avg_precision,
        'recall': avg_recall,
        'f1-score': avg_f1,
        'support': total_support
    }
    
    return report


def cross_validation(model, X, y, cv=5):
    """
    Perform k-fold cross-validation
    
    Parameters:
    model: Model to evaluate
    X: Features
    y: Labels
    cv: Number of folds
    
    Returns:
    scores: Array of accuracy scores for each fold
    """
    n_samples = X.shape[0]
    fold_size = n_samples // cv
    scores = []
    
    indices = np.random.permutation(n_samples)
    
    for fold in range(cv):
        # Define fold boundaries
        start = fold * fold_size
        end = start + fold_size if fold < cv - 1 else n_samples
        
        # Split data
        test_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        
        X_train_fold = X[train_indices]
        y_train_fold = y[train_indices]
        X_test_fold = X[test_indices]
        y_test_fold = y[test_indices]
        
        # Train and evaluate
        # Create new model instance for each fold
        if isinstance(model, KNNFromScratch):
            fold_model = KNNFromScratch(k=model.k)
        else:
            fold_model = LogisticRegressionFromScratch(
                learning_rate=model.learning_rate,
                n_iterations=model.n_iterations,
                regularization=model.regularization,
                lambda_reg=model.lambda_reg
            )
        
        # Standardize data
        scaler = DataPreprocessor()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_test_scaled = scaler.transform(X_test_fold)
        
        # Train and predict
        fold_model.fit(X_train_scaled, y_train_fold)
        predictions = fold_model.predict(X_test_scaled)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test_fold, predictions)
        scores.append(accuracy)
        
    return np.array(scores)


def find_best_k(X_train, y_train, k_range):
    """
    Find the best k value for KNN using cross-validation
    
    Parameters:
    X_train: Training features
    y_train: Training labels
    k_range: Range of k values to test
    
    Returns:
    best_k: Optimal k value
    scores: Dictionary of k values and their scores
    """
    scores = {}
    
    for k in k_range:
        print(f"  Testing k={k}...", end="")
        knn = KNNFromScratch(k=k)
        cv_scores = cross_validation(knn, X_train, y_train, cv=5)
        scores[k] = {
            'mean': np.mean(cv_scores),
            'std': np.std(cv_scores)
        }
        print(f" Mean accuracy: {scores[k]['mean']:.4f}")
    
    # Find best k
    best_k = max(scores.keys(), key=lambda k: scores[k]['mean'])
    
    return best_k, scores


def plot_results(knn_results, lr_results, k_scores=None):
    """
    Create visualizations
    """
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    
    # Plot 1: K-value performance
    if k_scores:
        ax = axes[0, 0]
        k_values = list(k_scores.keys())
        mean_scores = [k_scores[k]['mean'] for k in k_values]
        std_scores = [k_scores[k]['std'] for k in k_values]
        
        ax.errorbar(k_values, mean_scores, yerr=std_scores, marker='o', capsize=5)
        ax.set_xlabel('K Value', fontsize=12)
        ax.set_ylabel('Cross-Validation Accuracy', fontsize=12)
        ax.set_title('KNN Performance vs K Value', fontsize=14, fontweight='bold')
        ax.grid(True, alpha=0.3)
    
    # Plot 2: Model Comparison
    ax = axes[0, 1]
    models = ['KNN', 'Logistic\nRegression']
    accuracies = [knn_results['accuracy'], lr_results['accuracy']]
    
    bars = ax.bar(models, accuracies, color=['skyblue', 'lightcoral'], edgecolor='black', linewidth=2)
    ax.set_ylabel('Test Accuracy', fontsize=12)
    ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
    ax.set_ylim([0, 1])
    ax.grid(axis='y', alpha=0.3)
    
    # Add value labels
    for bar, acc in zip(bars, accuracies):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
               f'{acc:.3f}', ha='center', va='bottom', fontsize=12, fontweight='bold')
    
    # Plot 3: Quality Distribution
    ax = axes[0, 2]
    quality_counts = df['quality'].value_counts().sort_index()
    ax.bar(quality_counts.index, quality_counts.values, color='green', alpha=0.7)
    ax.set_xlabel('Wine Quality', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('Wine Quality Distribution', fontsize=14, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    
    # Plot 4: KNN Confusion Matrix
    ax = axes[1, 0]
    sns.heatmap(knn_results['confusion_matrix'], annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)
    ax.set_title('KNN Confusion Matrix', fontsize=14, fontweight='bold')
    
    # Plot 5: Logistic Regression Confusion Matrix
    ax = axes[1, 1]
    sns.heatmap(lr_results['confusion_matrix'], annot=True, fmt='d', cmap='Reds', ax=ax)
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)
    ax.set_title('Logistic Regression Confusion Matrix', fontsize=14, fontweight='bold')
    
    # Plot 6: Per-Class Performance
    ax = axes[1, 2]
    classes = sorted(df['quality'].unique())
    
    knn_f1 = [knn_results['report'][cls]['f1-score'] for cls in classes]
    lr_f1 = [lr_results['report'][cls]['f1-score'] for cls in classes]
    
    x = np.arange(len(classes))
    width = 0.35
    
    ax.bar(x - width/2, knn_f1, width, label='KNN', color='skyblue')
    ax.bar(x + width/2, lr_f1, width, label='Logistic Regression', color='lightcoral')
    
    ax.set_xlabel('Wine Quality', fontsize=12)
    ax.set_ylabel('F1-Score', fontsize=12)
    ax.set_title('Per-Class F1-Score Comparison', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(classes)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('wine_from_scratch_results.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("\n✓ Results visualization saved as 'wine_from_scratch_results.png'")


In [18]:
def main():
    """
    Main execution function
    """
    print("\n" + "="*80)
    print(" " * 15 + "WINE QUALITY PREDICTION FROM SCRATCH")
    print(" " * 20 + "KNN & Logistic Regression")
    print("="*80)
    
    # Prepare data
    print("\n1. DATA PREPARATION")
    print("-" * 40)
    
    # Drop Id column if it exists
    if 'Id' in df.columns:
        df.drop('Id', axis=1, inplace=True)
    
    # Remove duplicates
    initial_size = len(df)
    df.drop_duplicates(inplace=True)
    print(f"Removed {initial_size - len(df)} duplicate rows")
    
    # Separate features and target
    X = df.drop('quality', axis=1).values
    y = df['quality'].values
    
    print(f"Dataset shape: {X.shape}")
    print(f"Number of classes: {len(np.unique(y))}")
    print(f"Class distribution: {dict(zip(*np.unique(y, return_counts=True)))}")
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training set: {X_train.shape}")
    print(f"Test set: {X_test.shape}")
    
    # Standardize features
    scaler = DataPreprocessor()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Features standardized (mean=0, std=1)")
    
    # KNN Implementation
    print("\n2. K-NEAREST NEIGHBORS")
    print("-" * 40)
    
    # Find best k value
    print("Finding optimal k value using cross-validation...")
    k_range = range(3, 21, 2)
    best_k, k_scores = find_best_k(X_train_scaled, y_train, k_range)
    print(f"\n✓ Best k value: {best_k}")
    print(f"  Cross-validation accuracy: {k_scores[best_k]['mean']:.4f} (+/- {k_scores[best_k]['std']:.4f})")
    
    # Train final KNN model
    print("\nTraining final KNN model...")
    knn_model = KNNFromScratch(k=best_k)
    knn_model.fit(X_train_scaled, y_train)
    
    # Evaluate KNN
    knn_predictions = knn_model.predict(X_test_scaled)
    knn_accuracy = accuracy_score(y_test, knn_predictions)
    knn_cm = confusion_matrix(y_test, knn_predictions)
    knn_report = classification_report(y_test, knn_predictions)
    
    print(f"KNN Test Accuracy: {knn_accuracy:.4f}")
    print("\nKNN Classification Report:")
    print("-" * 40)
    for cls in sorted(knn_report.keys()):
        if cls != 'weighted avg':
            metrics = knn_report[cls]
            print(f"Class {cls}: Precision={metrics['precision']:.3f}, "
                  f"Recall={metrics['recall']:.3f}, F1={metrics['f1-score']:.3f}, "
                  f"Support={metrics['support']}")
    
    # Logistic Regression Implementation
    print("\n3. LOGISTIC REGRESSION")
    print("-" * 40)
    
    # Train Logistic Regression
    print("Training Logistic Regression with gradient descent...")
    lr_model = LogisticRegressionFromScratch(
        learning_rate=0.1,
        n_iterations=1000,
        regularization='l2',
        lambda_reg=0.01
    )
    lr_model.fit(X_train_scaled, y_train)
    
    # Evaluate Logistic Regression
    lr_predictions = lr_model.predict(X_test_scaled)
    lr_accuracy = accuracy_score(y_test, lr_predictions)
    lr_cm = confusion_matrix(y_test, lr_predictions)
    lr_report = classification_report(y_test, lr_predictions)
    
    print(f"\nLogistic Regression Test Accuracy: {lr_accuracy:.4f}")
    print("\nLogistic Regression Classification Report:")
    print("-" * 40)
    for cls in sorted(lr_report.keys()):
        if cls != 'weighted avg':
            metrics = lr_report[cls]
            print(f"Class {cls}: Precision={metrics['precision']:.3f}, "
                  f"Recall={metrics['recall']:.3f}, F1={metrics['f1-score']:.3f}, "
                  f"Support={metrics['support']}")
    
    # Model Comparison
    print("\n4. MODEL COMPARISON")
    print("-" * 40)
    print(f"KNN Accuracy: {knn_accuracy:.4f}")
    print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
    
    if knn_accuracy > lr_accuracy:
        print(f"\n✓ KNN performs better by {(knn_accuracy - lr_accuracy)*100:.2f}%")
    else:
        print(f"\n✓ Logistic Regression performs better by {(lr_accuracy - knn_accuracy)*100:.2f}%")
    
    # Weighted average metrics
    print("\nWeighted Average Metrics:")
    print(f"KNN - Precision: {knn_report['weighted avg']['precision']:.3f}, "
          f"Recall: {knn_report['weighted avg']['recall']:.3f}, "
          f"F1: {knn_report['weighted avg']['f1-score']:.3f}")
    print(f"LR  - Precision: {lr_report['weighted avg']['precision']:.3f}, "
          f"Recall: {lr_report['weighted avg']['recall']:.3f}, "
          f"F1: {lr_report['weighted avg']['f1-score']:.3f}")
    
    # Store results
    knn_results = {
        'accuracy': knn_accuracy,
        'confusion_matrix': knn_cm,
        'report': knn_report
    }
    
    lr_results = {
        'accuracy': lr_accuracy,
        'confusion_matrix': lr_cm,
        'report': lr_report
    }
    
    # Create visualizations
    print("\n5. CREATING VISUALIZATIONS")
    print("-" * 40)
    plot_results(knn_results, lr_results, k_scores)
    
    # Example prediction
    print("\n6. EXAMPLE PREDICTION")
    print("-" * 40)
    
    # Use first test sample
    sample_idx = 0
    sample_features = X_test_scaled[sample_idx:sample_idx+1]
    actual_quality = y_test[sample_idx]
    
    # Get predictions from both models
    knn_pred = knn_model.predict(sample_features)[0]
    lr_pred = lr_model.predict(sample_features)[0]
    
    # Get probabilities
    knn_proba = knn_model.predict_proba(sample_features)[0]
    lr_proba = lr_model.predict_proba(sample_features)[0]
    
    print(f"Sample wine features (standardized):")
    feature_names = df.drop('quality', axis=1).columns
    for name, value in zip(feature_names, X_test[sample_idx]):
        print(f"  {name}: {value:.3f}")
    
    print(f"\nActual Quality: {actual_quality}")
    print(f"KNN Prediction: {knn_pred}")
    print(f"Logistic Regression Prediction: {lr_pred}")
    
    print("\nKNN Probabilities:")
    for cls, prob in zip(np.unique(y_train), knn_proba):
        if prob > 0:
            print(f"  Quality {cls}: {prob:.3f}")
    
    print("\nLogistic Regression Probabilities:")
    for cls, prob in zip(lr_model.classes, lr_proba):
        print(f"  Quality {cls}: {prob:.3f}")
    
    print("\n" + "="*80)
    print("✓ Model training and evaluation complete!")
    print("="*80)
    
    return knn_model, lr_model, scaler


if __name__ == "__main__":
    knn_model, lr_model, scaler = main()


               WINE QUALITY PREDICTION FROM SCRATCH
                    KNN & Logistic Regression

1. DATA PREPARATION
----------------------------------------
Removed 125 duplicate rows
Dataset shape: (1018, 11)
Number of classes: 6
Class distribution: {np.int64(3): np.int64(6), np.int64(4): np.int64(33), np.int64(5): np.int64(433), np.int64(6): np.int64(409), np.int64(7): np.int64(122), np.int64(8): np.int64(15)}
Training set: (814, 11)
Test set: (204, 11)
Features standardized (mean=0, std=1)

2. K-NEAREST NEIGHBORS
----------------------------------------
Finding optimal k value using cross-validation...
  Testing k=3... Mean accuracy: 0.5248
  Testing k=5... Mean accuracy: 0.5506
  Testing k=7... Mean accuracy: 0.5651
  Testing k=9... Mean accuracy: 0.5762
  Testing k=11... Mean accuracy: 0.5934
  Testing k=13... Mean accuracy: 0.5884
  Testing k=15... Mean accuracy: 0.5945
  Testing k=17... Mean accuracy: 0.5614
  Testing k=19... Mean accuracy: 0.5763

✓ Best k value: 15
  Cross

AttributeError: 'str' object has no attribute 'keys'