In [8]:
"""
Assignment-1: Wine Quality dataset (White Wine)
Algorithm: Support Vector Machine (SVM)

Usage:
  python assignment1.py

Outputs:
 - Prints metrics to stdout
 - Saves confusion matrix plot and PCA visualization to ./outputs/
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA

# --- Config ---
DATA_URL_WHITE = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# 1. Load dataset
print("Downloading dataset from:", DATA_URL_WHITE)
df = pd.read_csv(DATA_URL_WHITE, sep=';')
print("Dataset shape:", df.shape)
print(df.head())

# Features and labels
X = df.drop(columns=['quality'])
y = (df['quality'] >= 7).astype(int)   # binary classification (good wine = 1)

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

# Standardize
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# 2. Train SVM
svm_clf = SVC(kernel='rbf', gamma='scale')
svm_clf.fit(X_train_s, y_train)
y_pred = svm_clf.predict(X_test_s)

# 3. Metrics
print("\n--- SVM (RBF kernel) ---")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, zero_division=0))
print("Recall:", recall_score(y_test, y_pred, zero_division=0))
print("F1 Score:", f1_score(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 4. Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(4,4))
im = ax.imshow(cm, cmap="Blues")
ax.set_title("SVM Confusion Matrix")
plt.colorbar(im)
ax.set_xticks([0,1])
ax.set_yticks([0,1])
ax.set_xticklabels(["Not Good", "Good"])
ax.set_yticklabels(["Not Good", "Good"])
plt.xlabel("Predicted")
plt.ylabel("True")

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, cm[i, j], ha="center", va="center", color="red")

plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "svm_confusion_matrix.png"))
plt.close()

# 5. PCA visualization (2D scatter)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(scaler.transform(X))

plt.figure(figsize=(7,5))
plt.scatter(X_pca[:,0], X_pca[:,1], c=y, cmap='viridis', alpha=0.6)
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA (2 components) colored by wine quality >= 7")
plt.colorbar(label="Good wine (1=yes)")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "svm_pca_visualization.png"))
plt.close()

print("\nSaved outputs to", OUT_DIR)


Downloading dataset from: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
Dataset shape: (4898, 12)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.0              0.27         0.36            20.7      0.045   
1            6.3              0.30         0.34             1.6      0.049   
2            8.1              0.28         0.40             6.9      0.050   
3            7.2              0.23         0.32             8.5      0.058   
4            7.2              0.23         0.32             8.5      0.058   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 45.0                 170.0   1.0010  3.00       0.45   
1                 14.0                 132.0   0.9940  3.30       0.49   
2                 30.0                  97.0   0.9951  3.26       0.44   
3                 47.0                 186.0   0.9956  3.19       0.40   
4               

In [10]:
import seaborn as sns
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# --- Final Extra Plots with Proper Labels ---

# 1. Wine quality distribution
plt.figure(figsize=(7,5))
sns.countplot(x=df['quality'], palette="viridis")
plt.title("Distribution of Wine Quality Scores", fontsize=14, fontweight="bold")
plt.xlabel("Wine Quality Score", fontsize=12)
plt.ylabel("Number of Samples", fontsize=12)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "wine_quality_distribution.png"))
plt.close()

# 2. Feature correlation heatmap
plt.figure(figsize=(12,9))
corr = df.corr()
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", center=0, cbar_kws={"shrink": 0.8})
plt.title("Feature Correlation Heatmap (including Quality)", fontsize=14, fontweight="bold")
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "correlation_heatmap.png"))
plt.close()

# 3. ROC Curve for SVM
y_scores = svm_clf.decision_function(X_test_s)
fpr, tpr, _ = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(7,6))
plt.plot(fpr, tpr, color="blue", lw=2, label=f"AUC = {roc_auc:.2f}")
plt.plot([0,1], [0,1], linestyle="--", color="gray", label="Random Guess")
plt.xlabel("False Positive Rate", fontsize=12)
plt.ylabel("True Positive Rate", fontsize=12)
plt.title("SVM ROC Curve", fontsize=14, fontweight="bold")
plt.legend(loc="lower right", fontsize=11)
plt.grid(True, linestyle="--", alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "svm_roc_curve.png"))
plt.close()

# 4. Precision-Recall Curve
prec, rec, _ = precision_recall_curve(y_test, y_scores)
plt.figure(figsize=(7,6))
plt.plot(rec, prec, color="green", lw=2)
plt.xlabel("Recall", fontsize=12)
plt.ylabel("Precision", fontsize=12)
plt.title("SVM Precision-Recall Curve", fontsize=14, fontweight="bold")
plt.grid(True, linestyle="--", alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "svm_precision_recall_curve.png"))
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.countplot(x=df['quality'], palette="viridis")
