<a href="https://colab.research.google.com/github/KHUSHIHN/AIML_lab/blob/main/problem_5%2C6%2C7%2C8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# iris_simple_logreg_svm.py
# Simple Logistic Regression & Linear SVM on Iris dataset
# Includes: Standardization, 5-fold CV, Confusion Matrix, Classification Report

import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

# Make output folder
def make_output_dir():
    os.makedirs("outputs/iris", exist_ok=True)

def main():
    make_output_dir()

    # Load Iris dataset
    data = load_iris()
    X = data.data
    y = data.target
    labels = data.target_names

    # Define models inside a pipeline (Scaler + Classifier)
    logistic_model = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000))
    ])

    svm_model = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LinearSVC())
    ])

    # 5-fold cross validation setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Train + Evaluate both models
    for name, model in [("Logistic Regression", logistic_model),
                        ("Linear SVM", svm_model)]:

        # Accuracy scores for 5 folds
        scores = cross_val_score(model, X, y, cv=cv, scoring="accuracy")
        print(f"{name} - 5 Fold Accuracy = {scores.mean():.3f} (+/- {scores.std():.3f})")

        # Predictions using cross-validation
        y_pred = cross_val_predict(model, X, y, cv=cv)

        # Confusion Matrix
        cm = confusion_matrix(y, y_pred)
        print(f"\n{name} - Confusion Matrix:")
        print(cm)

        # Classification report (precision, recall, f1)
        print(f"\n{name} - Classification Report:")
        print(classification_report(y, y_pred, target_names=labels))

        # Save confusion matrix plot
        disp = ConfusionMatrixDisplay(cm, display_labels=labels)
        disp.plot(cmap="Blues")
        plt.title(f"{name} Confusion Matrix")
        out_path = f"outputs/iris/cm_{name.replace(' ', '_').lower()}.png"
        plt.savefig(out_path)
        plt.close()
        print(f"Saved image: {out_path}")

if __name__ == "__main__":
    main()


Logistic Regression - 5 Fold Accuracy = 0.953 (+/- 0.045)

Logistic Regression - Confusion Matrix:
[[50  0  0]
 [ 0 47  3]
 [ 0  4 46]]

Logistic Regression - Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        50
  versicolor       0.92      0.94      0.93        50
   virginica       0.94      0.92      0.93        50

    accuracy                           0.95       150
   macro avg       0.95      0.95      0.95       150
weighted avg       0.95      0.95      0.95       150

Saved image: outputs/iris/cm_logistic_regression.png
Linear SVM - 5 Fold Accuracy = 0.927 (+/- 0.053)

Linear SVM - Confusion Matrix:
[[49  1  0]
 [ 0 44  6]
 [ 0  4 46]]

Linear SVM - Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      0.98      0.99        50
  versicolor       0.90      0.88      0.89        50
   virginica       0.88      0.92      0.90        50

    accurac

In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

URL = "https://raw.githubusercontent.com/justmarkham/scikit-learn-videos/master/data/sms.tsv"

def load_sms_data():
    try:
        print("Trying to download SMS dataset...")
        df = pd.read_csv(URL, sep="\t", header=None, names=["label", "message"])
        print("Download successful!")
        return df
    except:
        print("\nCould NOT download.")
        print("Please upload SMSSpamCollection file.")
        from google.colab import files
        uploaded = files.upload()
        fname = list(uploaded.keys())[0]
        df = pd.read_csv(fname, sep="\t", header=None, names=["label", "message"])
        return df

def main():
    os.makedirs("outputs/sms", exist_ok=True)

    df = load_sms_data()

    # --------------------------
    # ⚠ FIXED: Proper parentheses
    # --------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        df["message"],
        df["label"],
        test_size=0.2,
        random_state=42,
        stratify=df["label"]
    )   # ← This ) was missing in your code!

    vectorizer = TfidfVectorizer(stop_words="english")
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_tfidf, y_train)

    y_pred = model.predi_


In [None]:
# mnist_mlp_vs_logistic.py
# Keras MLP (dropout + early stopping) on MNIST, compared to a Logistic baseline.
# Saves confusion matrix for the MLP.

import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping


def ensure_out():
    os.makedirs("outputs/mnist", exist_ok=True)


def main():

    ensure_out()

    # Load MNIST dataset
    (X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

    # ----- Logistic Regression baseline -----
    X_train_flat = X_train.reshape((X_train.shape[0], -1)).astype("float32") / 255.0
    X_test_flat = X_test.reshape((X_test.shape[0], -1)).astype("float32") / 255.0

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_flat)
    X_test_scaled = scaler.transform(X_test_flat)

    logreg = LogisticRegression(
        solver="saga",
        multi_class="multinomial",
        max_iter=200,
        n_jobs=-1
    )

    logreg.fit(X_train_scaled, y_train)
    y_pred_lr = logreg.predict(X_test_scaled)
    acc_lr = accuracy_score(y_test, y_pred_lr)
    print(f"Logistic baseline test accuracy: {acc_lr:.4f}")

    # ----- MLP with dropout + early stopping -----
    X_train_n = X_train.astype("float32") / 255.0
    X_test_n = X_test.astype("float32") / 255.0

    model = Sequential([
        Flatten(input_shape=(28, 28)),
        Dense(512, activation="relu"),
        Dropout(0.3),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(10, activation="softmax")
    ])

    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    es = EarlyStopping(
        monitor="val_accuracy",
        patience=3,
        restore_best_weights=True
    )

    model.fit(
        X_train_n, y_train,
        validation_split=0.1,
        epochs=30,
        batch_size=128,
        callbacks=[es],
        verbose=0
    )

    test_loss, test_acc = model.evaluate(X_test_n, y_test, verbose=0)
    print(f"Keras MLP test accuracy: {test_acc:.4f}")

    # Confusion matrix for MLP
    y_pred_mlp = np.argmax(model.predict(X_test_n, verbose=0), axis=1)
    cm = confusion_matrix(y_test, y_pred_mlp)

    disp = ConfusionMatrixDisplay(cm, display_labels=np.arange(10))
    disp.plot(cmap="Blues", xticks_rotation=45)

    plt.title("MNIST Confusion Matrix - Keras MLP")
    plt.tight_layout()
    plt.savefig("outputs/mnist/cm_mlp.png", dpi=200)
    plt.close()

    print("Saved: outputs/mnist/cm_mlp.png")


if __name__ == "__main__":
    main()


In [9]:
# svm_kernels_margins_digits.py
# Very beginner-friendly version!
# This script:
# 1. Loads the Digits dataset
# 2. Trains Linear SVM and RBF SVM
# 3. Performs Grid Search for best C and gamma
# 4. Saves a heatmap of accuracies
# 5. Shows support vectors in PCA 2D

import os
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_digits
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    cross_val_score,
    train_test_split
)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA


# Create output folder if missing
def ensure_output_folder():
    os.makedirs("outputs/svm", exist_ok=True)


# Draw heatmap of accuracy values
def draw_heatmap(acc_matrix, C_values, gamma_values, title, save_path):
    plt.figure()
    plt.imshow(acc_matrix, origin="lower", aspect="auto")
    plt.colorbar(label="CV Accuracy")
    plt.xticks(np.arange(len(gamma_values)), gamma_values)
    plt.yticks(np.arange(len(C_values)), C_values)
    plt.xlabel("Gamma value")
    plt.ylabel("C value")
    plt.title(title)
    plt.tight_layout()
    plt.savefig(save_path, dpi=200)
    plt.close()


# Plot support vectors after PCA reduction
def plot_support_vectors(model, X_2d, y, title, save_path):
    plt.figure()

    # Plot each digit class
    for digit in np.unique(y):
        mask = (y == digit)
        plt.scatter(X_2d[mask, 0], X_2d[mask, 1], s=12, edgecolors='k',
                    alpha=0.6, label=str(digit))

    # Highlight support vectors
    sv = model.support_vectors_
    plt.scatter(sv[:, 0], sv[:, 1], s=70, facecolors='none',
                edgecolors='red', linewidths=1.5, label="Support Vectors")

    plt.title(title)
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend(fontsize=7, ncol=2)
    plt.tight_layout()
    plt.savefig(save_path, dpi=200)
    plt.close()


def main():

    ensure_output_folder()

    # ===============================
    # 1. Load Digits dataset
    # ===============================
    X, y = load_digits(return_X_y=True)

    # Scale the data (important for SVM!)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Cross-validation setup
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # ===============================
    # 2. Baseline: Linear SVM
    # ===============================
    linear_svm = SVC(kernel="linear", C=1.0)
    linear_scores = cross_val_score(linear_svm, X_scaled, y, cv=cv, scoring="accuracy")

    # ===============================
    # 3. Baseline: RBF SVM
    # ===============================
    rbf_svm = SVC(kernel="rbf", C=1.0, gamma="scale")
    rbf_scores = cross_val_score(rbf_svm, X_scaled, y, cv=cv, scoring="accuracy")

    print(f"\nLinear SVM accuracy: {linear_scores.mean():.4f} ± {linear_scores.std():.4f}")
    print(f"RBF SVM accuracy:    {rbf_scores.mean():.4f} ± {rbf_scores.std():.4f}")

    # ===============================
    # 4. Grid Search (C and gamma)
    # ===============================
    param_grid = {
        "C": [0.1, 1, 10, 100],
        "gamma": [0.001, 0.01, 0.1, 1.0]
    }

    print("\nRunning Grid Search... (this may take a few seconds)")
    grid_search = GridSearchCV(
        SVC(kernel="rbf"),
        param_grid=param_grid,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1
    )
    grid_search.fit(X_scaled, y)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV accuracy: {grid_search.best_score_:.4f}")

    # Convert results to matrix for heatmap
    mean_scores = grid_search.cv_results_["mean_test_score"]
    score_matrix = mean_scores.reshape(len(param_grid["C"]), len(param_grid["gamma"]))

    draw_heatmap(
        score_matrix,
        param_grid["C"],
        param_grid["gamma"],
        "RBF SVM Grid Search Accuracy",
        "outputs/svm/heatmap_rbf.png"
    )

    print("\nSaved: outputs/svm/heatmap_rbf.png")

    # ===============================
    # 5. PCA + Support Vector Plots
    # ===============================
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.25, stratify=y, random_state=42
    )

    pca = PCA(n_components=2, random_state=42)
    X_train_2d = pca.fit_transform(X_train)

    # Train best models on PCA data
    best_linear = SVC(kernel="linear", C=1.0).fit(X_train_2d, y_train)
    best_rbf = SVC(
        kernel="rbf",
        C=grid_search.best_params_["C"],
        gamma=grid_search.best_params_["gamma"]
    ).fit(X_train_2d, y_train)

    plot_support_vectors(
        best_linear, X_train_2d, y_train,
        "Linear SVM Support Vectors (PCA 2D)",
        "outputs/svm/sv_linear.png"
    )

    plot_support_vectors(
        best_rbf, X_train_2d, y_train,
        "RBF SVM Support Vectors (PCA 2D)",
        "outputs/svm/sv_rbf.png"
    )

    print("Saved: outputs/svm/sv_linear.png, outputs/svm/sv_rbf.png")


if __name__ == "__main__":
    main()



Linear SVM accuracy: 0.9794 ± 0.0033
RBF SVM accuracy:    0.9839 ± 0.0060

Running Grid Search... (this may take a few seconds)
Best parameters: {'C': 10, 'gamma': 0.01}
Best CV accuracy: 0.9827

Saved: outputs/svm/heatmap_rbf.png
Saved: outputs/svm/sv_linear.png, outputs/svm/sv_rbf.png
