## ________________ **Assignment-5** ________________


In [3]:
# Standard imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import pandas as pd

## Q1. Data loading and flattening
a) Load the digits dataset and print the shapes of images, data, and target.    
b) Flatten the images: reshape (n_samples, 8, 8) into (n_samples, 64) manually. Verify that this 
matches digits.data.    
c) Display 6 sample images with their labels. 

In [None]:
# Q1a: load the bundled digits dataset and report the shape of each array.
digits = datasets.load_digits()
images = digits.images          # image stack, (n_samples, 8, 8)
data_builtin = digits.data      # scikit-learn's pre-flattened copy, (n_samples, 64)
target = digits.target          # label of each sample

for array_name, array in (("images", images), ("data", data_builtin), ("target", target)):
    print(f"{array_name}.shape:", array.shape)

# Q1b: flatten by hand -- keep the sample axis and collapse everything else
# into a single feature axis, then check the result against digits.data.
n_samples = len(images)
data_manual = images.reshape(n_samples, -1)
print("data_manual.shape:", data_manual.shape)
flatten_matches = np.allclose(data_manual, data_builtin)
print("Does manual flattening match digits.data? ->", flatten_matches)

# Q1c: draw the first six images side by side, each titled with its label.
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(10, 3))
for ax, image, label in zip(axes, images, target):
    ax.imshow(image, cmap='gray', interpolation='nearest')
    ax.set_title(f"Label: {label}")
    ax.axis('off')
plt.tight_layout()
plt.show()

## Q2. Train/test split and scaling
a) Split the flattened data into train (80%) and test (20%) with random_state=42.   
b) Standardize features using StandardScaler. Why is scaling important?  

In [None]:
# Q2a: hold out 20% of the samples for testing. The split is seeded for
# reproducibility and stratified on the labels so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    data_manual, target, test_size=0.2, random_state=42, stratify=target
)
print("X_train.shape:", X_train.shape)
print("X_test.shape:", X_test.shape)

# Q2b: standardize. The scaler is fitted on the training split only and then
# re-used on the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check on the first five features of the scaled training data.
print("Mean of scaled train features (approx):", np.mean(X_train_scaled, axis=0)[:5])
print("Std of scaled train features (approx):", np.std(X_train_scaled, axis=0)[:5])

# Short explanation (display as markdown).
# The text is assembled from adjacent string literals so that no stray quote
# characters or leading indentation end up in the rendered Markdown (indented
# lines would otherwise be rendered as a code block).
from IPython.display import Markdown, display
display(Markdown(
    "**Why scaling?**\n\n"
    "Scaling ensures features are centered and have unit variance. "
    "Logistic regression (and many ML algorithms) "
    "converge faster and behave better when features are on similar scales; "
    "regularization penalties also act uniformly."
))

## Q3. Train OvR logistic regression 
Train logistic regression with multi_class='ovr'. Report training and testing accuracy. Use 
max_iter=1000 and random_state=42. 

In [None]:
# Q3: fit a one-vs-rest logistic regression on the standardized training data
# and report accuracy on both splits.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases -- confirm against the installed version before upgrading.
ovr_settings = dict(multi_class='ovr', max_iter=1000, random_state=42)
model_ovr = LogisticRegression(**ovr_settings).fit(X_train_scaled, y_train)

# Predictions are kept in their own variables so they can be inspected later.
y_train_pred = model_ovr.predict(X_train_scaled)
y_test_pred = model_ovr.predict(X_test_scaled)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
for report_line in (
    f"Train accuracy: {train_acc:.4f}",
    f"Test accuracy:  {test_acc:.4f}",
):
    print(report_line)

## Q4. Class probabilities for one sample (sklearn) 
a) Pick one test sample. Use model.predict_proba to print the probability vector of length 10.  
b) State which class has the highest probability and compare with the true label. 

In [None]:
# Q4: inspect the class-probability vector of a single test sample.
idx = 0                                        # position of the sample in the test split
y_true = y_test[idx]
x_sample = X_test_scaled[idx][np.newaxis, :]   # predict_proba needs a 2-D (1, n_features) input
proba = model_ovr.predict_proba(x_sample).ravel()

print("Probability vector (length 10):")
print(proba.round(4))

# The most probable class is the position of the largest entry.
pred_class = proba.argmax()
print("\nPredicted class (highest prob):", pred_class)
print("True class:", y_true)


## Q5. Confusion matrix & misclassifications 
a) Compute and plot the confusion matrix on the test set.   
b) List the 3 most confused digit pairs (true → predicted).

In [None]:
# Q5a: confusion matrix on the test set (first index = true label,
# second index = predicted label), drawn on an 8x8 inch figure.
cm = confusion_matrix(y_test, y_test_pred)
fig, ax = plt.subplots(figsize=(8, 8))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=digits.target_names)
disp.plot(ax=ax, xticks_rotation='vertical')
plt.title("Confusion Matrix (Test Set)")
plt.show()

# Q5b: rank the misclassifications. Zero the diagonal on a copy so that
# correct predictions are ignored, then collect every remaining non-zero cell.
cm_no_diag = cm.copy()
np.fill_diagonal(cm_no_diag, 0)
pairs = [
    ((int(true_label), int(pred_label)), cm_no_diag[true_label, pred_label])
    for true_label, pred_label in zip(*np.nonzero(cm_no_diag))
]
# Largest counts first; the sort is stable, so ties keep their row-major order.
pairs_sorted = sorted(pairs, key=lambda entry: entry[1], reverse=True)

print("Top 3 most confused pairs (true -> predicted) and counts:")
for (true_label, pred_label), count in pairs_sorted[:3]:
    print(f"{true_label} -> {pred_label} : {count}")

## Q6. Regularization sensitivity
Train with C in {0.01, 0.1, 1, 10}. Plot test accuracy vs C and comment on the effect.

In [None]:
# Q6: sweep the inverse regularization strength C and record the test
# accuracy of an otherwise identical OvR model for each value.
Cs = [0.01, 0.1, 1, 10]
test_accs = [
    accuracy_score(
        y_test,
        LogisticRegression(C=strength, multi_class='ovr', max_iter=1000, random_state=42)
        .fit(X_train_scaled, y_train)
        .predict(X_test_scaled),
    )
    for strength in Cs
]

# Accuracy against C, with C on a logarithmic axis because the values span
# several orders of magnitude.
plt.figure(figsize=(6, 4))
plt.plot(Cs, test_accs, marker='o')
plt.xscale('log')
plt.title('Test accuracy vs C (OvR Logistic Regression)')
plt.xlabel('C (log scale)')
plt.ylabel('Test accuracy')
plt.grid(True)
plt.show()

# Same numbers as text, one line per C.
for strength, accuracy in zip(Cs, test_accs):
    print(f"C={strength}: test accuracy = {accuracy:.4f}")

## Q7. Effect of removing regularization
a) Train three models:  
• Default (C=1, L2 regularization)  
• Very large C=1e6 (weak regularization, almost none)   
• penalty=None (no regularization at all, solver=lbfgs)     
b) Compare test accuracies.

In [None]:
# Q7: compare default L2 regularization, a negligible penalty (huge C) and no
# penalty at all. Every other setting is shared so that only the
# regularization differs between the three models.
common_kwargs = dict(solver='lbfgs', multi_class='ovr', max_iter=1000, random_state=42)

# Default
m_default = LogisticRegression(C=1, penalty='l2', **common_kwargs)
m_default.fit(X_train_scaled, y_train)
acc_default = accuracy_score(y_test, m_default.predict(X_test_scaled))

# Very large C
m_largeC = LogisticRegression(C=1e6, penalty='l2', **common_kwargs)
m_largeC.fit(X_train_scaled, y_train)
acc_largeC = accuracy_score(y_test, m_largeC.predict(X_test_scaled))

# No regularization.
# Current scikit-learn spells this `penalty=None`; older releases only accept
# the string 'none'. Try the modern spelling first and fall back to the legacy
# one, so the model is actually trained on either kind of installation.
# Only ValueError (raised for a rejected parameter value) is caught, so
# unrelated failures still surface.
m_none = None
acc_none = None
none_error = None
for penalty_value in (None, 'none'):
    try:
        candidate = LogisticRegression(penalty=penalty_value, **common_kwargs)
        candidate.fit(X_train_scaled, y_train)
    except ValueError as e:
        none_error = e   # keep the error; `e` is unbound once the handler exits
    else:
        m_none = candidate
        acc_none = accuracy_score(y_test, m_none.predict(X_test_scaled))
        break
if acc_none is None:
    print("Could not train an unregularized lbfgs model in this sklearn version:", none_error)

# Report all three accuracies in the same 4-decimal format.
acc_none_text = "n/a (model could not be trained)" if acc_none is None else f"{acc_none:.4f}"
print(f"Default (C=1, L2) test acc: {acc_default:.4f}")
print(f"Very large C=1e6 test acc:       {acc_largeC:.4f}")
print(f"No regularization (penalty=None) test acc: {acc_none_text}")

In [None]:
# Rebuild the class probabilities of one sample by hand from the fitted OvR
# model's parameters and print them next to sklearn's own predict_proba output.
W = model_ovr.coef_      # shape (n_classes, n_features)
b = model_ovr.intercept_ # shape (n_classes,)


def sigmoid(t):
    """Logistic function, applied element-wise."""
    return 1.0 / (1.0 + np.exp(-t))


# Same sample as in the predict_proba cell (index idx).
x = X_test_scaled[idx]   # 1D feature vector
z = W @ x + b            # one raw score per binary classifier
pks = sigmoid(z)         # independent per-class sigmoid outputs
pks_total = pks.sum()
pks_norm = pks / pks_total   # rescaled so the entries sum to 1

print("Raw scores z_k:")
print(z.round(4))
print("\nSigmoid p_k (OvR outputs, not summing to 1 necessarily):")
print(pks.round(6))
print("\nNormalized probabilities (sum to 1):")
print(pks_norm.round(6))
print("\nSklearn predict_proba vector:")
sklearn_proba = model_ovr.predict_proba(x.reshape(1, -1)).flatten()
print(sklearn_proba.round(6))

print("\nSums -> Sigmoid sum:", pks_total, "Normalized sum:", pks_norm.sum())