
# SVM — How Maximizing the Margin Works (Visual Guide)

This notebook shows, step by step, how a **Support Vector Machine (SVM)** finds a boundary that **maximizes the margin** between classes.

**Features:**
1. A linear SVM on simple 2D data with the separating hyperplane, **margin lines**, and **support vectors**.
2. How to compute the **margin width** $2/\lVert w \rVert$ and visualize distances via the decision function.
3. The effect of **C** (regularization) on the margin: small C → wider margin & more violations; large C → narrower margin & fewer violations.
4. **Hinge loss** intuition: why misclassified or within-margin points are penalized.
5. A **nonseparable** dataset to contrast small vs large C behavior.


In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Seed NumPy's global RNG so that draws made through np.random are repeatable
# when the notebook is run top to bottom on a fresh kernel.
np.random.seed(0)


## Linear SVM on a simple 2D dataset


In [None]:
# --- Data: two well-separated Gaussian blobs, labelled +1 / -1 ---------------
n = 200
X_pos = np.random.randn(n//2, 2) * 0.6 + np.array([2.5, 2.5])
X_neg = np.random.randn(n//2, 2) * 0.6 + np.array([-2.5, -2.0])
X = np.concatenate([X_pos, X_neg], axis=0)
y = np.concatenate([np.ones(n//2), -np.ones(n//2)])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# --- Model: standardize the features, then fit a linear-kernel SVC -----------
clf = make_pipeline(StandardScaler(), SVC(kernel="linear", C=1.0, random_state=0))
clf.fit(X_train, y_train)

test_acc = accuracy_score(y_test, clf.predict(X_test))
print("Test accuracy:", round(test_acc, 3))

# Handles on the fitted pipeline steps (reused further down).
svm = clf.named_steps["svc"]
scaler = clf.named_steps["standardscaler"]

# --- Evaluate the pipeline's decision function on a dense grid ---------------
x_min, x_max = X[:,0].min()-1.0, X[:,0].max()+1.0
y_min, y_max = X[:,1].min()-1.0, X[:,1].max()+1.0
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 400),
    np.linspace(y_min, y_max, 400),
)
XY = np.c_[xx.ravel(), yy.ravel()]
Z = clf.decision_function(XY).reshape(xx.shape)

# --- Figure: class regions, boundary (0) and margin lines (-1 / +1) ----------
fig, ax = plt.subplots(figsize=(6,5))
ax.contourf(xx, yy, Z>0, alpha=0.15)
ax.contour(xx, yy, Z, levels=[-1, 0, 1], linestyles=["--","-","--"])
ax.scatter(X_train[:,0], X_train[:,1], c=(y_train>0).astype(int), s=20, alpha=0.8, label="train")
ax.scatter(X_test[:,0],  X_test[:,1],  c=(y_test>0).astype(int),  s=20, alpha=0.6, marker="x", label="test")

# The SVC stores its support vectors in standardized coordinates; undo the
# scaling so the circles land on the points as plotted.
sv = svm.support_vectors_
sv_orig = scaler.inverse_transform(sv)
ax.scatter(sv_orig[:,0], sv_orig[:,1], s=100, facecolors="none", edgecolors="k", label="support vectors")

ax.set_title("Linear SVM: separating hyperplane and margins (±1)")
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.legend()
fig.tight_layout()
plt.show()


## Margin width $2/\lVert w \rVert$ and distances


In [None]:
# `svm` is the SVC step of the pipeline, so it was fit on *standardized*
# features: coef_ (and hence 2/||w||) is expressed in standardized units.
w = svm.coef_.ravel()
norm_w = np.linalg.norm(w)
margin = 2.0 / norm_w

# Convert to the original coordinates (the ones the boundary plot is drawn in).
# The pipeline computes  f(x) = w · (x - mean) / scale + b = (w / scale) · x + const,
# so the weight vector in original units is w / scale.
feature_scale = clf.named_steps["standardscaler"].scale_
w_orig = w / feature_scale
margin_orig = 2.0 / np.linalg.norm(w_orig)

print("||w|| (standardized features) =", round(norm_w, 4))
print("Margin width (2/||w||, standardized features) =", round(margin, 4))
print("Margin width (original feature units) =", round(margin_orig, 4))

# Signed decision-function values of the full pipeline; |value| < 1 means the
# point lies between the two margin lines.
df_train = clf.decision_function(X_train)
df_test  = clf.decision_function(X_test)

plt.figure(figsize=(6,4))
plt.hist(df_train, bins=40, alpha=0.7, label="train")
plt.hist(df_test,  bins=40, alpha=0.7, label="test")
plt.axvline(0, linestyle="--")  # the decision boundary
plt.title("Decision function distribution")
plt.xlabel("decision function"); plt.ylabel("count"); plt.legend()
plt.tight_layout(); plt.show()


## Effect of **C** on margin, support vectors, and accuracy


In [None]:
Cs = [0.05, 0.2, 1, 5, 20]
margin_widths = []
n_support = []
accs = []

# Refit the same scaler + linear-SVC pipeline for each C and record:
#   * 2/||w|| taken from the SVC step (i.e. measured on standardized features),
#   * the total number of support vectors,
#   * accuracy on the held-out test split.
for C in Cs:
    model = make_pipeline(StandardScaler(), SVC(kernel="linear", C=C, random_state=0))
    model.fit(X_train, y_train)

    svm_i = model.named_steps["svc"]
    w_i = svm_i.coef_.ravel()
    # The tiny epsilon guards the division if the weight vector is all zeros.
    m_i = 2.0 / (np.linalg.norm(w_i) + 1e-12)

    margin_widths.append(m_i)
    n_support.append(int(svm_i.n_support_.sum()))
    accs.append(accuracy_score(y_test, model.predict(X_test)))

# One figure per recorded quantity, all against C on a log-scaled x-axis.
sweep_curves = [
    (margin_widths, "Margin width (2/||w||)", "Margin width vs C"),
    (n_support,     "# support vectors",      "Support vectors vs C"),
    (accs,          "Test accuracy",          "Accuracy vs C"),
]
for values, y_label, heading in sweep_curves:
    fig, ax = plt.subplots(figsize=(6,4))
    ax.plot(Cs, values, marker="o")
    ax.set_xscale("log")
    ax.set_xlabel("C (log scale)")
    ax.set_ylabel(y_label)
    ax.set_title(heading)
    fig.tight_layout()
    plt.show()


## Hinge loss intuition


In [None]:
# Hinge loss as a function of z = y * f(x): zero once z >= 1, and growing
# linearly as z drops below 1.
z = np.linspace(-2.5, 2.5, 400)
hinge = np.maximum(1.0 - z, 0.0)

fig, ax = plt.subplots(figsize=(6,4))
ax.plot(z, hinge)
ax.axvline(1.0, linestyle="--")  # z = 1: where the loss reaches zero
ax.set_title("Hinge loss  ℓ(z) = max(0, 1 - z)")
ax.set_xlabel("z = y · f(x)")
ax.set_ylabel("loss")
fig.tight_layout()
plt.show()


## Nonseparable data: small C vs large C


In [None]:
# Two unit-variance Gaussian blobs whose centres are only 1.5 apart per axis,
# so the classes overlap.
n2 = 300
A = np.random.randn(n2//2, 2) * 1.0 + np.array([1.0, 1.0])      # labelled +1
B = np.random.randn(n2//2, 2) * 1.0 + np.array([-0.5, -0.5])    # labelled -1
X2 = np.concatenate([A, B], axis=0)
y2 = np.concatenate([np.ones(n2//2), -np.ones(n2//2)])

# Stratified 70/30 split with a fixed random_state.
X2_tr, X2_te, y2_tr, y2_te = train_test_split(
    X2, y2, test_size=0.3, random_state=0, stratify=y2
)

def plot_boundary(model, X, y, title, padding=1.0, resolution=400):
    """Plot a fitted scaler+SVC pipeline's decision regions, margins and support vectors.

    Parameters
    ----------
    model : fitted pipeline
        Must expose ``decision_function`` and have ``named_steps`` entries
        ``"standardscaler"`` and ``"svc"`` (the names ``make_pipeline`` assigns
        to ``StandardScaler`` and ``SVC``).
    X : ndarray with at least two columns
        Points to scatter; columns 0 and 1 are used as the plot axes and also
        determine the extent of the evaluation grid.
    y : ndarray
        Labels; points are coloured by ``y > 0``.
    title : str
        Figure title.
    padding : float, default 1.0
        Extra space added around the data range on every side of the grid.
    resolution : int, default 400
        Number of grid points along each axis.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Dense grid covering the data plus `padding` on each side.
    x_min, x_max = X[:,0].min()-padding, X[:,0].max()+padding
    y_min, y_max = X[:,1].min()-padding, X[:,1].max()+padding
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))
    XY = np.c_[xx.ravel(), yy.ravel()]
    Z = model.decision_function(XY).reshape(xx.shape)

    plt.figure(figsize=(6,5))
    # Shade by predicted side; draw the boundary (0) and margin lines (-1 / +1).
    plt.contourf(xx, yy, Z>0, alpha=0.15)
    plt.contour(xx, yy, Z, levels=[-1, 0, 1], linestyles=["--","-","--"])
    plt.scatter(X[:,0], X[:,1], c=(y>0).astype(int), s=18, alpha=0.8)

    # Support vectors are stored in standardized coordinates; map them back to
    # the input coordinates before circling them.
    svm_local = model.named_steps["svc"]
    sv_orig = model.named_steps["standardscaler"].inverse_transform(svm_local.support_vectors_)
    plt.scatter(sv_orig[:,0], sv_orig[:,1], s=80, facecolors="none", edgecolors="k")

    plt.title(title); plt.xlabel("x1"); plt.ylabel("x2")
    plt.tight_layout(); plt.show()

# C = 0.2: fit on the overlapping training split, then draw boundary and margins.
model_smallC = make_pipeline(StandardScaler(), SVC(kernel="linear", C=0.2, random_state=0))
model_smallC.fit(X2_tr, y2_tr)
plot_boundary(
    model_smallC, X2_tr, y2_tr,
    "Overlapping data — Small C (wider margin, more violations)",
)

# C = 20: same pipeline and data, only the regularization strength differs.
model_largeC = make_pipeline(StandardScaler(), SVC(kernel="linear", C=20.0, random_state=0))
model_largeC.fit(X2_tr, y2_tr)
plot_boundary(
    model_largeC, X2_tr, y2_tr,
    "Overlapping data — Large C (narrow margin, fewer violations)",
)