In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

import pandas as pd

# 1. Load the breast-cancer dataset into a feature frame and a label series.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

# 2. Hold out 20% of the rows for testing; the split is stratified on the
#    labels and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Standardize the features (VERY important for SVM). The scaler is fitted
#    on the training split only, then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [3]:
# Baseline classifier: RBF-kernel SVM trained on the standardized features.
svm_clf = SVC(
    C=1.0,
    kernel="rbf",
    gamma="scale",
    random_state=42,
    probability=False,  # switch to True if predict_proba is needed
).fit(X_train_scaled, y_train)

# Score the predictions made on the held-out split.
y_pred_svm = svm_clf.predict(X_test_scaled)
acc, f1 = (
    accuracy_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm, average="weighted"),
)

print("SVM (RBF) Accuracy:", acc)
print("SVM (RBF) F1-score:", f1)

# Per-class precision / recall / F1, labelled with the dataset's class names.
report = classification_report(
    y_test, y_pred_svm, target_names=data.target_names
)
print(report)


SVM (RBF) Accuracy: 0.9824561403508771
SVM (RBF) F1-score: 0.9824561403508771
              precision    recall  f1-score   support

   malignant       0.98      0.98      0.98        42
      benign       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



In [5]:
# Try several kernel / C combinations.
# NOTE(review): every score below is measured on the test split, so picking
# the "best" row from this table would leak test information into model
# selection — consider cross-validating on the training split for that.
configs = [
    (kernel_name, c_value)
    for kernel_name in ("linear", "rbf")
    for c_value in (0.1, 1.0, 10.0)
]

results = []
for kernel, C in configs:
    # Fit a fresh SVM for this (kernel, C) pair on the scaled training data.
    model = SVC(kernel=kernel, C=C, gamma="scale", random_state=42).fit(
        X_train_scaled, y_train
    )
    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    results += [(kernel, C, acc, f1)]

# One row per configuration, in the order the configurations were tried.
result_columns = ["kernel", "C", "accuracy", "f1"]
results_df = pd.DataFrame(results, columns=result_columns)
print(results_df)


   kernel     C  accuracy        f1
0  linear   0.1  0.982456  0.982456
1  linear   1.0  0.973684  0.973747
2  linear  10.0  0.982456  0.982456
3     rbf   0.1  0.947368  0.947087
4     rbf   1.0  0.982456  0.982456
5     rbf  10.0  0.973684  0.973747


In [6]:
# Degree-3 polynomial-kernel SVM, trained and scored on the same scaled split.
svm_poly = SVC(
    kernel="poly",
    degree=3,
    C=1.0,
    gamma="scale",
    random_state=42,
).fit(X_train_scaled, y_train)

y_pred_poly = svm_poly.predict(X_test_scaled)
poly_acc = accuracy_score(y_test, y_pred_poly)
print("SVM (poly) Accuracy:", poly_acc)


SVM (poly) Accuracy: 0.9122807017543859


So sánh SVM với Logistic Regression, Random Forest & XGBoost

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Candidate models to compare against the RBF SVM.
log_clf = LogisticRegression(random_state=42, max_iter=1000)
rf_clf = RandomForestClassifier(random_state=42, n_estimators=200)

xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.1,
    "max_depth": 3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_clf = XGBClassifier(**xgb_params)

# Display name -> estimator, in the order the rows should appear.
# `svm_clf` is the existing SVM object, so the loop below fits it again.
models = dict(
    [
        ("Logistic", log_clf),
        ("RandomForest", rf_clf),
        ("XGBoost", xgb_clf),
        ("SVM_RBF", svm_clf),
    ]
)

results = []
for name, model in models.items():
    # Scaled data is used for every model: the scale-sensitive ones need it,
    # and RF/XGB still work fine with it.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    results += [(name, acc, f1)]

# One row per model with its test-split accuracy and weighted F1.
comparison_columns = ["model", "accuracy", "f1"]
results_df = pd.DataFrame(results, columns=comparison_columns)
print(results_df)


          model  accuracy        f1
0      Logistic  0.982456  0.982456
1  RandomForest  0.956140  0.956027
2       XGBoost  0.956140  0.955776
3       SVM_RBF  0.982456  0.982456
