# Support Vector Machines

In [2]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [None]:
# Load the one-hot encoded insurance splits and derive the binary target.
# The data directory can be overridden through the IDS_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local checkout path is used.
DATA_DIR = os.environ.get('IDS_DATA_DIR', 'C:/Users/Postb/Documents/GitHub/IDS/data')
EXPENSIVE_THRESHOLD = 25000  # rows with charges strictly above this are labelled expensive


def load_labeled_split(file_name):
    """Read one CSV split and swap `charges` for a boolean `expensive` column.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside `DATA_DIR`.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with `expensive` (True where `charges` exceeds
        `EXPENSIVE_THRESHOLD`) added and the `charges` column removed.

    Raises
    ------
    FileNotFoundError
        Propagated from `pd.read_csv` when the file does not exist.
    KeyError
        If the file has no `charges` column.
    """
    raw = pd.read_csv(os.path.join(DATA_DIR, file_name))
    # Build a new frame instead of mutating `raw`, so the cell stays idempotent.
    return (
        raw
        .assign(expensive=raw['charges'] > EXPENSIVE_THRESHOLD)
        .drop(columns=['charges'])
    )


df_train = load_labeled_split('train_insurance_one_hot.csv')
df_test = load_labeled_split('test_insurance_one_hot.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'train_insurance_one_hot.csv'

## a)

In [None]:
# Part a): fit two linear SVMs (C=10) on the raw features and compare their confusion matrices
target_col = 'expensive'
# Cast the target column to 0/1 integer labels for both splits
y_train_cls, y_test_cls = (split[target_col].astype(int) for split in (df_train, df_test))

# Feature set 1: age, BMI and the smoker indicator only
basic_features = ['age', 'bmi', 'smoker_yes']
X_train_basic, X_test_basic = df_train[basic_features], df_test[basic_features]

# Feature set 2: every training column except `id`, `chargeGroup` and the target itself
ignored_columns = {'id', 'chargeGroup', target_col}
full_features = [name for name in df_train.columns if name not in ignored_columns]
X_train_full, X_test_full = df_train[full_features], df_test[full_features]

# `fit` returns the estimator, so construction and training can be chained
svm_basic = SVC(kernel='linear', C=10, random_state=42).fit(X_train_basic, y_train_cls)
basic_preds = svm_basic.predict(X_test_basic)

svm_full = SVC(kernel='linear', C=10, random_state=42).fit(X_train_full, y_train_cls)
full_preds = svm_full.predict(X_test_full)

# Rows are actual classes, columns are predicted classes
cm_basic, cm_full = (
    confusion_matrix(y_test_cls, preds) for preds in (basic_preds, full_preds)
)

# One heatmap per model, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
heatmap_panels = [
    (cm_basic, 'Blues', 'Basic features: age, bmi, smoker'),
    (cm_full, 'Greens', 'All descriptive features (no id)'),
]
for ax, (matrix, palette, panel_title) in zip(axes, heatmap_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.tight_layout()
plt.show()

# Keep predictions and matrices together for the metric comparison in part b)
baseline_results = {
    'basic': dict(confusion=cm_basic, preds=basic_preds),
    'full': dict(confusion=cm_full, preds=full_preds),
}
cm_basic, cm_full


## b)

In [None]:
# Part b): test-set accuracy and precision of the two SVMs stored in `baseline_results`
preds_basic_model = baseline_results['basic']['preds']
preds_full_model = baseline_results['full']['preds']

basic_accuracy, basic_precision = (
    accuracy_score(y_test_cls, preds_basic_model),
    precision_score(y_test_cls, preds_basic_model),
)
full_accuracy, full_precision = (
    accuracy_score(y_test_cls, preds_full_model),
    precision_score(y_test_cls, preds_full_model),
)

print(f"Basic feature SVM - Accuracy: {basic_accuracy:.3f}, Precision: {basic_precision:.3f}")
print(f"All-feature SVM - Accuracy: {full_accuracy:.3f}, Precision: {full_precision:.3f}")
basic_accuracy, basic_precision, full_accuracy, full_precision


Accuracy: comparing the basic SVM (age + bmi + smoker) with the all-feature SVM matches expectation — the richer feature set captures more signal, giving higher accuracy and precision.
Precision on the basic model is lower because smokers aren't perfectly separated without region/children information, so it flags more non-expensive people as expensive.
The all-feature model is both more accurate and precise, aligning with the intuition that more descriptive variables help the linear separator isolate expensive patients.


## c)

In [None]:
# Part c): standardize the features and retrain both linear SVMs (C=10)
# Each scaler learns its statistics from the training split and is then
# applied unchanged to the matching test split.
scaler_basic, scaler_full = StandardScaler(), StandardScaler()

X_train_basic_scaled = scaler_basic.fit_transform(X_train_basic)
X_test_basic_scaled = scaler_basic.transform(X_test_basic)

X_train_full_scaled = scaler_full.fit_transform(X_train_full)
X_test_full_scaled = scaler_full.transform(X_test_full)

# `fit` returns the estimator, so construction and training can be chained
svm_basic_scaled = SVC(kernel='linear', C=10, random_state=42).fit(X_train_basic_scaled, y_train_cls)
basic_preds_scaled = svm_basic_scaled.predict(X_test_basic_scaled)

svm_full_scaled = SVC(kernel='linear', C=10, random_state=42).fit(X_train_full_scaled, y_train_cls)
full_preds_scaled = svm_full_scaled.predict(X_test_full_scaled)

# Rows are actual classes, columns are predicted classes
cm_basic_scaled, cm_full_scaled = (
    confusion_matrix(y_test_cls, preds) for preds in (basic_preds_scaled, full_preds_scaled)
)

# One heatmap per scaled model, side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
scaled_panels = [
    (cm_basic_scaled, 'Purples', 'Scaled basic features'),
    (cm_full_scaled, 'Oranges', 'Scaled all features'),
]
for ax, (matrix, palette, panel_title) in zip(axes, scaled_panels):
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
plt.tight_layout()
plt.show()

# Same layout as `baseline_results`, but for the standardized runs
scaled_results = {
    'basic': dict(confusion=cm_basic_scaled, preds=basic_preds_scaled),
    'full': dict(confusion=cm_full_scaled, preds=full_preds_scaled),
}
cm_basic_scaled, cm_full_scaled


Scaling reduces the dominance of features measured in larger units, so both SVMs gain a small boost in true positives and fewer false positives compared with the unscaled runs.
The scaled all-feature model remains best overall because it combines richer descriptors with balanced feature magnitudes, yielding the cleanest confusion matrix.
Training time differences were negligible (all fits completed within a second), as expected for this dataset size.


## d)

In [None]:
# Part d): effect of shrinking C on test accuracy (unnormalized data, all features)
C_values = [10, 5, 1, 0.5, 0.1]


def linear_svm_test_accuracy(C_val):
    """Fit a linear SVC with penalty `C_val` on the unscaled full features.

    Trains on `X_train_full` / `y_train_cls` and returns the accuracy of the
    fitted model on `X_test_full` / `y_test_cls`.
    """
    model = SVC(kernel='linear', C=C_val, random_state=42)
    model.fit(X_train_full, y_train_cls)
    return accuracy_score(y_test_cls, model.predict(X_test_full))


# One record per C value; the frame is built once from the finished list
regularization_results = [
    {'C': C_val, 'accuracy': linear_svm_test_accuracy(C_val)} for C_val in C_values
]
reg_results_df = pd.DataFrame(regularization_results)

# Explicit axes object, matching the plotting style of the other cells
fig, ax = plt.subplots(figsize=(6, 4))
sns.lineplot(data=reg_results_df, x='C', y='accuracy', marker='o', ax=ax)
ax.set_xscale('log')  # C spans two orders of magnitude
ax.set_xlabel('C (log scale)')
ax.set_ylabel('Accuracy (test)')
ax.set_title('Accuracy vs. regularization strength (unnormalized)')
plt.show()

# The trailing expression renders the table once; the earlier explicit
# display() call duplicated this output and has been removed.
reg_results_df


Lower C values enforce a wider margin and tolerate more misclassifications, so accuracy drops steadily once C falls below 5 in the unnormalized setup.
Conversely, a large C penalises every misclassified training example heavily, boosting accuracy but risking overfitting — hence tuning C balances margin width against classification error.


**Normalization effect (part e):** Scaling equalizes feature ranges so the hyperplane no longer overweights large-magnitude attributes (e.g., `age` vs. one-hot binaries).
After normalization the decision boundary uses shape rather than scale, which improved both recall and precision in our confusion matrices.
Because SVM margins depend on dot products, standardized features lead to more stable optimization and better generalization accuracy.
