In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from scipy.stats import loguniform
import numpy as np

# Loading the Wine Quality dataset

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
data = pd.read_csv(url, sep=';')

# Basic preprocessing
X = data.drop('quality', axis=1)
y = data['quality'].apply(lambda x: 1 if x >= 6 else 0)  # Binary classification

print(f"Dataset loaded with {len(X)} samples and {X.shape[1]} features")

Dataset loaded with 1599 samples and 11 features


# Creating 10 Training-Test splits

In [3]:
samples = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=i  # Different random state for each sample
    )
    samples.append((X_train, X_test, y_train, y_test))
    print(f"Sample {i+1}: Train size={len(X_train)}, Test size={len(X_test)}")

Sample 1: Train size=1119, Test size=480
Sample 2: Train size=1119, Test size=480
Sample 3: Train size=1119, Test size=480
Sample 4: Train size=1119, Test size=480
Sample 5: Train size=1119, Test size=480
Sample 6: Train size=1119, Test size=480
Sample 7: Train size=1119, Test size=480
Sample 8: Train size=1119, Test size=480
Sample 9: Train size=1119, Test size=480
Sample 10: Train size=1119, Test size=480


# Optimized SVM Training Function

In [7]:
def optimize_svm(X_train, y_train, X_test, y_test, sample_num):
    param_dist = {
        'C': loguniform(1e-2, 1e2),  # Wider range for better optimization
        'gamma': loguniform(1e-4, 1e1),
        'kernel': ['rbf', 'linear']
    }

    svm = SVC()
    search = RandomizedSearchCV(
        svm,
        param_dist,
        n_iter=20,  # Optimal balance between speed and performance
        cv=3,
        n_jobs=-1,  # Use all CPU cores
        random_state=42
    )

    search.fit(X_train, y_train)

    # Get best model and predictions
    best_svm = search.best_estimator_
    y_pred = best_svm.predict(X_test)
    test_acc = accuracy_score(y_test, y_pred)

    return {
        'Sample #': sample_num,
        'Train Size': len(X_train),
        'Test Size': len(X_test),
        'Accuracy': test_acc,
        'Best C': search.best_params_['C'],
        'Best Gamma': search.best_params_['gamma'],
        'Best Kernel': search.best_params_['kernel'],
        'Search History': search.cv_results_['mean_test_score']
    }

# Running Optimization for all samples

In [8]:
results = []
for i, (X_train, X_test, y_train, y_test) in enumerate(samples, 51):
    print(f"\nProcessing Sample #{i}...")
    result = optimize_svm(X_train, y_train, X_test, y_test, i)
    results.append(result)
    print(f"Completed Sample #{i} with accuracy: {result['Accuracy']:.4f}")


Processing Sample #51...
Completed Sample #51 with accuracy: 0.7521

Processing Sample #52...
Completed Sample #52 with accuracy: 0.7396

Processing Sample #53...
Completed Sample #53 with accuracy: 0.7125

Processing Sample #54...
Completed Sample #54 with accuracy: 0.7417

Processing Sample #55...
Completed Sample #55 with accuracy: 0.7396

Processing Sample #56...
Completed Sample #56 with accuracy: 0.7521

Processing Sample #57...
Completed Sample #57 with accuracy: 0.7312

Processing Sample #58...
Completed Sample #58 with accuracy: 0.7417

Processing Sample #59...
Completed Sample #59 with accuracy: 0.7521

Processing Sample #60...
Completed Sample #60 with accuracy: 0.7104


In [9]:
# Create and save Table 1
table1 = pd.DataFrame(results)[['Sample #', 'Train Size', 'Test Size', 'Accuracy', 'Best C', 'Best Gamma', 'Best Kernel']]
print("\nTable 1. Comparative performance of Optimized-SVM")
print(table1.to_markdown(index=False))



Table 1. Comparative performance of Optimized-SVM
|   Sample # |   Train Size |   Test Size |   Accuracy |   Best C |   Best Gamma | Best Kernel   |
|-----------:|-------------:|------------:|-----------:|---------:|-------------:|:--------------|
|         51 |         1119 |         480 |   0.752083 | 13.1451  |  0.0964386   | linear        |
|         52 |         1119 |         480 |   0.739583 | 56.782   |  0.000100901 | linear        |
|         53 |         1119 |         480 |   0.7125   | 13.1451  |  0.0964386   | linear        |
|         54 |         1119 |         480 |   0.741667 | 56.782   |  0.000100901 | linear        |
|         55 |         1119 |         480 |   0.739583 | 78.5276  |  0.00145833  | linear        |
|         56 |         1119 |         480 |   0.752083 | 56.782   |  0.000100901 | linear        |
|         57 |         1119 |         480 |   0.73125  |  2.80164 |  0.000498275 | linear        |
|         58 |         1119 |         480 |   0.741667 | 7

In [11]:
plt.figure(figsize=(10, 4))
plt.axis('off')
plt.table(cellText=table1.values, colLabels=table1.columns, loc='center')
plt.savefig('results_table.png', bbox_inches='tight', dpi=300)
plt.close()

In [12]:
# Plot convergence for best sample
best_sample = max(results, key=lambda x: x['Accuracy'])
plt.figure(figsize=(10,6))
plt.plot(best_sample['Search History'], 'o-')
plt.title(f"Convergence Graph (Best Sample #{best_sample['Sample #']}, Accuracy={best_sample['Accuracy']:.4f})")
plt.xlabel("Iteration")
plt.ylabel("Accuracy")
plt.grid(True)
plt.savefig('svm_convergence.png')
plt.close()

print("\nAll results saved successfully!")


All results saved successfully!
