In [19]:
# 17-04-2024
# CSC354 – Assignment3 – ML – Support Vector Machines
# Hamna Shahbaz
# FA21-BSE-048
# Using SVM to classify datapoints and evaluating best models

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC

Q1.
1. It is not possible to properly fit the dataset into two different classes without any label for either class.
We cannot determine which datapoint belongs to which class without any labelled classification. Thus, it is not possible to fit the model onto an unlabelled dataset and distinguish its negative and positive instances.
The following code would fit the model and separate the datapoints into classes if the 'X' and 'Y' datapoints came with a 'label' column.

In [None]:
#Question 1.1
# Hypothetical workflow: it only runs if the CSV carries a 'label' column.
dataset = pd.read_csv("dataset-q-1.csv")

# Everything except 'label' is a feature; 'label' itself is the target.
X = dataset.drop('label', axis=1)
y = dataset['label']

# Linear-kernel classifier; fit() returns the fitted estimator itself.
svm = SVC(kernel='linear').fit(X, y)


Q1.
2. It is possible to somewhat spot outliers by using a scatter plot and a decision boundary, by distinguishing the points that lie in the far corners of the plot. The decision boundary will just be there to separate the area of the model and would not actually have any significant meaning, as there will be no classification task. The given code would plot the scatter plot across a boundary in an unnamed, dummy labelled column.

In [None]:
#Question 1.2
# The dataset has no labels, and a classifier cannot be trained on a constant
# dummy target: SVC.fit raises ValueError when the target contains a single
# class. A one-class SVM is fitted on the unlabelled points instead; it learns
# a boundary around the bulk of the data and still exposes decision_function,
# so points falling outside the zero contour can be read as outliers.
from sklearn.svm import OneClassSVM

dataset = pd.read_csv("dataset-q-1.csv")

# Drop the 'Unnamed: 0' column so only the remaining columns act as features.
X = dataset.drop(columns=['Unnamed: 0'])

svm = OneClassSVM(kernel='rbf')
svm.fit(X)

# Function to plot decision boundary
def plot_decision_boundary(X, model, resolution=30):
    """Scatter the first two columns of ``X`` and overlay the model's decision contours.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to plot. Column 0 is drawn on the x-axis and column 1 on the
        y-axis; their names are used as the axis labels.
    model : fitted estimator
        Must provide ``decision_function`` returning one value per sample
        for two-feature input.
    resolution : int, default 30
        Number of grid points per axis at which the decision function is
        evaluated.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    ax = plt.gca()
    ax.scatter(X.iloc[:, 0], X.iloc[:, 1], color='blue', label='Data Points')

    # Limits are read after scattering so the grid spans exactly the plotted area.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    xx = np.linspace(xlim[0], xlim[1], resolution)
    yy = np.linspace(ylim[0], ylim[1], resolution)
    YY, XX = np.meshgrid(yy, xx)
    grid = np.column_stack([XX.ravel(), YY.ravel()])

    # An estimator fitted on a DataFrame records its column names and warns when
    # it is later given a bare array; hand the grid over under the same names.
    if hasattr(model, 'feature_names_in_'):
        grid = pd.DataFrame(grid, columns=list(model.feature_names_in_))
    Z = model.decision_function(grid).reshape(XX.shape)

    # Level 0 is the decision boundary (solid); -1 and +1 are drawn dashed.
    ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5, linestyles=['--', '-', '--'])
    ax.set_xlabel(X.columns[0])
    ax.set_ylabel(X.columns[1])
    ax.set_title('Decision Boundary and Data Points')
    ax.legend()
    plt.show()

# Draw the points in X together with the decision contours of the fitted `svm`.
plot_decision_boundary(X, svm)


In [13]:
#Question 03
#Loading the dataset and fitting it onto the model

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
# Use the first two features
X = iris.data[:, :2]
y = iris.target

# Split the dataset into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Polynomial kernel (degree = 2)
svm_poly = SVC(kernel='poly', degree=2)
svm_poly.fit(X_train, y_train)

# Gaussian kernel (sigma = 1)
# scikit-learn's RBF kernel is exp(-gamma * ||x - x'||^2) while the Gaussian
# kernel is exp(-||x - x'||^2 / (2 * sigma^2)), so gamma = 1 / (2 * sigma^2).
# Passing gamma=1 directly would correspond to sigma ~= 0.707, not sigma = 1.
SIGMA = 1.0
svm_gaussian = SVC(kernel='rbf', gamma=1.0 / (2.0 * SIGMA ** 2))
svm_gaussian.fit(X_train, y_train)

# Evaluate the performance of each model
poly_accuracy = accuracy_score(y_test, svm_poly.predict(X_test))
gaussian_accuracy = accuracy_score(y_test, svm_gaussian.predict(X_test))

print("Accuracy with polynomial kernel (degree = 2):", poly_accuracy)
print("Accuracy with Gaussian kernel (sigma = 1):", gaussian_accuracy)


Accuracy with polynomial kernel (degree = 2): 0.8333333333333334
Accuracy with Gaussian kernel (sigma = 1): 0.9


In [None]:
#Question 03
#Creating a function for evaluating the performance of both kernels over a grid of parameter values
def _make_svc(kernel, param1_name, param1_value, C=None):
    """Build an unfitted SVC for a single hyper-parameter setting."""
    params = {'kernel': kernel}
    if kernel == 'poly':
        params['degree'] = param1_value
    else:
        if param1_name == 'sigma':
            # scikit-learn parameterises the RBF kernel by gamma, where
            # gamma = 1 / (2 * sigma^2); convert so 'sigma' really means sigma.
            if param1_value <= 0:
                raise ValueError("sigma must be positive, got {!r}".format(param1_value))
            params['gamma'] = 1.0 / (2.0 * param1_value ** 2)
        else:
            params['gamma'] = param1_value
    if C is not None:
        params['C'] = C
    return SVC(**params)


def evaluate_svm(X_train, X_test, y_train, y_test, kernel, param1_name, param1_values, param2_name=None, param2_values=None):
    """Fit one SVC per hyper-parameter setting and record its test accuracy.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Training and testing splits; each model is fitted on the training
        split and scored on the testing split with ``accuracy_score``.
    kernel : {'poly', 'rbf'}
        Kernel to evaluate.
    param1_name : str
        Label stored in the results. For ``kernel='rbf'`` the label ``'sigma'``
        makes each value be converted with gamma = 1 / (2 * sigma^2); any other
        label passes the value to ``gamma`` unchanged.
    param1_values : iterable
        Values for ``degree`` (poly) or for sigma/gamma (rbf).
    param2_name : str, optional
        Label stored in the results for the second parameter.
    param2_values : sequence, optional
        Values passed as ``C``. When omitted or empty, only the first
        parameter is varied and ``C`` keeps its default.

    Returns
    -------
    list of tuple
        ``(param1_name, param1_value, param2_name, param2_value, accuracy)``
        when a second parameter is varied, otherwise
        ``(param1_name, param1_value, accuracy)``.

    Raises
    ------
    ValueError
        If ``kernel`` is not ``'poly'`` or ``'rbf'``, or a sigma value is not
        positive.
    """
    # Reject unknown kernels up front; previously `model` was left unbound (or
    # silently reused from an earlier iteration) in that case.
    if kernel not in ('poly', 'rbf'):
        raise ValueError("Unsupported kernel {!r}; expected 'poly' or 'rbf'.".format(kernel))

    # len() instead of truthiness so numpy arrays are accepted as well as lists.
    vary_second = param2_values is not None and len(param2_values) > 0

    results = []
    for param1_value in param1_values:
        if vary_second:
            for param2_value in param2_values:
                model = _make_svc(kernel, param1_name, param1_value, C=param2_value)
                model.fit(X_train, y_train)
                accuracy = accuracy_score(y_test, model.predict(X_test))
                results.append((param1_name, param1_value, param2_name, param2_value, accuracy))
        else:
            model = _make_svc(kernel, param1_name, param1_value)
            model.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test))
            results.append((param1_name, param1_value, accuracy))
    return results

# Candidate values for each hyper-parameter
C_values = [0.1, 1, 10]
degree_values = [1, 2, 3]
sigma_values = [0.1, 1, 10]

# Task 2: polynomial kernel over every (degree, C) pair
poly_results = evaluate_svm(
    X_train, X_test, y_train, y_test,
    kernel='poly',
    param1_name='degree', param1_values=degree_values,
    param2_name='C', param2_values=C_values,
)
print("Polynomial Kernel Results:")
# Both grids above are non-empty, so this prints one result tuple per line.
print(*poly_results, sep="\n")

# Task 3: Gaussian kernel over every pair of a value labelled 'sigma' and C
gaussian_results = evaluate_svm(
    X_train, X_test, y_train, y_test,
    kernel='rbf',
    param1_name='sigma', param1_values=sigma_values,
    param2_name='C', param2_values=C_values,
)
print("\nGaussian Kernel Results:")
print(*gaussian_results, sep="\n")


Q3.
**Findings**

*   Varying C value:  We tried three different values of C: 0.1, 1, and 10. As C increases from 0.1 to 1 and then to 10, the accuracy of the SVM model may increase if the model doesn't overfit.
*   Varying Sigma value:  As sigma increases from 0.1 to 1 and then to 10, the decision boundary might become more complex, but it could also lead to overfitting if not properly controlled.

*   Varying C value: Increasing C generally results in more complex decision boundaries, which can better fit the training data but may not generalize well to unseen data.
*   Varying Degree value: We explored three different values of degree: 1, 2, and 3. As the degree increases, the decision boundary becomes more flexible and can fit more complex patterns in the data.





In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

dataset = pd.read_csv("dataset-q-4.csv")

# Separate features (X) and labels (y)
X = dataset.drop(columns=['label'])
y = dataset['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
param_grid = {'C': [0.1, 1, 10],
              'gamma': [0.1, 1, 10]}
svm = SVC(kernel='rbf')

# 5-fold cross-validated search over every (C, gamma) pair, on the training split only.
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_train, y_train)

optimal_C = grid_search.best_params_['C']
optimal_gamma = grid_search.best_params_['gamma']
# The grid is expressed in gamma; since gamma = 1 / (2 * sigma^2), the matching
# Gaussian width is sigma = sqrt(1 / (2 * gamma)). Reporting gamma as "sigma"
# would mislabel the result.
optimal_sigma = (1.0 / (2.0 * optimal_gamma)) ** 0.5

# GridSearchCV refits the best parameter combination on the full training split
# by default (refit=True), so the fitted model is reused instead of retrained.
optimal_svm = grid_search.best_estimator_

y_pred = optimal_svm.predict(X_test)
evaluation_result = accuracy_score(y_test, y_pred)

print("Optimal value of C:", optimal_C)
print("Optimal value of gamma:", optimal_gamma)
print("Optimal value of sigma:", optimal_sigma)
print("Evaluation result (accuracy) on the testing data:", evaluation_result)

