#1)Installing the needed Libraries :

In [26]:
!pip install mlxtend
!pip install cvxopt
!pip install cvxpy



#2)Importing needed Libraries :

In [68]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from cvxopt import matrix, solvers
import cvxpy as cp
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score

#3)Saving Paths of Data Sets into Variables :

In [69]:
data_paths=[
    '/content/Aggregation.txt',
    '/content/Compound.txt',
    '/content/Flame.txt',
    '/content/Jain.txt',
    '/content/Pathbased.txt' ,
    '/content/Spiral.txt',
    ]


#4)Function to load and preprocess data :

In [70]:
def load_and_preprocess_data(data_path):
    # reading each data set from it's Path and Skipping the First 7 lines in each data set file
    df = pd.read_csv(data_path, sep="\t", skiprows=7, header=None, names=['feature1', 'feature2', 'label'])
    # Saving the first two columns in x and the last one in y in which it represents the features and Labels
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    # Spliting the data into training and testing data with 20 % for testing and the other percentage for training
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # making the features in equal weights using standard scaler
    scaler = StandardScaler()
    # Transforming the training and testing data using Scaler
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test,df

#5)List of Kernals and regularization_parameters :



In [71]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# C=1.0 (with regularization), C=1e-6 (without regularization)
regularization_params = [1000, 1e-6]

#6)Loop through each dataset and kernal regularization Parameters :

In [72]:
# Loop through datasets, kernels, and regularization parameters
for data_path in data_paths:
    X_train, X_test, y_train, y_test,df= load_and_preprocess_data(data_path)
    # Generate descriptive statistics using pandas describe function
    dataset_description = df.describe().to_string()
    # Showing the description of each data set
    print(f"Dataset: {data_path}\n{dataset_description}")
    for kernel in kernels:
        for C in regularization_params:
            # Create SVM model with regularization for one moment and without in another
            SVM_Model = SVC(kernel=kernel,C=C)
            # Train the model
            SVM_Model.fit(X_train, y_train)
            # Evaluate the model
            Accuracy = SVM_Model.score(X_test, y_test)
            # Comment on the result
            print(f"Dataset: {data_path}, Kernel: {kernel}, Regularization: {'With' if C > 1e-6 else 'Without'}")
            print(f"Accuracy: {Accuracy:.2f}\n")
            # Visualize decision boundaries
            plt.figure()
            # Create a meshgrid to plot decision regions
            # Step size in the mesh
            h = .02
            x_min, x_max = X_test[:, 0].min() - 1, X_test[:, 0].max() + 1
            y_min, y_max = X_test[:, 1].min() - 1, X_test[:, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
            Z = SVM_Model.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            # Plot decision regions
            plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
            # Plot data points
            plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, edgecolors='k', cmap=plt.cm.coolwarm)
            plt.title(f'Dataset: {data_path}, Kernel: {kernel}, Regularization: {"With" if C > 1e-6 else "Without"}')
            plt.xlabel('Feature 1')
            plt.ylabel('Feature 2')
            plt.show()


Output hidden; open in https://colab.research.google.com to view.

# Bonus Part :

# Building SVM function from scratch , and using GridSearchCV with various hyperparameters, kernel options, and regularization techniques to optimize your model's performance.

In [80]:
class CustomSVM:
    def __init__(self, C=1.0):
        self.C = C
        self.kernel = self._linear_kernel
        self.alpha = None
        self.b = None
    def _linear_kernel(self, X1, X2):
        return np.dot(X1, X2.T)
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.X = X
        self.y = y
        self.alpha = np.zeros(n_samples)
        for _ in range(100):
            for i in range(n_samples):
                j = self._select_second_index(i, n_samples)
                if j == i:
                    continue
                E_i = self._decision_function(X[i]) - y[i]
                E_j = self._decision_function(X[j]) - y[j]
                alpha_i_old, alpha_j_old = self.alpha[i], self.alpha[j]
                L, H = self._compute_bounds(y[i], y[j], alpha_i_old, alpha_j_old)
                if L == H:
                    continue
                K_ii = np.dot(X[i], X[i])
                K_ij = np.dot(X[i], X[j])
                K_jj = np.dot(X[j], X[j])
                self.alpha[j] = self._update_alpha_j(alpha_j_old, y[j], E_i, E_j, K_ij, K_ii, K_jj)
                self.alpha[j] = np.clip(self.alpha[j], L, H)
                self.alpha[i] = alpha_i_old + y[i] * y[j] * (alpha_j_old - self.alpha[j])
                b1 = self.b - E_i - y[i] * (self.alpha[i] - alpha_i_old) * K_ii - y[j] * (self.alpha[j] - alpha_j_old) * K_ij
                b2 = self.b - E_j - y[i] * (self.alpha[i] - alpha_i_old) * K_ij - y[j] * (self.alpha[j] - alpha_j_old) * K_jj
                if 0 < self.alpha[i] < self.C:
                    self.b = b1
                elif 0 < self.alpha[j] < self.C:
                    self.b = b2
                else:
                    self.b = (b1 + b2) / 2
    def predict(self, X):
        return np.sign(self._decision_function(X) + self.b)
    def _decision_function(self, X):
        return np.dot(self.alpha * self.y, self._linear_kernel(X, self.X)) - self.b
    def _select_second_index(self, i, n_samples):
        j = i
        while j == i:
            j = np.random.randint(0, n_samples)
        return j
    def _compute_bounds(self, y_i, y_j, alpha_i, alpha_j):
        if y_i != y_j:
            L = max(0, alpha_j - alpha_i)
            H = min(self.C, self.C + alpha_j - alpha_i)
        else:
            L = max(0, alpha_i + alpha_j - self.C)
            H = min(self.C, alpha_i + alpha_j)
        return L, H
    def _update_alpha_j(self, alpha_j_old, y_j, E_i, E_j, K_ij, K_ii, K_jj):
        eta = 2 * K_ij - K_ii - K_jj
        if eta >= 0:
            return alpha_j_old
        return alpha_j_old - y_j * (E_i - E_j) / eta


* Plot Decision_regions



In [74]:
# Function to plot decision regions for a 2D dataset and a given classification model
def plot_decision_region(X, y, model, title='Decision Regions'):
    # Set the step size for the mesh grid
    h = 0.2
    # Determine the minimum and maximum values for the x-axis and y-axis
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    # Create a mesh grid using the specified step size
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    # Predict class labels for each point in the mesh grid using the given model
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    # Reshape the predicted labels to match the shape of the mesh grid
    Z = Z.reshape(xx.shape)
    # Create a new figure and axis
    fig, ax = plt.subplots()
    # Plot decision regions using filled contours
    contour = ax.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    # Scatter plot of the original data points with their true labels
    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm, edgecolors='k', marker='o')
    # Set labels for the x-axis and y-axis
    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    # Set the title of the plot
    ax.set_title(title)
    # Add a colorbar to the plot to indicate the decision regions
    plt.colorbar(contour, ax=ax, label='Decision Regions')
    # Return the figure and axis objects for further customization or display
    return fig, ax


* Defining Train and Apply SVM function :




In [78]:
# Function to train and apply SVM on a given dataset
def train_and_apply_svm(data_path, kernel='linear', gamma='scale', skip_lines=7):
    # Load the dataset from the specified path, skipping the specified number of lines
    data = np.loadtxt(data_path, skiprows=skip_lines)
    # Extract features (X) and labels (y) from the loaded data
    X = data[:, :-1]
    y = data[:, -1]
    # Split the data into training and testing sets using 80% for training and 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Define a parameter grid for hyperparameter tuning using GridSearchCV
    param_grid = {'C': [0.000001, 1, 1000], 'kernel': [kernel], 'gamma': [gamma]}
    # Create an instance of the CustomSVM class
    svm = CustomSVM()
    # Use GridSearchCV to perform hyperparameter tuning with 5-fold cross-validation
    grid_search = GridSearchCV(svm, param_grid, cv=5)
    # Fit the model on the training data to find the best hyperparameters
    grid_search.fit(X_train, y_train)
    # Get the best SVM model based on the hyperparameter tuning
    best_svm_model = grid_search.best_estimator_
    # Plot decision regions for the training data
    plot_decision_region(X_train, y_train, best_svm_model, title='Decision Regions - Training data')
    # Plot decision regions for the testing data
    plot_decision_region(X_test, y_test, best_svm_model, title='Decision Regions - Testing data')
    # Make predictions on the testing data using the best SVM model
    predictions = best_svm_model.predict(X_test)
    # Calculate accuracy by comparing predicted labels with actual labels
    accuracy = accuracy_score(y_test, predictions)
    # Create column names for the descriptive statistics DataFrame
    columns = [f'feature_{i+1}' for i in range(X_train.shape[1])]
    # Create a DataFrame for descriptive statistics on the training data
    df_stats = pd.DataFrame(X_train, columns=columns)
    # Add a column for the target labels in the descriptive statistics DataFrame
    df_stats['target'] = y_train
    # Calculate descriptive statistics for the DataFrame
    descriptive_stats = df_stats.describe()
    # Return the accuracy and descriptive statistics
    return accuracy, descriptive_stats


  * Applying the SVM on the different data sets

In [81]:
# Example usage:
for data_path in data_paths:
    accuracy, descriptive_stats = train_and_apply_svm(data_path)
    print(f"\nDataset: {data_path}")
    print(f"Accuracy on Test set: {accuracy:.2f}")
    print("Descriptive Statistics:")
    print(descriptive_stats)


TypeError: ignored