# Exploring Multi-class and Multi-Label Classification with Support Vector Machines

Import packages

In [40]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, hamming_loss
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

### (a) Dataset Retrieval:

Access the Anuran Calls (MFCCs) Data Set from the UCI Machine Learning Repository at:
https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29
. Randomly designate 70% of the dataset as the training set.

In [3]:
file_path = '../hw7/Frogs_MFCCs.csv'
data = pd.read_csv(file_path)
X = data.drop(columns=['Family', 'Genus', 'Species'])
y = data[['Family', 'Genus', 'Species']]
train_size = 0.7
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_size, random_state=42
)

### (b) Advanced Classification Strategies:

#### (i) Metric Assessment:

Conduct an in-depth exploration of exact match and Hamming score/loss methods tailored for evaluating multi-label classification. Apply these methodologies to assess classifiers trained for each label using a binary relevance approach.

In [4]:
y_train_family = (y_train['Family'] == 'Leptodactylidae').astype(int)
y_train_genus = (y_train['Genus'] == 'Adenomera').astype(int)
y_train_species = (y_train['Species'] == 'AdenomeraAndre').astype(int)

svm_family = SVC()
svm_genus = SVC()
svm_species = SVC()

svm_family.fit(X_train, y_train_family)
svm_genus.fit(X_train, y_train_genus)
svm_species.fit(X_train, y_train_species)

y_pred_family = svm_family.predict(X_test)
y_pred_genus = svm_genus.predict(X_test)
y_pred_species = svm_species.predict(X_test)

hamming_loss_family = hamming_loss(y_test['Family'], y_pred_family)
hamming_loss_genus = hamming_loss(y_test['Genus'], y_pred_genus)
hamming_loss_species = hamming_loss(y_test['Species'], y_pred_species)

exact_match_ratio = ((y_pred_family == y_test['Family']) &
                     (y_pred_genus == y_test['Genus']) &
                     (y_pred_species == y_test['Species'])).mean()

print(f"Hamming Loss (Family): {hamming_loss_family}")
print(f"Hamming Loss (Genus): {hamming_loss_genus}")
print(f"Hamming Loss (Species): {hamming_loss_species}")
print(f"Exact Match Ratio: {exact_match_ratio}")

Hamming Loss (Family): 1.0
Hamming Loss (Genus): 1.0
Hamming Loss (Species): 1.0
Exact Match Ratio: 0.0


#### (ii) Train a SVM for each of the labels

Initiate the training of Support Vector Machines (SVMs) for each label, employing Gaussian kernels and adopting one-versus-all classifiers. The determination of SVM penalty weight and Gaussian Kernel width will be achieved through rigorous 10-fold cross-validation. Experiments will be conducted with both standardized and raw attributes.

In [64]:
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, train_size=0.7, random_state=42)

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}
svm_family = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=10)
svm_genus = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=10)
svm_species = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=10)

svm_family.fit(X_train, y_train['Family'])
svm_genus.fit(X_train, y_train['Genus'])
svm_species.fit(X_train, y_train['Species'])

best_c_family = svm_family.best_params_['C']
best_gamma_family = svm_family.best_params_['gamma']

best_c_genus = svm_genus.best_params_['C']
best_gamma_genus = svm_genus.best_params_['gamma']

best_c_species = svm_species.best_params_['C']
best_gamma_species = svm_species.best_params_['gamma']
y_pred_family = svm_family.predict(X_test)
y_pred_genus = svm_genus.predict(X_test)
y_pred_species = svm_species.predict(X_test)

hamming_loss_family = hamming_loss(y_test['Family'], y_pred_family)
hamming_loss_genus = hamming_loss(y_test['Genus'], y_pred_genus)
hamming_loss_species = hamming_loss(y_test['Species'], y_pred_species)

exact_match_ratio = ((y_pred_family == y_test['Family']) &
                     (y_pred_genus == y_test['Genus']) &
                     (y_pred_species == y_test['Species'])).mean()

print(f"Hamming Loss (Family): {hamming_loss_family}")
print(f"Hamming Loss (Genus): {hamming_loss_genus}")
print(f"Hamming Loss (Species): {hamming_loss_species}")
print(f"Exact Match Ratio: {exact_match_ratio}")

Hamming Loss (Family): 0.0041685965724872626
Hamming Loss (Genus): 0.0037054191755442334
Hamming Loss (Species): 0.004631773969430292
Exact Match Ratio: 0.9925891616489115


#### (iii)  L1-Penalized SVMs:

Repeat SVM training, introducing L1-penalized SVMs into the mix. Standardize attributes and ascertain the penalty weight through meticulous 10-fold cross-validation.

In [41]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1, 10, 100]
}

l1_svm_family = GridSearchCV(LinearSVC(penalty='l1', dual=False, max_iter=10000), param_grid, cv=10)
l1_svm_genus = GridSearchCV(LinearSVC(penalty='l1', dual=False, max_iter=10000), param_grid, cv=10)
l1_svm_species = GridSearchCV(LinearSVC(penalty='l1', dual=False, max_iter=10000), param_grid, cv=10)

l1_svm_family.fit(X_train, y_train['Family'])
l1_svm_genus.fit(X_train, y_train['Genus'])
l1_svm_species.fit(X_train, y_train['Species'])

best_c_l1_family = l1_svm_family.best_params_['C']
best_c_l1_genus = l1_svm_genus.best_params_['C']
best_c_l1_species = l1_svm_species.best_params_['C']

print("Best C (penalty) for L1-penalized SVMs:")
print(f"C (Family): {best_c_l1_family}")
print(f"C (Genus): {best_c_l1_genus}")
print(f"C (Species): {best_c_l1_species}")

Best C (penalty) for L1-penalized SVMs:
C (Family): 100
C (Genus): 100
C (Species): 100


#### (iv) Tackling Class Imbalance:

Extend the L1-penalized SVM training to address class imbalance, utilizing methodologies such as SMOTE or other suitable techniques. Subsequently, draw insightful conclusions based on the performance of the trained classifiers.

In [8]:
smote = SMOTE(random_state=42)

X_resampled_family, y_resampled_family = smote.fit_resample(X_train, y_train_family)
X_resampled_genus, y_resampled_genus = smote.fit_resample(X_train, y_train_genus)
X_resampled_species, y_resampled_species = smote.fit_resample(X_train, y_train_species)

svm_family_smote = SVC()
svm_genus_smote = SVC()
svm_species_smote = SVC()

svm_family_smote.fit(X_resampled_family, y_resampled_family)
svm_genus_smote.fit(X_resampled_genus, y_resampled_genus)
svm_species_smote.fit(X_resampled_species, y_resampled_species)

y_pred_family_smote = svm_family_smote.predict(X_test)
y_pred_genus_smote = svm_genus_smote.predict(X_test)
y_pred_species_smote = svm_species_smote.predict(X_test)

hamming_loss_family_smote = hamming_loss(y_test['Family'], y_pred_family_smote)
hamming_loss_genus_smote = hamming_loss(y_test['Genus'], y_pred_genus_smote)
hamming_loss_species_smote = hamming_loss(y_test['Species'], y_pred_species_smote)

print("Hamming Loss with SMOTE:")
print(f"Hamming Loss (Family): {hamming_loss_family_smote}")
print(f"Hamming Loss (Genus): {hamming_loss_genus_smote}")
print(f"Hamming Loss (Species): {hamming_loss_species_smote}")

Hamming Loss with SMOTE:
Hamming Loss (Family): 1.0
Hamming Loss (Genus): 1.0
Hamming Loss (Species): 1.0


## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

### (a) k-means clustering:

Apply the k-means clustering algorithm to the entire Anuran Calls (MFCCs) Data Set without the conventional train-test data split. The optimal value of k will be automatically determined from the set {1, 2, ..., 50} using established methods like CH, Gap Statistics, scree plots, Silhouettes, or any other recognized approach.

In [19]:
file_path = '../hw7/Frogs_MFCCs.csv'
data = pd.read_csv(file_path)
X = data.drop(columns=['Family', 'Genus', 'Species'])
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

num_simulations = 50
silhouette_scores = []

for _ in range(num_simulations):
    for k in range(2, 51):  # Start with k=2 to ensure at least two clusters
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(X_standardized)
        cluster_labels = kmeans.labels_
        silhouette_avg = silhouette_score(X_standardized, cluster_labels)
        silhouette_scores.append(silhouette_avg)

average_silhouette_score = np.mean(silhouette_scores)
std_deviation_silhouette_score = np.std(silhouette_scores)

print(f"Average Silhouette Score over {num_simulations} simulations: {average_silhouette_score}")
print(f"Standard Deviation of Silhouette Score over {num_simulations} simulations: {std_deviation_silhouette_score}")

Average Silhouette Score over 50 simulations: 0.25224160686272257
Standard Deviation of Silhouette Score over 50 simulations: 0.03651893756582283


### (b) Majority Label Identification:

Within each cluster, ascertain the majority labels for family, genus, and species by referencing the true labels.

In [21]:
k = 5

file_path = 'Frogs_MFCCs.csv'
data = pd.read_csv(file_path)
X = data.drop(columns=['Family', 'Genus', 'Species'])
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_standardized)
data['Cluster'] = kmeans.labels_

def majority_label(cluster_df, label_column):
    counts = cluster_df[label_column].value_counts()
    majority_label = counts.idxmax()
    return majority_label

majority_families = []
majority_genus = []
majority_species = []

for cluster in range(k):
    cluster_data = data[data['Cluster'] == cluster]
    
    majority_family = majority_label(cluster_data, 'Family')
    majority_genus_cluster = majority_label(cluster_data, 'Genus')
    majority_species_cluster = majority_label(cluster_data, 'Species')
    
    majority_families.append(majority_family)
    majority_genus.append(majority_genus_cluster)
    majority_species.append(majority_species_cluster)
for cluster in range(k):
    print(f"Cluster {cluster + 1} - Majority Family: {majority_families[cluster]}, Majority Genus: {majority_genus[cluster]}, Majority Species: {majority_species[cluster]}")

Cluster 1 - Majority Family: Leptodactylidae, Majority Genus: Adenomera, Majority Species: AdenomeraHylaedactylus
Cluster 2 - Majority Family: Hylidae, Majority Genus: Hypsiboas, Majority Species: HypsiboasCinerascens
Cluster 3 - Majority Family: Leptodactylidae, Majority Genus: Adenomera, Majority Species: AdenomeraAndre
Cluster 4 - Majority Family: Hylidae, Majority Genus: Hypsiboas, Majority Species: HypsiboasCordobae
Cluster 5 - Majority Family: Leptodactylidae, Majority Genus: Adenomera, Majority Species: AdenomeraAndre


### (c) Hamming Distance Analysis:

Compute the average Hamming distance, Hamming score, and Hamming loss between the true labels and labels assigned by clusters for each cluster, providing insights into the efficacy of the K-Means Clustering approach in a multi-class and multi-label context.

In [30]:
k = 5
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_standardized)

data['Cluster'] = kmeans.labels_

def calculate_hamming_metrics(cluster_df, label_column, majority_labels):
    true_labels = cluster_df[label_column].values
    cluster_labels = np.array([majority_labels[cluster_df['Cluster'].iloc[i]] for i in range(len(cluster_df))])
    
    hamming_distance = hamming_loss(true_labels, cluster_labels)
    hamming_score = 1.0 - hamming_distance
    
    return hamming_distance, hamming_score

majority_labels = {}

for label_column in ['Family', 'Genus', 'Species']:
    majority_labels[label_column] = []
    for cluster in range(k):
        cluster_data = data[data['Cluster'] == cluster]
        majority_label = cluster_data[label_column].mode().values[0]
        majority_labels[label_column].append(majority_label)

cluster_labels_df = data[['Family', 'Genus', 'Species', 'Cluster']]

hamming_distance_dict = {}
hamming_score_dict = {}

for label_column in ['Family', 'Genus', 'Species']:
    hamming_distance_list = []
    hamming_score_list = []

    for cluster in range(k):
        cluster_data = cluster_labels_df[cluster_labels_df['Cluster'] == cluster]
        hamming_distance, hamming_score = calculate_hamming_metrics(cluster_data, label_column, majority_labels[label_column])

        hamming_distance_list.append(hamming_distance)
        hamming_score_list.append(hamming_score)

    average_hamming_distance = np.mean(hamming_distance_list)
    average_hamming_score = np.mean(hamming_score_list)

    hamming_distance_dict[label_column] = average_hamming_distance
    hamming_score_dict[label_column] = average_hamming_score

for label_column in ['Family', 'Genus', 'Species']:
    print(f"Average Hamming Distance ({label_column}): {hamming_distance_dict[label_column]}")
    print(f"Average Hamming Score ({label_column}): {hamming_score_dict[label_column]}")

Average Hamming Distance (Family): 0.24500646025791079
Average Hamming Score (Family): 0.7549935397420893
Average Hamming Distance (Genus): 0.3062812003914012
Average Hamming Score (Genus): 0.6937187996085987
Average Hamming Distance (Species): 0.3272723139624235
Average Hamming Score (Species): 0.6727276860375765
