# HW7

Name: Xulai Wu <br>
Github Username: LukeWu5121 <br>
USC ID: 6591102106

In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from scipy.stats import mode
from sklearn.metrics import calinski_harabasz_score


## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

### a.

In [26]:
df = pd.read_csv('../Data/Frogs_MFCCs.csv')
print(df.columns)

label_cols = ['Family', 'Genus', 'Species']
x = df.drop(columns=label_cols)
y = df[label_cols]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, stratify=y['Family'])

Index(['MFCCs_ 1', 'MFCCs_ 2', 'MFCCs_ 3', 'MFCCs_ 4', 'MFCCs_ 5', 'MFCCs_ 6',
       'MFCCs_ 7', 'MFCCs_ 8', 'MFCCs_ 9', 'MFCCs_10', 'MFCCs_11', 'MFCCs_12',
       'MFCCs_13', 'MFCCs_14', 'MFCCs_15', 'MFCCs_16', 'MFCCs_17', 'MFCCs_18',
       'MFCCs_19', 'MFCCs_20', 'MFCCs_21', 'MFCCs_22', 'Family', 'Genus',
       'Species', 'RecordID'],
      dtype='object')


### b.

### i).

In [27]:
def hamming_score(y_true, y_pred):
    return (y_true == y_pred).sum().sum() / (y_true.shape[0] * y_true.shape[1])

def exact_match_score(y_true, y_pred):
    return (y_true == y_pred).all(axis=1).mean()

def hamming_loss_custom(y_true, y_pred):
    return 1 - hamming_score(y_true, y_pred)


### ii).

In [42]:
svm_models = {}
label_encoders = {}
param_grid = {
    'estimator__C': [1e-1, 1, 10, 100],
    #param_grid = [0.1, 1, 10] [0.1, 1]
    'estimator__gamma': [0.1, 0.5, 1]
}

for label in ['Family', 'Genus', 'Species']:
    print(f"\nTraining for label: {label}")
    
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train[label])
    y_test_encoded = le.transform(y_test[label])
    label_encoders[label] = le

    model = make_pipeline(
        StandardScaler(),
        GridSearchCV(
            OneVsRestClassifier(SVC(kernel='rbf')),
            param_grid=param_grid,
            cv=10,
            scoring='accuracy',
            n_jobs=-1
        )
    )

    model.fit(x_train, y_train_encoded)
    best_c = model.named_steps['gridsearchcv'].best_params_['estimator__C']
    best_gamma = model.named_steps['gridsearchcv'].best_params_['estimator__gamma']
    print(f"RBF SVM - Best C for {label}: {best_c}")
    print(f"RBF SVM - Best gamma for {label}: {best_gamma}")
    
    svm_models[label] = model



Training for label: Family
RBF SVM - Best C for Family: 100
RBF SVM - Best gamma for Family: 0.1

Training for label: Genus
RBF SVM - Best C for Genus: 10
RBF SVM - Best gamma for Genus: 0.1

Training for label: Species
RBF SVM - Best C for Species: 10
RBF SVM - Best gamma for Species: 0.1


In [43]:
y_pred = pd.DataFrame(index=y_test.index)

for label in ['Family', 'Genus', 'Species']:
    model = svm_models[label]
    le = label_encoders[label]
    y_pred[label] = le.inverse_transform(model.predict(x_test))
    
print("Exact Match:", exact_match_score(y_test, y_pred))
print("Hamming Score:", hamming_score(y_test, y_pred))
print("Hamming Loss:", hamming_loss_custom(y_test, y_pred))  

Exact Match: 0.9893469198703103
Hamming Score: 0.9922803767176162
Hamming Loss: 0.007719623282383847


### iii).

In [30]:
param_grid_l1 = {
    'estimator__C': [1e-1, 1, 10, 100]
}

l1_models = {}
l1_label_encoders = {}

for label in ['Family', 'Genus', 'Species']:
    print(f"\nTraining L1-penalized SVM for label: {label}")
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train[label])
    y_test_encoded = le.transform(y_test[label])
    l1_label_encoders[label] = le
    
    model = make_pipeline(
        StandardScaler(),
        GridSearchCV(
            OneVsRestClassifier(
                LinearSVC(penalty='l1', dual=False, max_iter=10000)
            ),
            param_grid=param_grid_l1,
            cv=10, scoring='accuracy', n_jobs=-1
        )
    )

    model.fit(x_train, y_train_encoded)
    best_c = model.named_steps['gridsearchcv'].best_params_['estimator__C']
    print(f"L1 SVM - Best C for {label}: {best_c}")
    l1_models[label] = model


Training L1-penalized SVM for label: Family




L1 SVM - Best C for Family: 100

Training L1-penalized SVM for label: Genus




L1 SVM - Best C for Genus: 10

Training L1-penalized SVM for label: Species
L1 SVM - Best C for Species: 10




In [31]:
l1_pred = pd.DataFrame(index=y_test.index)

for label in ['Family', 'Genus', 'Species']:
    model = l1_models[label]
    le = l1_label_encoders[label]
    l1_pred[label] = le.inverse_transform(model.predict(x_test))
    
print("L1-SVM Exact Match:", exact_match_score(y_test, l1_pred))
print("L1-SVM Hamming Score:", hamming_score(y_test, l1_pred))
print("L1-SVM Hamming Loss:", hamming_loss_custom(y_test, l1_pred))

L1-SVM Exact Match: 0.9546086150995832
L1-SVM Hamming Score: 0.9797745870001544
L1-SVM Hamming Loss: 0.020225412999845593


### iv).

In [37]:
param_grid_l1_1 = {
    'estimator__C': [1e-1, 1, 10]
}

smote_models = {}
smote_label_encoders = {}

for label in ['Family', 'Genus', 'Species']:
    print(f"\nTraining for label: {label}")
    
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train[label])
    y_test_encoded = le.transform(y_test[label])
    smote_label_encoders[label] = le

    smote = SMOTE(random_state=42)

    pipe = Pipeline([
        ('smote', smote),
        ('scaler', StandardScaler()),
        ('classifier', GridSearchCV(
            OneVsRestClassifier(LinearSVC(penalty='l1', dual=False, max_iter=10000)),
            param_grid=param_grid_l1_1,
            cv=10, scoring='accuracy', n_jobs=-1
        ))
    ])

    pipe.fit(x_train, y_train_encoded)
    best_c = pipe.named_steps['classifier'].best_params_['estimator__C']
    print(f"SMOTE L1 SVM - Best C for {label}: {best_c}")
    smote_models[label] = pipe


Training for label: Family
SMOTE L1 SVM - Best C for Family: 10

Training for label: Genus
SMOTE L1 SVM - Best C for Genus: 10

Training for label: Species




SMOTE L1 SVM - Best C for Species: 10


In [33]:
smote_pred = pd.DataFrame(index=y_test.index)

for label in ['Family', 'Genus', 'Species']:
    model = smote_models[label]
    le = smote_label_encoders[label]
    smote_pred[label] = le.inverse_transform(model.predict(x_test))

print("SMOTE Exact Match:", exact_match_score(y_test, smote_pred))
print("SMOTE Hamming Score:", hamming_score(y_test, smote_pred))
print("SMOTE Hamming Loss:", hamming_loss_custom(y_test, smote_pred))


SMOTE Exact Match: 0.9421028253821213
SMOTE Hamming Score: 0.974679635633781
SMOTE Hamming Loss: 0.025320364366218984


#### Compare the exact match, hamming score and hamming loss, it did not improve too much.

## 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set Monte-Carlo Simulation 

In [34]:
df = pd.read_csv('../Data/Frogs_MFCCs.csv')
X = df.iloc[:, :22]  # MFCC特征
y = df[['Family', 'Genus', 'Species']]

In [38]:
best_k = None
best_score = -1

for k in range(2, 51):  
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(X)
    score = calinski_harabasz_score(X, labels)
    if score > best_score:
        best_score = score
        best_k = k
print("Best k:", best_k)


Best k: 2


### b&c).

In [39]:
cluster_labels = kmeans.labels_
y_clustered = y.copy()
y_clustered['cluster'] = cluster_labels

def hamming_distance(y_true, y_pred):
    return (y_true != y_pred).sum().sum() / y_true.shape[0]

distances = []
scores = []
losses = []

for i in range(50):
    kmeans = KMeans(n_clusters=best_k, random_state=i)
    cluster_labels = kmeans.fit_predict(X)
    df_clustered = y.copy()
    df_clustered['cluster'] = cluster_labels

    majority_labels = df_clustered.groupby('cluster').agg(lambda x: x.mode()[0])
    pred_labels = df_clustered['cluster'].map(majority_labels.to_dict(orient='index'))

    pred_df = pd.DataFrame(list(pred_labels), index=df_clustered.index)
    pred_df.columns = ['Family', 'Genus', 'Species']

    dist = hamming_distance(y, pred_df)
    score = hamming_score(y, pred_df)
    loss = hamming_loss_custom(y, pred_df)

    distances.append(dist)
    scores.append(score)
    losses.append(loss)

print("Avg Hamming Distance:", np.mean(distances))
print("Avg Hamming Score:", np.mean(scores))
print("Avg Hamming Loss:", np.mean(losses))

Avg Hamming Distance: 0.895621959694232
Avg Hamming Score: 0.7014593467685892
Avg Hamming Loss: 0.2985406532314107


## 3. ISLP 12.6.2

### a).

```text
        0.8
        ├──────────────┐
      0.3             0.45
     ┌─┴─┐           ┌─┴─┐
     1   2           3   4
```

#### First merge 1 and 2, we have 0.3. Next one is 3 and 4 since it's 0.45(smallest). Then 0.8

### b).

```text   
   0.45 
   ├──────────────┐ 
 0.4              4 
 ┌─┴─┐
0.3  3
 ├───┐
 1   2
```

#### 0.3 then 0.4 then 0.45

### c).

#### Only [1,2] and [3,4]

### d).

#### [1,2,3], [4]

### e).

```text
        0.8
        ├──────────────┐
      0.45             0.3
     ┌─┴─┐           ┌─┴─┐
     3   4           1   2
```

#### REF: <br>
https://mmuratarat.github.io/2020-01-25/multilabel_classification_metrics