In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

data = pd.read_csv('../data/Anuran_Calls_(MFCCs)/Frogs_MFCCs.csv')
data
x_data = data.iloc[:, 0:22]
x_data
y_data = data.iloc[:,22:25]
y_data

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)
y_train

Unnamed: 0,Family,Genus,Species
2004,Leptodactylidae,Adenomera,AdenomeraHylaedactylus
1194,Dendrobatidae,Ameerega,Ameeregatrivittata
5359,Hylidae,Hypsiboas,HypsiboasCinerascens
1756,Leptodactylidae,Adenomera,AdenomeraHylaedactylus
497,Leptodactylidae,Adenomera,AdenomeraAndre
...,...,...,...
3772,Leptodactylidae,Adenomera,AdenomeraHylaedactylus
5191,Hylidae,Hypsiboas,HypsiboasCinerascens
5226,Hylidae,Hypsiboas,HypsiboasCinerascens
5390,Hylidae,Hypsiboas,HypsiboasCinerascens


## 1(b)i

Exact match: sklearn.metrics.accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None). Accuracy classification score. In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.(https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)

Hamming loss: sklearn.metrics.hamming_loss(y_true, y_pred, *, sample_weight=None).Compute the average Hamming loss.The Hamming loss is the fraction of labels that are incorrectly predicted. (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html)

## 1(b)ii

In [69]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score

y_train_F = y_train['Family']
y_test_F = y_test['Family']

y_train_G = y_train['Genus']
y_test_G = y_test['Genus']

y_train_S = y_train['Species']
y_test_S = y_test['Species']

c_list = []
for i in range(-3,3):
    c_list.append(10**i)
    
gamma_list = np.logspace(-3,3,10)    
svm= SVC(gamma='auto')
parameters = {'kernel':['rbf'],'gamma': gamma_list, 'C':c_list}
clf_F = GridSearchCV(svm, parameters, cv=10).fit(x_train, y_train_F)
clf_G = GridSearchCV(svm, parameters, cv=10).fit(x_train, y_train_G)
clf_S = GridSearchCV(svm, parameters, cv=10).fit(x_train, y_train_S)
print(f'best C and best gamma for Families labels are:', clf_F.best_params_)
print(f'best C and best gamma for Genus labels are:',clf_G.best_params_)
print(f'best C and best gamma for Species labels are:',clf_S.best_params_)

C_F = clf_F.best_params_['C']
gamma_F = clf_F.best_params_['gamma']
C_G = clf_F.best_params_['C']
gamma_G = clf_F.best_params_['gamma']
C_S = clf_F.best_params_['C']
gamma_S = clf_F.best_params_['gamma']

svm_F = SVC(C = C_F, gamma = gamma_F, kernel='rbf', random_state=1).fit(x_train, y_train_F)
svm_G = SVC(C = C_G, gamma = gamma_G, kernel='rbf', random_state=1).fit(x_train, y_train_G)
svm_S= SVC(C = C_S, gamma = gamma_S, kernel='rbf', random_state=1).fit(x_train, y_train_S)
pred_F = svm_F.predict(x_test)
pred_G = svm_G.predict(x_test)
pred_S = svm_S.predict(x_test)

print(f'hamming loss for Families labels is:', hamming_loss(y_test_F, pred_F))
print(f'hamming loss for Genus labels is:',hamming_loss(y_test_G, pred_G))
print(f'hamming loss for Species labels is:',hamming_loss(y_test_S, pred_S))
print(f'exact match for Families labels is:', accuracy_score(y_test_F, pred_F))
print(f'exact match for Genus labels is:',accuracy_score(y_test_G, pred_G))
print(f'exact match for Species labels is:',accuracy_score(y_test_S, pred_S))

best C and best gamma for Families labels are: {'C': 10, 'gamma': 2.154434690031882, 'kernel': 'rbf'}
best C and best gamma for Genus labels are: {'C': 10, 'gamma': 2.154434690031882, 'kernel': 'rbf'}
best C and best gamma for Species labels are: {'C': 10, 'gamma': 2.154434690031882, 'kernel': 'rbf'}
hamming loss for Families labels is: 0.00555812876331635
hamming loss for Genus labels is: 0.010653080129689671
hamming loss for Species labels is: 0.010653080129689671
exact match for Families labels is: 0.9944418712366836
exact match for Genus labels is: 0.9893469198703103
exact match for Species labels is: 0.9893469198703103


In [71]:
hamming_loss_qb2 =  (hamming_loss(y_test_F, pred_F) + hamming_loss(y_test_G, pred_G) + hamming_loss(y_test_S, pred_S))/3
hamming_loss_qb2 # average hamming loss

0.00895476300756523

In [72]:
exact_match_qb2 = (accuracy_score(y_test_F, pred_F) + accuracy_score(y_test_G, pred_G) + accuracy_score(y_test_S, pred_S))/3
exact_match_qb2 # average exact match value

0.9910452369924347

## 1(b)iii

In [4]:
from sklearn.svm import LinearSVC
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

scaler = preprocessing.StandardScaler()
stand_x_train = scaler.fit_transform(x_train)

parameters = {'C':c_list}
lin_svc = LinearSVC(penalty='l1', dual=False, random_state=1)

lin_clf_F = GridSearchCV(lin_svc, parameters, cv=10).fit(stand_x_train, y_train_F)
lin_clf_G = GridSearchCV(lin_svc, parameters, cv=10).fit(stand_x_train, y_train_G)
lin_clf_S = GridSearchCV(lin_svc, parameters, cv=10).fit(stand_x_train, y_train_S)

lin_C_F = lin_clf_F.best_params_['C']
lin_C_G = lin_clf_G.best_params_['C']
lin_C_S = lin_clf_G.best_params_['C']

svm_F = LinearSVC(penalty='l1', C = lin_C_F, dual=False, random_state=1).fit(stand_x_train, y_train_F)
svm_G = LinearSVC(penalty='l1', C = lin_C_G, dual=False, random_state=1).fit(stand_x_train, y_train_G)
svm_S= LinearSVC(penalty='l1', C = lin_C_S, dual=False, random_state=1).fit(stand_x_train, y_train_S)

stand_x_test = scaler.fit_transform(x_test)

pred_F = svm_F.predict(stand_x_test)
pred_G = svm_G.predict(stand_x_test)
pred_S = svm_S.predict(stand_x_test)

print(f'hamming loss for Families labels is:',hamming_loss(y_test_F, pred_F))
print(f'hamming loss for Genus labels is:',hamming_loss(y_test_G, pred_G))
print(f'hamming loss for Species labels is:',hamming_loss(y_test_S, pred_S))
print(f'exact match for Families labels is:',accuracy_score(y_test_F, pred_F))
print(f'exact match for Genus labels is:',accuracy_score(y_test_G, pred_G))
print(f'exact match for Species labels is:',accuracy_score(y_test_S, pred_S))

hamming loss for Families labels is: 0.07410838351088467
hamming loss for Genus labels is: 0.06113941639647985
hamming loss for Species labels is: 0.04075961093098657
exact match for Families labels is: 0.9258916164891153
exact match for Genus labels is: 0.9388605836035201
exact match for Species labels is: 0.9592403890690134


In [35]:
print(f'best C for Families labels is:', lin_C_F)
print(f'best C for Genus labels is:',lin_C_G)
print(f'best C for Species labels is:',lin_C_S)

best C for Families labels is: 1
best C for Genus labels is: 10
best C for Species labels is: 10


In [65]:
hamming_loss_qb3 =  (hamming_loss(y_test_F, pred_F) + hamming_loss(y_test_G, pred_G) + hamming_loss(y_test_S, pred_S))/3
hamming_loss_qb3 # average hamming loss

0.05866913694611703

In [66]:
exact_match_qb3 = (accuracy_score(y_test_F, pred_F) + accuracy_score(y_test_G, pred_G) + accuracy_score(y_test_S, pred_S))/3
exact_match_qb3 # average exact match value

0.9413308630538829

## 1(b)iv

In [6]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()

smote_x_train_F, smote_y_train_F = sm.fit_resample(stand_x_train, y_train_F)
smote_x_train_G, smote_y_train_G = sm.fit_resample(stand_x_train, y_train_G)
smote_x_train_S, smote_y_train_S = sm.fit_resample(stand_x_train, y_train_S)

parameters = {'C':c_list}
lin_svc = LinearSVC(penalty='l1', dual=False, random_state=1)

smote_clf_F = GridSearchCV(lin_svc, parameters, cv=10).fit(smote_x_train_F, smote_y_train_F)
smote_clf_G = GridSearchCV(lin_svc, parameters, cv=10).fit(smote_x_train_G, smote_y_train_G)
smote_clf_S = GridSearchCV(lin_svc, parameters, cv=10).fit(smote_x_train_S, smote_y_train_S)

smote_C_G = lin_clf_F.best_params_['C']
smote_C_G = lin_clf_G.best_params_['C']
smote_C_S = lin_clf_G.best_params_['C']

smote_svm_F = LinearSVC(penalty='l1', C = lin_C_F, dual=False, random_state=1).fit(smote_x_train_F, smote_y_train_F)
smote_svm_G = LinearSVC(penalty='l1', C = lin_C_G, dual=False, random_state=1).fit(smote_x_train_G, smote_y_train_G)
smote_svm_S= LinearSVC(penalty='l1', C = lin_C_S, dual=False, random_state=1).fit(smote_x_train_S, smote_y_train_S)


smote_pred_F = smote_svm_F.predict(stand_x_test)
smote_pred_G = smote_svm_G.predict(stand_x_test)
smote_pred_S = smote_svm_S.predict(stand_x_test)

print(f'hamming loss for Families labels is:', hamming_loss(y_test_F, smote_pred_F))
print(f'hamming loss for Genus labels is:', hamming_loss(y_test_G, smote_pred_G))
print(f'hamming loss for Species labels is:', hamming_loss(y_test_S, smote_pred_S))
print(f'exact match for Families labels is:', accuracy_score(y_test_F, smote_pred_F))
print(f'exact match for Genus labels is:', accuracy_score(y_test_G, smote_pred_G))
print(f'exact match for Species labels is:', accuracy_score(y_test_S, smote_pred_S))

hamming loss for Families labels is: 0.09263547938860583
hamming loss for Genus labels is: 0.09031959240389069
hamming loss for Species labels is: 0.042612320518758684
exact match for Families labels is: 0.9073645206113942
exact match for Genus labels is: 0.9096804075961094
exact match for Species labels is: 0.9573876794812413


In [27]:
print(f'best C for Families labels is:', smote_C_G)
print(f'best C for Genus labels is:',smote_C_G)
print(f'best C for Species labels is:',smote_C_S)

best C for Families labels is: 10
best C for Genus labels is: 10
best C for Species labels is: 10


In [67]:
hamming_loss_qb4 = (hamming_loss(y_test_F, smote_pred_F) + hamming_loss(y_test_G, smote_pred_G) + hamming_loss(y_test_S, smote_pred_S))/3
hamming_loss_qb4 # average hamming loss

0.0751891307704184

In [68]:
exact_match_qb4 = (accuracy_score(y_test_F, smote_pred_F) + accuracy_score(y_test_G, smote_pred_G) + accuracy_score(y_test_S, smote_pred_S))/3
exact_match_qb4 # average exact match value

0.9248108692295816

Compare the answer with (b)ii,(b)iii, the value of hamming loss of families labels is bigger than hamming loss of families labels in(b)ii and (b)iii; the value of hamming loss of Genus labels is bigger than hamming loss of Genus labels in (b)iii and smaller than hamming loss of Genus labels in (b)ii.the value of hamming loss of Species labels is bigger than hamming loss of Genus labels in (b)iii and smaller than hamming loss of Specise labels in (b)ii. All in all, the hamming loss is biggest.

## 2(a)(b)(c)

In [24]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import hamming_loss

k_list = np.arange(2,51)
dic_bestk ={}
Hamming_distance_list= []
Hamming_score_list= []
Hamming_loss_list= []

for i in range(50):
    score_list = []
    for j in k_list:
        kmeans = KMeans(n_clusters=j, random_state=i).fit(x_data)
        labels = kmeans.labels_
        score = silhouette_score(x_data, labels)
        tuple1 = (score,j)
        score_list.append(tuple1)
    score_list.sort()
    dic_bestk[i] = score_list[-1][1]
    print(dic_bestk)
    
    best_k = score_list[-1][1]
    kmeans = KMeans(n_clusters=best_k, random_state=i).fit(x_data)
    labels1 = kmeans.labels_
    
    dic ={}
    labels = pd.Series(labels)
    y_data_F = y_data['Family']
    y_data_G = y_data['Genus']
    y_data_S = y_data['Species']
    
    cluster_majority = {}
    for i in range(best_k):
        dic[i] = labels.index[labels==i].tolist()
        new_y_data_F = y_data_F[dic[i]]
        F_number = new_y_data_F.value_counts()
        #print(F_number)
        majority_F = F_number.index[0] 
    
        new_y_data_G = y_data_G[dic[i]]
        G_number = new_y_data_G.value_counts()
        majority_G = G_number.index[0] 
    
        new_y_data_S = y_data_S[dic[i]]
        S_number = new_y_data_S.value_counts()
        majority_S = S_number.index[0] 
        
        cluster_majority[i] = (majority_F,majority_G,majority_S)
        
        print('cluster:', i)
        print('The Majority for Family:',majority_F)
        print('The Majority for Genus:',majority_S)
        print('The Majority for Species:',majority_G)
        
    clusters = pd.concat([x_data,y_data,pd.DataFrame({'labels':labels1.tolist()})],axis = 1)    
    for x in range(4):
        predict_F = [cluster_majority[x][0] for x in clusters['labels']]
        predict_G = [cluster_majority[x][1] for x in clusters['labels']]
        predict_S = [cluster_majority[x][2] for x in clusters['labels']]    
    
    hamming_loss_F = hamming_loss(y_data_F, predict_F)
    hamming_loss_G = hamming_loss(y_data_G, predict_G)
    hamming_loss_S = hamming_loss(y_data_S, predict_S)
    hamming_loss1 = (hamming_loss_F+hamming_loss_G+hamming_loss_S)/3
    Hamming_loss_list.append(hamming_loss1)
    
    hamming_score = 1 - hamming_loss1
    Hamming_score_list.append(hamming_score)

    Hamming_distance = hamming_loss1 
    Hamming_distance_list.append(Hamming_distance)

{0: 4}
cluster: 0
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCordobae
The Majority for Species: Hypsiboas
cluster: 1
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraHylaedactylus
The Majority for Species: Adenomera
cluster: 2
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraAndre
The Majority for Species: Adenomera
cluster: 3
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCinerascens
The Majority for Species: Hypsiboas
{0: 4, 1: 4}
cluster: 0
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraHylaedactylus
The Majority for Species: Adenomera
cluster: 1
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCordobae
The Majority for Species: Hypsiboas
cluster: 2
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCinerascens
The Majority for Species: Hypsiboas
cluster: 3
The Majority for Family: Bufonidae
The Majority for Genus: Rhinellagranulosa
T

{0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4}
cluster: 0
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraHylaedactylus
The Majority for Species: Adenomera
cluster: 1
The Majority for Family: Leptodactylidae
The Majority for Genus: LeptodactylusFuscus
The Majority for Species: Leptodactylus
cluster: 2
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCinerascens
The Majority for Species: Hypsiboas
cluster: 3
The Majority for Family: Dendrobatidae
The Majority for Genus: Ameeregatrivittata
The Majority for Species: Ameerega
{0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4}
cluster: 0
The Majority for Family: Dendrobatidae
The Majority for Genus: Ameeregatrivittata
The Majority for Species: Ameerega
cluster: 1
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraHylaedactylus
The Majority for Species: Adenomera


{0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4}
cluster: 0
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCinerascens
The Majority for Species: Hypsiboas
cluster: 1
The Majority for Family: Hylidae
The Majority for Genus: HylaMinuta
The Majority for Species: Dendropsophus
cluster: 2
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCordobae
The Majority for Species: Hypsiboas
cluster: 3
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraHylaedactylus
The Majority for Species: Adenomera
{0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4}
cluster: 0
The Majority for Family: Hylidae
The Majority for Genus: HylaMinuta
The Majority for Species:

{0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 31: 4, 32: 4, 33: 4, 34: 4, 35: 4, 36: 4, 37: 4, 38: 4, 39: 4, 40: 4}
cluster: 0
The Majority for Family: Leptodactylidae
The Majority for Genus: AdenomeraHylaedactylus
The Majority for Species: Adenomera
cluster: 1
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCordobae
The Majority for Species: Hypsiboas
cluster: 2
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCordobae
The Majority for Species: Hypsiboas
cluster: 3
The Majority for Family: Hylidae
The Majority for Genus: HypsiboasCinerascens
The Majority for Species: Hypsiboas
{0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 31: 4, 32:

Each time of Majority for each lable in cluster  is shown above

In [70]:
print(f"best k fo each times:\n",dic_bestk)

best k fo each times:
 {0: 4, 1: 4, 2: 4, 3: 4, 4: 4, 5: 4, 6: 4, 7: 4, 8: 4, 9: 4, 10: 4, 11: 4, 12: 4, 13: 4, 14: 4, 15: 4, 16: 4, 17: 4, 18: 4, 19: 4, 20: 4, 21: 4, 22: 4, 23: 4, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 31: 4, 32: 4, 33: 4, 34: 4, 35: 4, 36: 4, 37: 4, 38: 4, 39: 4, 40: 4, 41: 4, 42: 4, 43: 4, 44: 4, 45: 4, 46: 4, 47: 4, 48: 4, 49: 4}


In [25]:
average_Hamming_distance = np.mean(Hamming_distance_list)
print("Average_Hamming_distance", average_Hamming_distance)
std_Hamming_distance = np.std(Hamming_distance_list)
print("Std_Hamming_distance", std_Hamming_distance)
#print("Hamming_distance list is:", Hamming_distance_list)

average_Hamming_score = np.mean(Hamming_score_list)
print("\nAverage_Hamming_score", average_Hamming_score)
#print("Hamming_score list is:", Hamming_score_list)

average_Hamming_loss = np.mean(Hamming_loss_list)
print("\nAverage_Hamming_loss", average_Hamming_loss)
#print("Hamming_loss list is:", Hamming_loss_list)

Average_Hamming_distance 0.6280416956219596
Std_Hamming_distance 0.24212498779888506

Average_Hamming_score 0.37195830437804034

Average_Hamming_loss 0.6280416956219596


In [61]:
tab = np.array([Hamming_distance_list, Hamming_score_list,Hamming_loss_list])
table = pd.DataFrame(tab, index = ['Hamming_distance_list', 'Hamming_score_list', 'Hamming_loss_list'], columns =range(len(Hamming_score_list)))
table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
Hamming_distance_list,0.857772,0.973037,0.417975,0.460459,0.821265,0.812277,0.779754,0.407552,0.446606,0.398981,...,0.298031,0.564698,0.781469,0.9041,0.441974,0.445958,0.619412,0.844244,0.986889,0.352791
Hamming_score_list,0.142228,0.026963,0.582025,0.539541,0.178735,0.187723,0.220246,0.592448,0.553394,0.601019,...,0.701969,0.435302,0.218531,0.0959,0.558026,0.554042,0.380588,0.155756,0.013111,0.647209
Hamming_loss_list,0.857772,0.973037,0.417975,0.460459,0.821265,0.812277,0.779754,0.407552,0.446606,0.398981,...,0.298031,0.564698,0.781469,0.9041,0.441974,0.445958,0.619412,0.844244,0.986889,0.352791


## ISLR 12.6.2

![IMG_0222.jpg](attachment:IMG_0222.jpg)