In [None]:
import math

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, SVMSMOTE
from sklearn import svm
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder



**`Reading file from GitHub URL`**

In [None]:
# Location of the Frogs_MFCCs.csv file, served as a raw file from GitHub.
url = (
    'https://raw.githubusercontent.com/'
    'Jahnvi-Rc/MPR_HW7_data/master/Frogs_MFCCs.csv'
)
# Download the CSV and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

**(a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data
randomly as the training set.**

**(b) Each instance has three labels: Families, Genus, and Species. Each of the labels
has multiple classes. We wish to solve a multi-class and multi-label problem.
One of the most important approaches to multi-class classification is to train a
classifier for each label. We first try this approach:
i. Research exact match and hamming score/loss methods for evaluating multi-label
classification and use them in evaluating the classifiers in this problem.**

In [None]:
# Feature matrix: every column except the record identifier and the three labels.
LABEL_COLUMNS = ['Family', 'Genus', 'Species']
X = df.drop(labels=['RecordID'] + LABEL_COLUMNS, axis=1)
y = pd.DataFrame(df, columns=LABEL_COLUMNS)

# One categorical dtype per label, built from the FULL data set.  Encoding the
# train and test labels separately (each with its own `astype('category')`)
# assigns codes from whichever classes happen to be present in that split, so
# a class missing from one side shifts the integer codes and makes predictions
# and ground truth incomparable.  Sharing the dtype keeps the codes aligned.
label_dtypes = {col: y[col].astype('category').dtype for col in LABEL_COLUMNS}

# Fixed seed so the 70/30 split (and every number derived from it) is reproducible.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

# Training labels as categoricals, one Series per label.
y_trn1 = y_train['Family'].astype(label_dtypes['Family'])
y_trn2 = y_train['Genus'].astype(label_dtypes['Genus'])
y_trn3 = y_train['Species'].astype(label_dtypes['Species'])
# Integer class codes used as the classifier targets.
y_trncat1 = y_trn1.cat.codes
y_trncat2 = y_trn2.cat.codes
y_trncat3 = y_trn3.cat.codes

# Test labels, encoded with the same dtypes as the training labels.
y_tst1 = y_test['Family'].astype(label_dtypes['Family'])
y_tst2 = y_test['Genus'].astype(label_dtypes['Genus'])
y_tst3 = y_test['Species'].astype(label_dtypes['Species'])
y_tstcat1 = y_tst1.cat.codes
y_tstcat2 = y_tst2.cat.codes
y_tstcat3 = y_tst3.cat.codes
# Encoded test labels side by side, in Family / Genus / Species order.
y_tstcat = pd.concat([y_tstcat1, y_tstcat2, y_tstcat3], axis=1)

**ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all
classifiers. Determine the weight of the SVM penalty and the width of the
Gaussian Kernel using 10 fold cross validation.<sup>1</sup> You are welcome to try to
solve the problem with both normalized<sup>2</sup> and raw attributes and report the
results.**

In [None]:
# Hyper-parameter grids for the Gaussian-kernel SVMs.
# C takes the values 10**-3 ... 10**3.
C_GRID = [10 ** x for x in range(-3, 4)]
# sigma takes the values 0.1, 0.2, ..., 2.0.  The values are derived from
# integers rather than by repeatedly adding 0.1: with a running float sum the
# grid points drift away from the intended values and the sum can overshoot
# the upper bound, which drops the sigma = 2.0 end point from the search.
SIGMA_GRID = [k / 10 for k in range(1, 21)]


def _rbf_svm_grid_search(features, labels, c_grid=C_GRID, sigma_grid=SIGMA_GRID, folds=10):
    """Grid-search C and gamma for a one-vs-rest RBF SVC by cross-validated accuracy.

    Parameters
    ----------
    features, labels : training features and the integer class codes of one label.
    c_grid : iterable of penalty weights C to try.
    sigma_grid : iterable of kernel widths; each is converted to
        gamma = 1 / (2 * sigma**2) before being passed to ``svm.SVC``.
    folds : number of cross-validation folds.

    Returns
    -------
    (best_score, best_c, best_gamma, scores) where ``scores`` lists the mean
    CV accuracy of every grid point in evaluation order (C outer, sigma inner).
    The first grid point reaching the best score wins ties.
    """
    scores = []
    best_score, best_c, best_gamma = 0, None, None
    for c in c_grid:
        for sigma in sigma_grid:
            g = 1 / (2 * (sigma ** 2))
            clfr = svm.SVC(C=c, decision_function_shape='ovr', gamma=g)
            score = cross_val_score(clfr, features, labels, cv=folds, scoring='accuracy').mean()
            scores.append(score)
            # Strict '>' keeps the earliest grid point when scores tie.
            if score > best_score:
                best_score, best_c, best_gamma = score, c, g
    return best_score, best_c, best_gamma, scores


# One independent search per label (Family, Genus, Species).
bscore1, best_c1, best_gamma1, acc_score1 = _rbf_svm_grid_search(X_train, y_trncat1)
bscore2, best_c2, best_gamma2, acc_score2 = _rbf_svm_grid_search(X_train, y_trncat2)
bscore3, best_c3, best_gamma3, acc_score3 = _rbf_svm_grid_search(X_train, y_trncat3)

# NOTE: the values printed under "Gaussian kernel width" are the selected gamma
# (= 1 / (2 * sigma**2)), i.e. the parameter handed to svm.SVC, not sigma itself.
print('SVM penalty weight-1 classifier', best_c1)
print('Gaussian kernel width-1st classifier', best_gamma1)
print('SVM penalty weight-2nd classifier', best_c2)
print('Gaussian kernel width-2nd classifier', best_gamma2)
print('SVM penalty weight-3rd classifier:', best_c3)
print('Gaussian kernel width-3rd classifier', best_gamma3)

SVM penalty weight-1 classifier 100
Gaussian kernel width-1st classifier 3.1249999999999996
SVM penalty weight-2nd classifier 100
Gaussian kernel width-2nd classifier 1.3888888888888888
SVM penalty weight-3rd classifier: 100
Gaussian kernel width-3rd classifier 2.0


In [None]:
# Refit one RBF SVM per label with its selected (C, gamma) and predict the test set.
label_settings = [
    (best_c1, best_gamma1, y_trncat1),
    (best_c2, best_gamma2, y_trncat2),
    (best_c3, best_gamma3, y_trncat3),
]
pred_frames = []
for col, (c_val, gamma_val, y_codes) in enumerate(label_settings):
    clfr = svm.SVC(C=c_val, decision_function_shape='ovr', gamma=gamma_val)
    clfr.fit(X_train, y_codes)
    # Each prediction vector becomes a one-column frame named by its label position.
    pred_frames.append(pd.DataFrame(clfr.predict(X_test), columns=[col]))

ypred1, ypred2, ypred3 = pred_frames
# Predicted codes for all three labels, one column per label.
ypred = pd.concat(pred_frames, axis=1)


In [None]:
def hammings(y_tstcat, ypred):
    """Return the Hamming loss between true and predicted multi-label codes.

    The Hamming loss is the fraction of individual (sample, label) entries
    that were predicted incorrectly, i.e. the per-sample fraction of wrong
    labels averaged over all samples.

    Rows are compared by position, not by index, so the two inputs may carry
    different index labels.

    Parameters
    ----------
    y_tstcat : array-like of shape (n_samples, n_labels) with the true codes.
    ypred : array-like of the same shape with the predicted codes.

    Returns
    -------
    float in [0, 1]; 0.0 means every label of every sample is correct.

    Raises
    ------
    ValueError
        If the two inputs differ in shape or contain no entries.
    """
    actual = np.asarray(y_tstcat)
    predicted = np.asarray(ypred)
    if actual.shape != predicted.shape:
        raise ValueError(
            'shape mismatch between true labels %s and predictions %s'
            % (actual.shape, predicted.shape)
        )
    if actual.size == 0:
        raise ValueError('cannot compute the Hamming loss of an empty label set')
    # The previous loop divided by the label count and accumulated inside the
    # per-label loop, so each sample's score was scaled and added repeatedly.
    # The mean of the element-wise mismatch mask is the intended quantity.
    return float(np.mean(actual != predicted))
def exmatch(y_tstcat, ypred):
    """Return the exact-match ratio for three-label predictions.

    A row counts as a match only when all three of its label columns
    (positions 0, 1 and 2) agree between ``ypred`` and ``y_tstcat``.  Rows are
    compared by position via ``.iloc``.

    Returns the number of fully matching rows divided by the number of rows
    in ``y_tstcat``.
    """
    n_rows = len(y_tstcat)
    hits = sum(
        1
        for row in range(n_rows)
        if all(ypred.iloc[row, col] == y_tstcat.iloc[row, col] for col in range(3))
    )
    return hits / n_rows

# Score the Gaussian-kernel SVM predictions against the encoded test labels.
svm_hamming_loss = hammings(y_tstcat, ypred)
print('Hamming loss of svm with gaussian kernel', svm_hamming_loss)
svm_exact_match = exmatch(y_tstcat, ypred)
print('Exact match of svm with gaussian kernel', svm_exact_match)

Hamming loss of svm with gaussian kernel 0.009898272519856581
Exact match of svm with gaussian kernel 0.9888837424733673


**iii. Repeat 1(b)ii with L1-penalized SVMs.<sup>3</sup> Remember to normalize the
attributes.**

In [None]:
import warnings
warnings.filterwarnings('ignore')
def _select_l1_svc_penalty(features, labels, c_grid, folds=10):
    """Return the C in ``c_grid`` with the best mean CV accuracy for an L1 LinearSVC.

    The first value reaching the best score wins ties.
    """
    best_score, best_c = 0, None
    for c in c_grid:
        linclfr = svm.LinearSVC(C=c, penalty='l1', dual=False, multi_class='ovr')
        score = cross_val_score(linclfr, features, labels, cv=folds, scoring='accuracy').mean()
        if score > best_score:
            best_score, best_c = score, c
    return best_c


def L1(X_train1,X_train2, X_train3,y_trncat1, y_trncat2, y_trncat3,X_test,y_tstcat):
    """Tune, fit and evaluate one L1-penalised linear SVM per label.

    For each of the three labels the penalty weight C is chosen from
    10**-3 ... 10**3 by 10-fold cross-validated accuracy, a final model is
    fitted on that label's training set, and the test set is predicted.  The
    selected C values, the Hamming loss and the exact-match ratio are printed.

    Parameters
    ----------
    X_train1, X_train2, X_train3 : training features for label 1, 2 and 3.
        They may differ from each other (e.g. after per-label oversampling).
    y_trncat1, y_trncat2, y_trncat3 : matching integer class codes.
    X_test : test features, shared by all three models.
    y_tstcat : true test codes, one column per label, compared by position.

    Returns
    -------
    None.  Results are reported via ``print``.

    Notes
    -----
    Features are used exactly as passed in; no scaling is applied here.
    """
    c_grid = [10 ** i for i in range(-3, 4)]
    training_sets = [
        (X_train1, y_trncat1),
        (X_train2, y_trncat2),
        (X_train3, y_trncat3),
    ]

    best_cs = [
        _select_l1_svc_penalty(features, labels, c_grid)
        for features, labels in training_sets
    ]
    print('L1-penalized SVM-1st classifier', best_cs[0])
    print('L1-penalized SVM-2nd classifier', best_cs[1])
    print('L1-penalized SVM-3rd classifier', best_cs[2])

    pred_frames = []
    for col, ((features, labels), best_c) in enumerate(zip(training_sets, best_cs)):
        model = svm.LinearSVC(C=best_c, penalty='l1', dual=False, multi_class='ovr')
        # Fit on the features passed in for THIS label.  The previous code
        # fitted on the notebook-level `X_train`, which ignored resampled
        # inputs and mismatched the length of resampled label vectors.
        model.fit(features, labels)
        pred_frames.append(pd.DataFrame(model.predict(X_test), columns=[col]))
    ypred = pd.concat(pred_frames, axis=1)

    # Hamming loss
    print('The Hamming loss for Linear SVC is', hammings(y_tstcat, ypred))
    # Exact match
    print('The exact match for Linear SVC is', exmatch(y_tstcat, ypred))
  
# Run the L1-penalised linear SVM experiment on the original training split;
# the same feature matrix is used for all three labels.
L1(
    X_train1=X_train,
    X_train2=X_train,
    X_train3=X_train,
    y_trncat1=y_trncat1,
    y_trncat2=y_trncat2,
    y_trncat3=y_trncat3,
    X_test=X_test,
    y_tstcat=y_tstcat,
)

L1-penalized SVM-1st classifier 100
L1-penalized SVM-2nd classifier 1000
L1-penalized SVM-3rd classifier 10
The Hamming loss for Linear SVC is 0.0709347606059049
The exact match for Linear SVC is 0.9092172301991662


**iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy
class imbalance. Report your conclusions about the classifiers you trained.**

In [None]:
# SVM-SMOTE oversampling of the training set, done separately for each label
# because every label has its own class distribution.
# `SMOTE(kind='svm')` and `.fit_sample(...)` are deprecated spellings that
# later imbalanced-learn releases removed; `SVMSMOTE` with `.fit_resample(...)`
# is the supported equivalent.  The seed makes the synthetic samples reproducible.
sm = SVMSMOTE(random_state=42)
X_trnsm1, y_trnsm1 = sm.fit_resample(X_train, y_trncat1)
X_trnsm2, y_trnsm2 = sm.fit_resample(X_train, y_trncat2)
X_trnsm3, y_trnsm3 = sm.fit_resample(X_train, y_trncat3)

# Only the training data is resampled; evaluation uses the untouched test split.
L1(X_trnsm1,X_trnsm2,X_trnsm3,y_trnsm1,y_trnsm2,y_trnsm3,X_test,y_tstcat)


SMOTE SVM-1st classifier 100
SMOTE SVM-2nd classifier 100
SMOTE SVM-3rd classifier 10
The Hamming loss for Linear SVC is 0.12540099154272502
The exact match for Linear SVC is 0.8175081056044465


**Conclusions of classifiers trained:**

***For the Gaussian-kernel SVMs: the penalty weight (C) selected for all three classifiers is 100.***


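***The kernel coefficient is computed as $\gamma = 1/(2\sigma^2)$, where $\sigma$ is the Gaussian kernel width.***

***The selected values of $\gamma$ (with the corresponding $\sigma$) are:***

***3.1249999999999996 ($\sigma$ = 0.4) - 1st classifier***

***1.388 ($\sigma$ = 0.6) - 2nd classifier***

***2.0 ($\sigma$ = 0.5) - 3rd classifier***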


**For L1-penalized SVM: The weights are 100,1000 and 10 for 1st,2nd and 3rd classifier respectively.**

**For SVM - SMOTE: The weights are 100,100 and 10 respectively.**

**The exact match score is higher for the L1-penalized SVM (0.909) than for the SMOTE SVM (0.818).**

***The Hamming loss of the SMOTE SVM (0.125) is higher than that of the L1-penalized SVM (0.071).***