# Set the Environment

In [None]:
!pip install --upgrade transformers

In [15]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForXVector, WavLMForXVector
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, svm, metrics
from sklearn.metrics import ConfusionMatrixDisplay, f1_score, roc_auc_score
from sklearn.utils import class_weight
from sklearn import tree
import pandas as pd
import torch
import torchaudio
import librosa
from tqdm import tqdm
import glob
import numpy as np
import pickle

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Colab-only: mount Google Drive at /content/drive so the dataset and cached
# files under `path` are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


# Load the Pretrained WavLMForXVector Model

In [None]:
# Load the pretrained WavLM x-vector checkpoint and its matching feature extractor.
# The model is moved to `device` (selected in the cell above) instead of calling
# .cuda(), so the notebook also runs on a CPU-only runtime.
#
# Alternative checkpoint, kept for reference:
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("anton-l/wav2vec2-base-superb-sv")
# model = Wav2Vec2ForXVector.from_pretrained("anton-l/wav2vec2-base-superb-sv").to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sv")
model = WavLMForXVector.from_pretrained("microsoft/wavlm-base-plus-sv").to(device)

# Embedding Extraction

## Loading Audios

In [6]:
# Root of the dataset on the mounted Drive; every other cell builds paths from it.
path = '/content/drive/MyDrive/mosquito/'
# Collect the .wav files of the train split and of the dev/a split.
audio_train, audio_dev = map(glob.glob, (path + 'train/*.wav', path + 'dev/a/*.wav'))
print("Audios correctly loaded")

Audios correctly loaded


In [7]:
# Number of .wav files found for the train and dev splits, one per line.
print(len(audio_train), len(audio_dev), sep='\n')

8196
2165


In [23]:
# Clip length in seconds. The extraction cells multiply this by the feature
# extractor's sampling rate to get `max_length`, and truncate or pad every clip
# to exactly that many samples.
max_duration = 30.0

## Train Set

In [11]:
# Training metadata. Later cells read its 'id' column (used to build the .wav
# file name) and its 'label' column.
df_train = pd.read_csv(path + 'train.csv', index_col=None)

In [27]:
# Empty placeholders for the training embeddings and labels.
X_train, Y_train = [], []

In [None]:
# Embed every training clip listed in train.csv with the x-vector model, one clip
# per forward pass, collecting the labels in the same order.
target_sr = feature_extractor.sampling_rate
max_samples = int(target_sr * max_duration)

train_embeddings = []
Y_train = []  # rebuilt here so that re-running this cell cannot append duplicate labels

with torch.no_grad():
    for clip_id, label in tqdm(zip(df_train['id'], df_train['label']), total=len(df_train)):
        # Build the file path directly from the id; no lookup in the glob list is needed.
        waveform, source_sr = torchaudio.load(path + 'train/' + str(clip_id) + '.wav')
        # Average the channel axis: a no-op for mono files, a down-mix otherwise.
        mono = waveform.mean(dim=0).numpy()
        # Resample from the rate reported by the file instead of assuming 48 kHz.
        # The rates are passed by keyword because librosa >= 0.10 rejects positional rates.
        resampled = librosa.resample(mono, orig_sr=source_sr, target_sr=target_sr)
        inputs = feature_extractor(
            [resampled],
            sampling_rate=target_sr,
            return_tensors="pt",
            max_length=max_samples,
            truncation=True,
            padding='max_length').to(device)
        # Move each result off the accelerator right away and concatenate once at
        # the end, instead of re-allocating a growing tensor on every iteration.
        train_embeddings.append(model(**inputs).embeddings.cpu())
        Y_train.append(label)

embeddings_concatenation = torch.cat(train_embeddings)
# L2-normalise each embedding vector.
embeddings_train = torch.nn.functional.normalize(embeddings_concatenation, dim=-1)
X_train = embeddings_train.numpy()

In [None]:
# Cache the training embeddings and labels on Drive (pickle format, despite the
# .txt extension), embeddings first and labels second.
for cache_name, payload in (('X_train_30.txt', X_train), ('Y_train_30.txt', Y_train)):
    with open(path + cache_name, 'wb') as fp:
        pickle.dump(payload, fp)

In [9]:
# Reload cached training embeddings and labels from Drive instead of recomputing them.
# NOTE(review): the save cell writes 'X_train_30.txt' / 'Y_train_30.txt', while this
# cell reads the un-suffixed names — confirm these files hold the intended extraction run.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open (path + 'X_train.txt', 'rb') as fp:
    X_train = pickle.load(fp)
with open (path + 'Y_train.txt', 'rb') as fp:
    Y_train = pickle.load(fp)

In [10]:
# Sanity check: embeddings and labels should have the same length.
print(len(X_train), len(Y_train), sep='\n')

8196
8196


## Dev Set

In [12]:
# Dev metadata. Later cells read its 'id' column (used to build the .wav file
# name) and its 'label' column.
df_dev = pd.read_csv(path + 'dev.csv', index_col=None)

In [None]:
# Empty placeholders for the dev embeddings and labels.
X_dev, Y_dev = [], []

In [None]:
# Embed every dev clip listed in dev.csv with the x-vector model, one clip per
# forward pass, collecting the labels in the same order.
target_sr = feature_extractor.sampling_rate
max_samples = int(target_sr * max_duration)

dev_embeddings = []
Y_dev = []  # rebuilt here so that re-running this cell cannot append duplicate labels

with torch.no_grad():
    for clip_id, label in tqdm(zip(df_dev['id'], df_dev['label']), total=len(df_dev)):
        # Build the file path directly from the id; no lookup in the glob list is needed.
        waveform, source_sr = torchaudio.load(path + 'dev/a/' + str(clip_id) + '.wav')
        # Average the channel axis: a no-op for mono files, a down-mix otherwise.
        mono = waveform.mean(dim=0).numpy()
        # Resample from the rate reported by the file instead of assuming 48 kHz.
        # The rates are passed by keyword because librosa >= 0.10 rejects positional rates.
        resampled = librosa.resample(mono, orig_sr=source_sr, target_sr=target_sr)
        inputs = feature_extractor(
            [resampled],
            sampling_rate=target_sr,
            return_tensors="pt",
            max_length=max_samples,
            truncation=True,
            padding='max_length').to(device)
        # Move each result off the accelerator right away and concatenate once at
        # the end, instead of re-allocating a growing tensor on every iteration.
        dev_embeddings.append(model(**inputs).embeddings.cpu())
        Y_dev.append(label)

embeddings_concatenation_dev = torch.cat(dev_embeddings)
# L2-normalise each embedding vector.
embeddings_dev = torch.nn.functional.normalize(embeddings_concatenation_dev, dim=-1)
X_dev = embeddings_dev.numpy()

In [None]:
# Cache the dev embeddings and labels on Drive (pickle format, despite the .txt
# extension), embeddings first and labels second.
for cache_name, payload in (('X_dev_30.txt', X_dev), ('Y_dev_30.txt', Y_dev)):
    with open(path + cache_name, 'wb') as fp:
        pickle.dump(payload, fp)

In [13]:
# Reload cached dev embeddings and labels from Drive instead of recomputing them.
# NOTE(review): the save cell writes 'X_dev_30.txt' / 'Y_dev_30.txt', while this cell
# reads the un-suffixed names — confirm these files hold the intended extraction run.
# pickle.load can execute arbitrary code; only load files you produced yourself.
with open (path + 'X_dev.txt', 'rb') as fp:
    X_dev = pickle.load(fp)
with open (path + 'Y_dev.txt', 'rb') as fp:
    Y_dev = pickle.load(fp)

# Classifier

## Compute Weights

In [14]:
# Balanced class weights for the training labels.
# `class_weight` is the sklearn.utils module already imported in the notebook's
# import cell, so no extra import is needed here.
# The classes are taken from Y_train itself, so they always match the labels
# passed as `y`, even when Y_train was loaded from a cache file.
labels = np.unique(Y_train)
weights = class_weight.compute_class_weight('balanced', classes=labels, y=Y_train)
class_weights = dict(zip(labels, weights))

print('Class weights:', class_weights)

Class weights: {0: 1.6261904761904762, 1: 0.7219873150105708}


## Logistic Regression

### Remove NaN Values

In [19]:
def isNaN(num):
  """Return True if `num` contains at least one NaN, otherwise False.

  Accepts a scalar, a list, or a NumPy array. The input is converted to an
  array first, so scalars no longer raise TypeError and lists are compared
  element by element.
  """
  values = np.asarray(num)
  # NaN is the only value that compares unequal to itself.
  return bool(np.any(values != values))

In [20]:
# Write the index of every training embedding that contains a NaN to a text
# file, one index per line.
with open('/content/train_nan_values.txt', 'w') as f:
  for i, x in enumerate(X_train):
    if isNaN(x):
      # Print straight into the file. sys.stdout is never swapped, so an
      # exception in this loop cannot leave notebook output redirected.
      print(i, file=f)

In [21]:
# Indexes of the training embeddings that contain a NaN.
# Computed from the X_train currently in memory rather than pasted from an
# earlier scan, so the list stays correct if the embeddings are re-extracted
# or loaded from a different cache file.
nan_indexes = [row for row, embedding in enumerate(X_train) if isNaN(embedding)]

In [None]:
# Keep only the training rows whose index is not listed in nan_indexes, in the
# original order.
kept_rows = [row for row in range(len(X_train)) if row not in nan_indexes]
X_train_1 = [X_train[row] for row in kept_rows]
Y_train_1 = [Y_train[row] for row in kept_rows]

### Fit LR

In [48]:
# Fit a class-weighted logistic regression on the NaN-free training embeddings
# (fit() returns the estimator itself), then predict labels for the dev set.
LR_mosquito = LogisticRegression(class_weight=class_weights).fit(X_train_1, Y_train_1)
Y_predicted = LR_mosquito.predict(X_dev)

### Compute Scores

In [49]:
# F1 on the dev set with weighted and macro averaging, then the per-class report.
for averaging in ('weighted', 'macro'):
    score = f1_score(Y_dev, Y_predicted, average=averaging)
    print(averaging + '_F1_score: ' + str(score))

report = metrics.classification_report(Y_dev, Y_predicted)
print(f"Classification report for classifier {LR_mosquito}:\n{report}\n")

weighted_F1_score: 0.4700736327832945
macro_F1_score: 0.4580419791429545
Classification report for classifier LogisticRegression(class_weight={0: 1.6261904761904762, 1: 0.7219873150105708}):
              precision    recall  f1-score   support

           0       0.47      0.19      0.27      1012
           1       0.53      0.81      0.64      1153

    accuracy                           0.52      2165
   macro avg       0.50      0.50      0.46      2165
weighted avg       0.50      0.52      0.47      2165




In [50]:
# ROC AUC on the dev set, scored with the second column of predict_proba.
positive_scores = LR_mosquito.predict_proba(X_dev)[:, 1]
print('roc_auc_score: ' + str(roc_auc_score(Y_dev, positive_scores)))

roc_auc_score: 0.4958597437857591


### Save Model

In [None]:
# Persist the fitted logistic regression to Drive.
LR_mosquito_filename = path + 'LR_mosquito.sav'
# The context manager flushes and closes the file even if pickling fails.
with open(LR_mosquito_filename, 'wb') as model_file:
    pickle.dump(LR_mosquito, model_file)

## Decision Tree

### Remove NaN Values

In [16]:
def isNaN(num):
  """Return True if `num` contains at least one NaN, otherwise False.

  Accepts a scalar, a list, or a NumPy array. The input is converted to an
  array first, so scalars no longer raise TypeError and lists are compared
  element by element.

  This repeats the helper defined in the Logistic Regression section so the
  Decision Tree section can be run without it.
  """
  values = np.asarray(num)
  # NaN is the only value that compares unequal to itself.
  return bool(np.any(values != values))

In [17]:
# Write the index of every training embedding that contains a NaN to a text
# file, one index per line.
with open('/content/train_nan_values.txt', 'w') as f:
  for i, x in enumerate(X_train):
    if isNaN(x):
      # Print straight into the file. sys.stdout is never swapped, so an
      # exception in this loop cannot leave notebook output redirected.
      print(i, file=f)

In [18]:
# Indexes of the training embeddings that contain a NaN.
# Computed from the X_train currently in memory rather than pasted from an
# earlier scan, so the list stays correct if the embeddings are re-extracted
# or loaded from a different cache file.
nan_indexes = [row for row, embedding in enumerate(X_train) if isNaN(embedding)]

In [19]:
# Keep only the training rows whose index is not listed in nan_indexes, in the
# original order.
kept_rows = [row for row in range(len(X_train)) if row not in nan_indexes]
X_train_1 = [X_train[row] for row in kept_rows]
Y_train_1 = [Y_train[row] for row in kept_rows]

### Fit DT

In [20]:
# Fit a decision tree on the NaN-free training embeddings and predict dev labels.
# random_state is fixed so that repeated runs build the same tree and report the
# same scores.
# NOTE(review): unlike the logistic regression, this model is not given
# class_weights — confirm whether that is intentional.
DT_mosquito = tree.DecisionTreeClassifier(random_state=42)
DT_mosquito = DT_mosquito.fit(X_train_1, Y_train_1)
Y_predicted = DT_mosquito.predict(X_dev)

### Compute Scores

In [21]:
# F1 on the dev set with weighted and macro averaging, then the per-class report.
for averaging in ('weighted', 'macro'):
    score = f1_score(Y_dev, Y_predicted, average=averaging)
    print(averaging + '_F1_score: ' + str(score))

report = metrics.classification_report(Y_dev, Y_predicted)
print(f"Classification report for classifier {DT_mosquito}:\n{report}\n")

weighted_F1_score: 0.44542532954319286
macro_F1_score: 0.4287391464123338
Classification report for classifier DecisionTreeClassifier():
              precision    recall  f1-score   support

           0       0.57      0.10      0.17      1012
           1       0.54      0.93      0.68      1153

    accuracy                           0.54      2165
   macro avg       0.55      0.52      0.43      2165
weighted avg       0.55      0.54      0.45      2165




In [22]:
# ROC AUC on the dev set, scored with the second column of predict_proba.
positive_scores = DT_mosquito.predict_proba(X_dev)[:, 1]
print('roc_auc_score: ' + str(roc_auc_score(Y_dev, positive_scores)))

roc_auc_score: 0.5166308718620269


### Save Model

In [None]:
# Persist the fitted decision tree to Drive.
DT_mosquito_filename = path + 'DT_mosquito.sav'
# The context manager flushes and closes the file even if pickling fails.
with open(DT_mosquito_filename, 'wb') as model_file:
    pickle.dump(DT_mosquito, model_file)