In [1]:
import os
from pathlib import Path
import pandas as pd
import torch
import torchaudio
import torchvision
import numpy as np
from torchvision import models
import torchaudio
import matplotlib.pyplot as plt
from tqdm import tqdm
import timm
import gc
import librosa

In [2]:
tabular = True

In [3]:
data_dir = Path('/kaggle/input/tb-coughs-audio/data')
solicited = data_dir/'solicited'
TRAIN_FOLDER = solicited
cv_train = False # cross-validation or one fold

In [4]:
MODEL_FOLDER = Path('./models')

os.makedirs(MODEL_FOLDER, exist_ok=True)

# Prepare data

In [5]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold, StratifiedGroupKFold
import torchaudio

data_dir = Path('/kaggle/input/tb-tab-data')



In [6]:
def get_sample_length(fn,base_path):
    p = base_path/fn
    x, sr = torchaudio.load(str(p))
    return x.shape[1]

In [7]:
def find_path(filename,directory_path=data_dir):
    l = list(directory_path.glob("**/" + filename))
    if len(l) == 0:
        return None
    elif len(l) == 1:
        return l[0]
    else:
        return l

In [8]:
solicited_df = pd.read_csv(find_path('CODA_TB_Solicited_Meta_Info.csv'))
clinical_df = pd.read_csv(find_path('CODA_TB_Clinical_Meta_Info.csv'))

In [9]:
#add tb_status
df = pd.merge(solicited_df, clinical_df[['participant', 'tb_status']], on = 'participant', how = 'inner')
# reset index so that .loc and .iloc are same for index
df.reset_index(drop=True,inplace=True)
#add orig_id
df['orig_id'] = df.index

n_folds = 5
seed = 2023



skf = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=seed)
for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['tb_status'], groups = df["participant"])):
    df.loc[val_idx, 'fold'] = fold
    
display(df.groupby(['fold','tb_status'])['filename'].count())

fold  tb_status
0.0   0            1343
      1             548
1.0   0            1392
      1             517
2.0   0            1352
      1             684
3.0   0            1378
      1             621
4.0   0            1377
      1             560
Name: filename, dtype: int64

In [10]:
df.to_csv("metadata.csv")


## Load the metadata

In [11]:
df = pd.read_csv("metadata.csv",index_col=[0])

In [12]:
df['path'] = "/kaggle/input/tb-coughs-audio/data/solicited/" + df['filename']

In [13]:
df

Unnamed: 0,participant,filename,sound_prediction_score,tb_status,orig_id,fold,path
0,CODA_TB_0001,1645088710003-recording-1.wav,0.990254,0,0,2.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
1,CODA_TB_0001,1645088760390-recording-1.wav,0.990272,0,1,2.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
2,CODA_TB_0001,1645088760830-recording-1.wav,0.990112,0,2,2.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
3,CODA_TB_0001,1645088710843-recording-1.wav,0.990152,0,3,2.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
4,CODA_TB_0001,1645088759950-recording-1.wav,0.990039,0,4,2.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
...,...,...,...,...,...,...,...
9767,CODA_TB_1107,1658214018804-recording-1.wav,0.941761,0,9767,0.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
9768,CODA_TB_1107,1658213992939-recording-1.wav,0.904569,0,9768,0.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
9769,CODA_TB_1107,1658213992139-recording-1.wav,0.934713,0,9769,0.0,/kaggle/input/tb-coughs-audio/data/solicited/1...
9770,CODA_TB_1107,1658213940569-recording-1.wav,0.912813,0,9770,0.0,/kaggle/input/tb-coughs-audio/data/solicited/1...


In [14]:
import pandas as pd
import numpy as np
import json
from datasets import Audio
from datasets import Dataset

from datasets import concatenate_datasets

train_paths = df['path'][df['fold']!=1]
val_paths = df['path'][df['fold']==1]


train_ds = Dataset.from_dict({"audio":train_paths ,"label":df['tb_status'][df['fold']!=1]}).cast_column("audio", Audio(sampling_rate=22050))
val_ds = Dataset.from_dict({"audio":val_paths ,"label":df['tb_status'][df['fold']==1]}).cast_column("audio", Audio(sampling_rate=22050))

In [15]:
train_ds

Dataset({
    features: ['audio', 'label'],
    num_rows: 7863
})

In [16]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-large")

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [17]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=22050, truncation=True
    )
    return inputs

In [18]:
train_encoded_ds = train_ds.map(preprocess_function, remove_columns="audio", batched=True)
val_encoded_ds = val_ds.map(preprocess_function, remove_columns="audio", batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


  0%|          | 0/2 [00:00<?, ?ba/s]

In [19]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1


In [20]:
import evaluate
import numpy as np
from sklearn.metrics import roc_curve,roc_auc_score,accuracy_score, f1_score, log_loss, precision_score, recall_score

roc_auc = evaluate.load("roc_auc")
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    
    return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)

def compute_metrics(eval_pred):
    preds = eval_pred.predictions[:, 1] 
    labels = eval_pred.label_ids
    roc_auc = roc_auc_score(labels, preds)
    return {"AUROC": roc_auc}

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [21]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

model = AutoModelForAudioClassification.from_pretrained("facebook/wav2vec2-large", num_labels=2)

Downloading pytorch_model.bin:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-large were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.codevectors', 'project_hid.weight', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_hid.bias', 'project_q.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large and are newly initialized: ['classifier.weight', 'projector.weight', 'classi

In [23]:
training_args = TrainingArguments(
    output_dir="wave2vec_tb",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
#     metric_for_best_model='roc_auc',
    save_total_limit=1,
    # fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded_ds,
    eval_dataset=val_encoded_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Auroc
1,0.5965,0.586242,0.629654
2,0.5776,0.557612,0.633022
3,0.5657,0.545635,0.679669
4,0.5216,0.569156,0.676242
5,0.4833,0.57909,0.670216
6,0.5238,0.560839,0.690779
7,0.5181,0.571556,0.68975
8,0.4685,0.575052,0.690791
9,0.4877,0.592846,0.675115
10,0.4759,0.594624,0.678255


TrainOutput(global_step=1230, training_loss=0.5261216462143068, metrics={'train_runtime': 1898.077, 'train_samples_per_second': 41.426, 'train_steps_per_second': 0.648, 'total_flos': 1.6420318297247974e+18, 'train_loss': 0.5261216462143068, 'epoch': 10.0})

In [27]:
val_encoded_ds

Dataset({
    features: ['label', 'input_values'],
    num_rows: 1909
})

In [31]:
val_paths

18      /kaggle/input/tb-coughs-audio/data/solicited/1...
19      /kaggle/input/tb-coughs-audio/data/solicited/1...
20      /kaggle/input/tb-coughs-audio/data/solicited/1...
21      /kaggle/input/tb-coughs-audio/data/solicited/1...
22      /kaggle/input/tb-coughs-audio/data/solicited/1...
                              ...                        
9703    /kaggle/input/tb-coughs-audio/data/solicited/1...
9704    /kaggle/input/tb-coughs-audio/data/solicited/1...
9705    /kaggle/input/tb-coughs-audio/data/solicited/1...
9706    /kaggle/input/tb-coughs-audio/data/solicited/1...
9707    /kaggle/input/tb-coughs-audio/data/solicited/1...
Name: path, Length: 1909, dtype: object

In [39]:
from transformers import pipeline
classifier = pipeline("audio-classification", model="/kaggle/working/wave2vec_tb/checkpoint-369")
val_pred = []
for x in tqdm(val_paths):
    val_pred.append(classifier(x)[1]['score'])

100%|██████████| 1909/1909 [11:41<00:00,  2.72it/s]


In [56]:
thr = [0.5,0.2,0.4]

In [57]:
print('accuracy_scores:',(accuracy_score(df['tb_status'][df['fold']==1], [1 if x >thr[0] else 0 for x in val_pred])))
print('sensitivity_scores:',(recall_score(df['tb_status'][df['fold']==1], [1 if x >thr[1] else 0 for x in val_pred])))
print('specificity_scores',(precision_score(df['tb_status'][df['fold']==1], [1 if x >thr[2] else 0 for x in val_pred])))

accuracy_scores: 0.7291775798847564
sensitivity_scores: 0.8085106382978723
specificity_scores 0.3892709766162311
