In [35]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Colab Setup

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import os

ROOT_PATH = "/content/drive/MyDrive/Deep_Learning/text_classification/"
print(os.listdir(ROOT_PATH)) # Check the content of the path
os.chdir(ROOT_PATH) # cd into directory
print(os.listdir(".")) # Check the content of current folder

['client_oauth.json', 'image.png', 'README.md', '.gitattributes', 'ast_run_finetuning.sh', '.gitignore', 'requirements.txt', 'base_model.md', 'preprocessing.md', 'logistic_regression.py', 'logistic_regression_weights.pkl', 'representation_learner.py', 'dataset_prep.py', 'GenerateDataset.ipynb', 'text_classification.py', 'text_classification.ipynb', 'data', 'models', '__pycache__']
['client_oauth.json', 'image.png', 'README.md', '.gitattributes', 'ast_run_finetuning.sh', '.gitignore', 'requirements.txt', 'base_model.md', 'preprocessing.md', 'logistic_regression.py', 'logistic_regression_weights.pkl', 'representation_learner.py', 'dataset_prep.py', 'GenerateDataset.ipynb', 'text_classification.py', 'text_classification.ipynb', 'data', 'models', '__pycache__']


In [36]:
requirements = ROOT_PATH + "/requirements.txt"
%pip install -r {requirements}

Collecting numpy~=1.26.4 (from -r /content/drive/MyDrive/Deep_Learning/text_classification//requirements.txt (line 2))
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Collecting datasets==2.3.2 (from -r /content/drive/MyDrive/Deep_Learning/text_classification//requirements.txt (line 3))
  Using cached datasets-2.3.2-py3-none-any.whl (362 kB)
Collecting gTTS~=2.5.1 (from -r /content/drive/MyDrive/Deep_Learning/text_classification//requirements.txt (line 4))
  Using cached gTTS-2.5.1-py3-none-any.whl (29 kB)
Collecting pandas~=2.2.2 (from -r /content/drive/MyDrive/Deep_Learning/text_classification//requirements.txt (line 5))
  Using cached pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
Collecting scikit-learn~=1.4.2 (from -r /content/drive/MyDrive/Deep_Learning/text_classification//requirements.txt (line 7))
  Using cached scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1

In [37]:
!pip install evaluate
!pip install datasets
!pip install accelerate -U



**Here, restart the kernel and don't execute the above cells that contain `pip ...` anymore after restart.**

# Import dependencies

In [38]:
import torch
from sklearn.model_selection import KFold
from data.fetch_dataset_from_hf import fetch_dataset_from_huggingface
from huggingface_hub import login
from representation_learner import create_label_id_dicts, preprocess_function, compute_metrics
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load dataset

In [7]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [39]:
dataset = fetch_dataset_from_huggingface()
print(dataset)



DatasetDict({
    train: Dataset({
        features: ['audio_waveform', 'hate_speech_score', 'label', 'text'],
        num_rows: 12392
    })
    test: Dataset({
        features: ['audio_waveform', 'hate_speech_score', 'label', 'text'],
        num_rows: 3099
    })
})


# Display data format

In [40]:
print(dataset["train"][0])
print(dataset["train"][2])
waveform_0 = dataset["train"][0]["audio_waveform"]
waveform_0

{'audio_waveform': {'path': 'padded_1521.mp3', 'array': array([-1.96048524e-04, -1.23826787e-04, -2.09705904e-05, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 16000}, 'hate_speech_score': 0.5, 'label': 0, 'text': 'i would, then fucking wreck that chick.'}
{'audio_waveform': {'path': 'padded_13496.mp3', 'array': array([-1.55226910e-04,  2.93002813e-06,  2.67900177e-05, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 16000}, 'hate_speech_score': -0.4, 'label': 1, 'text': 'please send us a video of you fucking a dildo or your daddy... but i also love to see a pussy drooling bad'}


{'path': 'padded_1521.mp3',
 'array': array([-1.96048524e-04, -1.23826787e-04, -2.09705904e-05, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
 'sampling_rate': 16000}

In [41]:
dataset["train"].features

{'audio_waveform': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'hate_speech_score': Value(dtype='float64', id=None),
 'label': ClassLabel(names=['hate', 'non-hate'], id=None),
 'text': Value(dtype='string', id=None)}

In [42]:
dataset["train"].features["label"].names

['hate', 'non-hate']

In [43]:
print(dataset["train"][0])
print(dataset["train"][1])
print(dataset["train"][2])
print(dataset["train"][3])
print(dataset["train"][4])
print(dataset["train"][5])

{'audio_waveform': {'path': 'padded_1521.mp3', 'array': array([-1.96048524e-04, -1.23826787e-04, -2.09705904e-05, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 16000}, 'hate_speech_score': 0.5, 'label': 0, 'text': 'i would, then fucking wreck that chick.'}
{'audio_waveform': {'path': 'padded_13972.mp3', 'array': array([-0.00026199, -0.00019382, -0.00023434, ...,  0.        ,
        0.        ,  0.        ]), 'sampling_rate': 16000}, 'hate_speech_score': 1.13, 'label': 0, 'text': "it's called defending yourself you fucking dumbass. don't you mean this bitch dumbass. yea fuck you"}
{'audio_waveform': {'path': 'padded_13496.mp3', 'array': array([-1.55226910e-04,  2.93002813e-06,  2.67900177e-05, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 16000}, 'hate_speech_score': -0.4, 'label': 1, 'text': 'please send us a video of you fucking a dildo or your daddy... but i also love to see a pussy drooling bad'}
{'audio_wavefo

# Generate meaningful representations

In [44]:
dataset = dataset.remove_columns(["hate_speech_score", "audio_waveform"]) #removes unused columns

In [45]:
label2id, id2label = create_label_id_dicts(dataset)
print(label2id)
print(id2label)

{'hate': '0', 'non-hate': '1'}
{'0': 'hate', '1': 'non-hate'}


In [46]:
print(dataset["test"][0])
print(dataset["test"][1])
print(dataset["test"][2])
print(dataset["test"][3])
print(dataset["test"][4])
print(dataset["test"][5])

{'label': 1, 'text': 'iran should learn white man speaks with forked tongue. whites have never afforded nonwhites their rights to their resources or their own land. typical is palestine they internationalise everything so as to obfuscate and eventually rob natives of whats rightfully theirs. all countries should have the right to arm themselves with nukes. if it good for the us and its criminal allies then its good for them too. fuck..okay us and its gangster eu allies'}
{'label': 1, 'text': "ma'am i don't mean to come on too strong but i would let you wrap those fat pussy lips around my head and strangle me to death"}
{'label': 1, 'text': 'spend in the path of allah in times of ease and times of difficulty, as what you spend and leave with allah is what truly remains...'}
{'label': 1, 'text': 'what a joke. no one has given a thought to a syrian who needs the grace 1 oil cargo to maybe take his sick child to hospital, after a devastating war that these murderous western shitbags conduc

In [47]:
feature_extractor = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("feature_extractor downloaded...")
encoded_dataset = dataset.map(lambda batch: preprocess_function(batch, feature_extractor), batched=True)



feature_extractor downloaded...


Map:   0%|          | 0/3099 [00:00<?, ? examples/s]

In [48]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=feature_extractor)

# Finetune model

The Wav2Vec2ForSequenceClassification is essentially `context representation extractor + classifier` whereas the basic Wav2Vec2 does not have any classifier on top. So, the idea for now is to use the former model and finetune it on our hatespeech dataset. The finetuning uses the text too.

In [49]:
num_labels = len(label2id)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels, label2id=label2id, id2label=id2label,
    #ignore_mismatched_sizes=True
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
print(encoded_dataset)
train_dataset = encoded_dataset["train"].train_test_split(test_size=0.2)
print(train_dataset)

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 12392
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3099
    })
})
DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 9913
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2479
    })
})


In [51]:
#model.train()

training_args = TrainingArguments(
    #use_cpu=True,
    do_train=True,
    output_dir="models/finetuned_text_class",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_only_model=True,  # only saves model to checkpoints
    learning_rate=8e-5, #3e-5
    weight_decay=0.005,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_dir="tensorboard",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    seed=42,

    hub_private_repo=True,
    hub_model_id="DL-Project/DL_Audio_Hatespeech_text_classification_trainer_push",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset["train"],
    eval_dataset=train_dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [52]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1
0,0.4863,0.467829,0.770069,0.742092,0.784062,0.7625
2,0.2792,0.528512,0.784994,0.729116,0.818761,0.771343
4,0.0945,0.822989,0.780557,0.755069,0.793691,0.773899
6,0.0331,1.110664,0.775313,0.785888,0.767829,0.776754
8,0.0106,1.274126,0.77854,0.761557,0.786432,0.773795
9,0.0051,1.296423,0.775313,0.785077,0.768254,0.776574


TrainOutput(global_step=770, training_loss=0.1612584430917904, metrics={'train_runtime': 917.9438, 'train_samples_per_second': 107.991, 'train_steps_per_second': 0.839, 'total_flos': 2658729614414268.0, 'train_loss': 0.1612584430917904, 'epoch': 9.935483870967742})

In [53]:
print(trainer.evaluate())

{'eval_loss': 0.9977290630340576, 'eval_accuracy': 0.7736990722065349, 'eval_recall': 0.8118410381184104, 'eval_precision': 0.7526315789473684, 'eval_f1': 0.7811158798283262, 'eval_runtime': 9.8967, 'eval_samples_per_second': 250.487, 'eval_steps_per_second': 7.881, 'epoch': 9.935483870967742}


In [54]:
print(trainer.evaluate(encoded_dataset["test"]))

{'eval_loss': 1.063951015472412, 'eval_accuracy': 0.7544369151339142, 'eval_recall': 0.7929936305732485, 'eval_precision': 0.7406305770374777, 'eval_f1': 0.7659181790218396, 'eval_runtime': 12.1375, 'eval_samples_per_second': 255.325, 'eval_steps_per_second': 7.992, 'epoch': 9.935483870967742}


In [55]:
trainer.push_to_hub(commit_message="text finetuning on full dataset with validation set")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/DL-Project/DL_Audio_Hatespeech_text_classification_trainer_push/commit/da72d03ba72d1c27a49b75bfe32daaaef9e0567a', commit_message='text finetuning on full dataset with validation set', commit_description='', oid='da72d03ba72d1c27a49b75bfe32daaaef9e0567a', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
from models.upload_model_to_hf import upload_model_to_huggingface

upload_model_to_huggingface(model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
#evaluate on the test dataset:
def evaluate_model(model, encoded_dataset, feature_extractor):
    trainer = Trainer(
        model=model,
        eval_dataset=encoded_dataset["test"],
        tokenizer=feature_extractor,
        compute_metrics=compute_metrics,
    )
    evaluation_results = trainer.evaluate()
    return evaluation_results, trainer


# After the model has been finetuned
evaluation_results, trainer_for_test_set_eval = evaluate_model(model_finetuned, encoded_dataset, feature_extractor)
print("Evaluation results on test set:", evaluation_results)

# Upload Model to Huggingface

In [None]:
import os

# Define the directory where the checkpoints are saved
checkpoint_directory = "models/finetuned_text_class"
checkpoints = [os.path.join(checkpoint_directory, name) for name in os.listdir(checkpoint_directory)]
checkpoints = [ckpt for ckpt in checkpoints if os.path.isdir(ckpt)]

# get latest checkpoint:
checkpoints.sort(key=lambda x: int(x.split("-")[-1]))
latest_checkpoint = checkpoints[-1]

# The last checkpoint will be the latest one
checkpoint_directory = checkpoints[-1]
print(checkpoint_directory)

models/finetuned_ast\checkpoint-75


In [None]:
# Load the latest checkpoint's model
model = AutoModelForAudioClassification.from_pretrained(checkpoint_directory)

NameError: name 'AutoModelForAudioClassification' is not defined

In [None]:
print(trainer.evaluate())

{'eval_loss': 0.4628932476043701, 'eval_accuracy': 0.7782258064516129, 'eval_recall': 0.8134328358208955, 'eval_precision': 0.7841726618705036, 'eval_f1': 0.7985347985347986, 'eval_runtime': 26.2738, 'eval_samples_per_second': 9.439, 'eval_steps_per_second': 0.609, 'epoch': 9.68}


In [None]:
from models.upload_model_to_hf import upload_model_to_huggingface

upload_model_to_huggingface(model)

README.md:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Model Instantiation
[https://huggingface.co/learn/audio-course/chapter3/classification](https://huggingface.co/learn/audio-course/chapter3/classification)

[Paper: AST: Audio Spectrogram Transformer](https://arxiv.org/pdf/2104.01778.pdf)