In [1]:
! pip install transformers datasets torch scikit-learn



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Load CSVs
train = pd.read_csv('train.csv')
val = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')

# Check the structure of the data
print(train.head())
print(val.head())
print(test.head())


                 id                                      feedback_text  toxic  \
0  281d77b7bebc2201  :::Sounds good.  Let me know when you're done ...      0   
1  716aac7bf3c63db1  "\nI say something, but it didn't actually con...      0   
2  57cb318c6edcf10c  "Agustina Barrientos]] \n | Modelo de Piñeiro ...      0   
3  dc3bd70118d91b3a  FYI I enjoy licking strangers scrotal sacks......      1   
4  cf10d41f2997d233  How do you get a site?\nMany penguins have ask...      0   

   abusive  vulgar  menace  offense  bigotry  
0        0       0       0        0        0  
1        0       0       0        0        0  
2        0       0       0        0        0  
3        0       1       0        0        0  
4        0       0       0        0        0  
     id                                      feedback_text lang  toxic
0  1203  İyi tamam olabilir. Balkanlar maddesini gelişt...   tr      0
1  5871  Por dios, y la canción de John Lennon: http://...   es      1
2  3590  Selam. Ön

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required resources
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization
    return ' '.join(words)

# Apply preprocessing
train['clean_text'] = train['feedback_text'].apply(preprocess_text)
val['clean_text'] = val['feedback_text'].apply(preprocess_text)
test['clean_text'] = test['content'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the datasets
train_encodings = tokenizer(train['clean_text'].tolist(), padding=True, truncation=True, max_length=512)
val_encodings = tokenizer(val['clean_text'].tolist(), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test['clean_text'].tolist(), padding=True, truncation=True, max_length=512)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Convert tokenized data into datasets
train_dataset = Dataset.from_dict(train_encodings)
val_dataset = Dataset.from_dict(val_encodings)
test_dataset = Dataset.from_dict(test_encodings)

# Add the labels (target variable)
train_dataset = train_dataset.add_column('labels', train['toxic'].tolist())
val_dataset = val_dataset.add_column('labels', val['toxic'].tolist())


In [6]:
# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # Output directory for checkpoints
    num_train_epochs=3,              # Number of epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Warmup steps for learning rate
    weight_decay=0.01,               # Weight decay for regularization
    logging_dir='./logs',            # Directory for logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate after every epoch
    save_strategy="epoch"           # Save checkpoint after each epoch
)




In [8]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)  # Get the predicted labels
    labels = p.label_ids  # Get the true labels
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [9]:
trainer = Trainer(
    model=model,                         # Pre-trained BERT model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training data
    eval_dataset=val_dataset,            # Validation data
    compute_metrics=compute_metrics      # Metrics for evaluation
)


In [10]:
trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmeraj241-25-013[0m ([33mmeraj241-25-013-s[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1819,0.632411,0.815476,0.216216,0.059701,0.093567
2,0.1798,0.897518,0.790476,0.181818,0.089552,0.12
3,0.2504,0.845746,0.840476,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=8805, training_loss=0.21714241282463345, metrics={'train_runtime': 6500.5494, 'train_samples_per_second': 10.833, 'train_steps_per_second': 1.355, 'total_flos': 1.852801740739584e+16, 'train_loss': 0.21714241282463345, 'epoch': 3.0})

In [11]:
# Evaluate the model on the validation set
eval_results = trainer.evaluate(val_dataset)
print("Validation Results:", eval_results)

# Evaluate the model on the test set
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Results: {'eval_loss': 0.8457461595535278, 'eval_accuracy': 0.8404761904761905, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 26.1074, 'eval_samples_per_second': 32.175, 'eval_steps_per_second': 4.022, 'epoch': 3.0}
Test Results: {'eval_runtime': 184.8827, 'eval_samples_per_second': 36.239, 'eval_steps_per_second': 4.533, 'epoch': 3.0}


In [12]:
# Save the fine-tuned model and tokenizer
model.save_pretrained('./bert_toxic_classifier')
tokenizer.save_pretrained('./bert_toxic_classifier')


('./bert_toxic_classifier/tokenizer_config.json',
 './bert_toxic_classifier/special_tokens_map.json',
 './bert_toxic_classifier/vocab.txt',
 './bert_toxic_classifier/added_tokens.json')

In [13]:
# Predict on new data (e.g., test dataset)
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)

# Display the predicted labels
print(pred_labels)


[0 0 0 ... 0 0 0]
