# Import libraries

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader

In [None]:
# Mount Google Drive; the data and model paths used below live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Load the CSV and extract the text input feature and the target variable y

In [None]:
# Read the preprocessed collection from Drive and pull out the model input
# (text column) and the target (moderation status column) as NumPy arrays.
train_csv_path = "/content/drive/MyDrive/BDE/BERT/merged_yt_collection_preprocessedtxt.csv"
df = pd.read_csv(train_csv_path)
text, y = (df[column].to_numpy() for column in ("preprocessed_text", "moderationStatus"))

# Train and test split

In [None]:
# Hold out 20% of the rows; stratify on the target so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
test_size = 0.2
X_train, X_test, Y_train, Y_test = train_test_split(
    text,
    y,
    stratify=y,
    random_state=42,
    test_size=test_size,
)

# Fine-tuning the model

In [None]:
# The tokenizer call and torch.tensor below are fed plain Python lists,
# so convert the NumPy training arrays once here.
X_train_list, Y_train_list = (list(part) for part in (X_train, Y_train))

In [None]:
# Seed PyTorch before the model is created: the classification head on top of
# BERT is freshly (randomly) initialised and the training DataLoader shuffles,
# so without a seed every run starts from a different state. The value mirrors
# the random_state used for the train/test split. (GPU kernels may still add
# some nondeterminism.)
SEED = 42
torch.manual_seed(SEED)

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Load and preprocess the data
train_texts = X_train_list   # List of training texts
train_labels = Y_train_list  # List of corresponding training labels (0 or 1)

def preprocess(texts, labels, max_length=None):
    """Tokenize ``texts`` and turn ``labels`` into a tensor.

    Args:
        texts: Iterable of documents. Entries that are not ``str`` are
            normalised before tokenization: ``None`` and NaN (what pandas
            yields for an empty CSV cell) become ``""``; anything else is
            passed through ``str()``.
        labels: Sequence of labels. May be empty when only inputs are needed
            (inference).
        max_length: Optional truncation length forwarded to the tokenizer.
            ``None`` leaves the tokenizer's own default in effect.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a plain dict of the tensors
        produced by the tokenizer and ``labels`` is ``torch.tensor(labels)``.
        Both are left on the CPU: the training and inference loops in this
        notebook move each batch to ``device`` themselves, so keeping the
        full corpus off the accelerator avoids holding it all in GPU memory.
    """
    clean_texts = [
        t if isinstance(t, str) else ("" if t is None or t != t else str(t))  # t != t is True only for NaN
        for t in texts
    ]
    encoded = tokenizer(
        clean_texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    inputs = {k: v for k, v in encoded.items()}
    labels = torch.tensor(labels)
    return inputs, labels

# Tokenize the training texts, pair every example's token ids and attention
# mask with its label, and serve them in shuffled mini-batches of 16.
train_inputs, train_labels = preprocess(train_texts, train_labels)
train_data = [
    (ids, mask, label)
    for ids, mask, label in zip(
        train_inputs['input_ids'], train_inputs['attention_mask'], train_labels
    )
]
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

# Hyper-parameters for fine-tuning
epochs = 3
learning_rate = 2e-5

# Put the model in training mode before any optimisation step is taken
model.train()

# Optimizer over every model parameter. `loss_fn` is created here as well, but
# the loop below uses the loss the model returns itself when it is given `labels`.
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss()

# One pass over the shuffled training batches per epoch
for epoch in range(epochs):
    batch_losses = []

    for ids_batch, mask_batch, label_batch in train_dataloader:
        optimizer.zero_grad()

        # Forward pass on the selected device; passing labels makes the model compute the loss
        outputs = model(
            ids_batch.to(device),
            attention_mask=mask_batch.to(device),
            labels=label_batch.to(device),
        )
        loss = outputs.loss

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch
    average_loss = sum(batch_losses) / len(train_dataloader)
    print(f"Epoch {epoch+1}: Average Loss = {average_loss}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1: Average Loss = 0.48377565257990235
Epoch 2: Average Loss = 0.4317950873941079
Epoch 3: Average Loss = 0.3509556978697538


In [None]:
# Directory on Drive that holds the fine-tuned checkpoint
fine_tuned_dir = '/content/drive/MyDrive/BDE/Models/fine-tuned-bert'

# Write the fine-tuned model out ...
model.save_pretrained(fine_tuned_dir)

# ... and read it back into a separate object, which the cells below use for inference
loaded_model = BertForSequenceClassification.from_pretrained(fine_tuned_dir)

# Evaluating the model

In [None]:
# Tokenize the held-out texts. Labels are not needed for inference, so an empty
# list is passed and the returned (empty) label tensor is discarded.
held_out_texts = list(X_test)
input_inputs, _ = preprocess(held_out_texts, [])

# One (token ids, attention mask) pair per example, served in order so the
# predictions line up with Y_test.
input_data = [
    (ids, mask)
    for ids, mask in zip(input_inputs['input_ids'], input_inputs['attention_mask'])
]
input_dataloader = DataLoader(input_data, shuffle=False, batch_size=16)

In [None]:
# Switch the reloaded model to evaluation mode before predicting. eval() returns
# the module itself, so the trailing semicolon keeps the notebook from printing
# the full architecture as the cell output.
loaded_model.eval();

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Predict a class id for every example served by `input_dataloader`.
loaded_model = loaded_model.to(device)
input_predictions = []

with torch.no_grad():  # inference only, no gradients required
    for ids_batch, mask_batch in input_dataloader:
        batch_logits = loaded_model(
            input_ids=ids_batch.to(device),
            attention_mask=mask_batch.to(device),
        ).logits

        # The predicted label is the index of the largest logit in each row
        batch_labels = batch_logits.argmax(dim=1)

        input_predictions.extend(batch_labels.cpu().numpy())

# Flat NumPy array of predicted labels, in dataloader order
input_predictions = np.array(input_predictions)

In [None]:
# Display the predicted labels for a quick sanity check (NumPy abbreviates long arrays).
input_predictions

array([0, 0, 0, ..., 0, 1, 0])

In [None]:

# Macro-averaged F1 of the predictions against the held-out labels:
# the per-class F1 scores are averaged with equal weight.
macro_f1_score = f1_score(
    y_true=Y_test,
    y_pred=input_predictions,
    average="macro",
)
print(f"Macro f1-score: {macro_f1_score}")

Macro f1-score: 0.6812267310214712


# Save model

# Predict on test set

In [None]:
# Locations of the test collection (JSON) and of the CSV that will receive the predictions
path_test = "/content/drive/MyDrive/BDE/Filtered_collections/y_test_collection.json"
csv_test_path  = "/content/drive/MyDrive/BDE/Predictions/y_test.csv"

# Parse the raw test collection. NOTE(review): `test_json` is not referenced
# again in the cells visible here — confirm whether it is still needed.
with open(path_test, 'r') as test_file:
    test_json = json.loads(test_file.read())

# Frame whose "moderationStatus" column is filled in at the end of the notebook
df_test = pd.read_csv(csv_test_path)

In [None]:
# Preprocessed text of the test collection, as a NumPy array of documents
test_text_csv = "/content/drive/MyDrive/BDE/RNN/merged_yt_collection_test_preprocessedtxt.csv"
df_test_text = pd.read_csv(test_text_csv)
text_test = df_test_text.loc[:, "preprocessed_text"].to_numpy()

In [None]:
# Tokenize the test-collection texts (no labels exist, hence the empty list) and
# rebuild `input_dataloader` over them, unshuffled so predictions keep row order.
input_inputs, _ = preprocess(list(text_test), [])
input_data = list(
    zip(*(input_inputs[field] for field in ('input_ids', 'attention_mask')))
)
input_dataloader = DataLoader(
    input_data,
    batch_size=16,
    shuffle=False,
)

In [None]:
# Make sure the model is in evaluation mode for the test-set predictions; the
# trailing semicolon stops the cell from echoing the whole module repr.
loaded_model.eval();

In [None]:
# Collect the predicted class id of every test-collection example
input_predictions_test = []
loaded_model = loaded_model.to(device)

with torch.no_grad():  # gradients are not needed for prediction
    for ids, mask in input_dataloader:
        model_out = loaded_model(
            input_ids=ids.to(device),
            attention_mask=mask.to(device),
        )

        # torch.max over dim=1 yields (values, indices); the indices are the class ids
        _top_logit, best_class = torch.max(model_out.logits, dim=1)

        input_predictions_test.extend(best_class.cpu().numpy())

input_predictions_test = np.array(input_predictions_test)

In [None]:
# Translate the numeric predictions into the status strings (0 -> "not moderated",
# anything else -> "moderated") and store them in the output frame.
status_labels = np.where(input_predictions_test == 0, "not moderated", "moderated")
df_test["moderationStatus"] = status_labels

# The validation macro-F1 is embedded in the file name to identify this run
predictions_csv = f"/content/drive/MyDrive/BDE/Predictions/BERT_model_{macro_f1_score:.10f}_y.csv"
df_test.to_csv(predictions_csv, index=False)