In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Imports

In [None]:
!pip install transformers


import os
import numpy as np
import pandas as pd
import logging
import nltk
import json
import regex
import math
import torch
import random


from nltk.corpus import stopwords
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from transformers.models.bert.tokenization_bert import BasicTokenizer
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, BertForSequenceClassification

nltk.download('stopwords')



def load_file(path):
  """Read a JSON Lines file and return its records as a list.

  Args:
    path: Path to a text file holding one JSON document per line.

  Returns:
    A list with one decoded JSON value per non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle even when a line fails to parse.
  with open(path, encoding="utf-8") as file:
    # Blank lines (e.g. a trailing newline at the end of the file) hold no record.
    return [json.loads(line) for line in file if line.strip()]




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Scratch directory; not referenced by the code visible in this notebook.
OUTPUT_DIR = "/tmp/"
# Project folder, relative to the working directory (the Drive mount lives under /content/drive).
BASE_DIR = "drive/MyDrive/GenAIContentDetection/"

# Per-dataset configuration:
#   "bert_model": Hugging Face checkpoint name used for the tokenizer and the models
#   "path":       training split (JSON Lines)
#   "test_path":  held-out split; note it points at the *dev* file
#   "unlabeled":  unlabeled split; not read by the code visible in this notebook
english = {
  "bert_model": "bert-base-cased",
  "path": BASE_DIR + "Datasets/English/academic_essay_english_train.jsonl",
  "test_path": BASE_DIR + "Datasets/English/academic_essay_english_dev.jsonl",
  "unlabeled": BASE_DIR + "Datasets/English/academic_essay_english_dev_test_no_label.jsonl"
}

arabic = {
  "bert_model": "bert-base-multilingual-cased",
  "path": BASE_DIR + "Datasets/Arabic/academic_essay_arabic_train.jsonl",
  "test_path": BASE_DIR + "Datasets/Arabic/academic_essay_arabic_dev.jsonl",
  "unlabeled": BASE_DIR + "Datasets/Arabic/academic_essay_arabic_dev_test_no_label.jsonl"
}

# Unlike the two dicts above, this one has no "unlabeled" entry.
daigt = {
  "bert_model": "bert-base-cased",
  "path": BASE_DIR + "Datasets/DAIGT/daigt_essay_train.jsonl",
  "test_path": BASE_DIR + "Datasets/DAIGT/daigt_essay_dev.jsonl",
}

# Lookup table keyed by the value of opts["language"].
datasets = {
    "english": english,
    "arabic": arabic,
    "daigt": daigt
}

Additional data

In [None]:
import json

def average_essay_length(json_files):
    """Return the mean number of whitespace-separated words per essay.

    Args:
        json_files: Iterable of paths to JSON Lines files. Each line is decoded
            as a JSON object; its "essay" value (or "" when the key is absent)
            is split on whitespace to count words.

    Returns:
        Total word count divided by the number of lines read, or 0 when no
        line was read at all.
    """
    word_total = 0
    n_essays = 0

    for file_path in json_files:
        with open(file_path, 'r') as handle:
            for raw_line in handle:
                record = json.loads(raw_line)
                # Records without an "essay" key still count as one (empty) essay.
                word_total += len(record.get("essay", "").split())
                n_essays += 1

    # Guard the division: no essays means there is no meaningful average.
    return word_total / n_essays if n_essays else 0

# Report the mean essay length of the English training split.
file_paths = [datasets["english"]["path"]]  # Replace with your file paths
average_length = average_essay_length(file_paths)
print(f"Average essay length (in words): {average_length}")

Average essay length (in words): 288.92891221374043


In [None]:
def chat_llm(prompt):
  """Send `prompt` to the "gemini-1.5-flash" model and split the reply.

  Args:
    prompt: Prompt text passed to `generate_content`.

  Returns:
    The response text split on the "$$$$$" separator, as a list of strings.
  """
  # NOTE(review): `genai` is not imported anywhere in the visible notebook --
  # presumably it is `google.generativeai`; confirm and add the import to the
  # imports cell, otherwise the next line raises NameError.
  # The key is read from the environment so it is never committed with the
  # notebook; the placeholder stays as the fallback, so behavior is unchanged
  # when GOOGLE_API_KEY is not set.
  genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "YOUR-KEY-HERE"))

  model = genai.GenerativeModel(model_name="gemini-1.5-flash")

  result = model.generate_content(prompt)
  return result.text.split("$$$$$")


def enlarge(input_files=[], num_essays=10, essay_length=300, num_sentences=100, num_paraphrase=1):
    """Ask the LLM for new essays seeded with sentences sampled from existing ones.

    Args:
        input_files: Paths of JSON Lines files whose records have an "essay" key.
            The default list is never mutated.
        num_essays: Number of essays requested in the prompt.
        essay_length: Approximate word count per essay requested in the prompt.
        num_sentences: Number of distinct seed sentences to sample. Capped at the
            number of distinct non-empty sentences available in the input.
        num_paraphrase: Maximum number of times a seed sentence may be used, as
            stated in the prompt.

    Returns:
        The list of strings returned by `chat_llm` for the built prompt.

    Raises:
        ValueError: If seed sentences are requested but the input contains no
            non-empty sentence to sample from.
    """
    data = []
    for file in input_files:
        data.extend(load_file(file))
    essays = [record["essay"] for record in data]

    # Count what can actually be sampled. Without this cap the rejection loop
    # below never terminates when the corpus holds fewer distinct sentences
    # than `num_sentences`.
    available = {
        piece.strip()
        for essay in essays
        for piece in essay.split(".")
        if piece.strip()
    }
    if num_sentences > 0 and not available:
        raise ValueError("enlarge() found no non-empty sentence to sample from input_files")
    target = min(num_sentences, len(available))

    selected_sentences = []
    while len(selected_sentences) < target:
        # Pick an essay uniformly, then one of its "."-delimited pieces uniformly.
        sample_essay = random.choice(essays)
        sentences = sample_essay.split(".")
        selected_sentence = random.choice(sentences).strip()
        if selected_sentence and selected_sentence not in selected_sentences:
            selected_sentences.append(selected_sentence)
    print(selected_sentences)
    prompt = f"""
    # System
    Write {num_essays} academic essays no title is required and the essay is expected to be one continuous block of text, each approximately {essay_length} words long.
    Write the essays to incorporate the following sentences into the various parts of the essay, you are only allowed to use the given sentence at max {num_paraphrase} in the entire collection of resulting essay, if this is greater one ensure that the exact same sentence is not being used rather the sentence is modified while maintaining its semantic meaning:
    {" ".join(selected_sentences)}
    Each essay should be distinct, and the sentences should be used in different combinations and contexts in each essay.
    The essays are to be returned as a list of string seperated by $$$$$
    # Response
    """
    new_essays = chat_llm(prompt)
    return new_essays

#### Preprocess

In [None]:
#Not used
#def preprocess(text, language="english"):
#  stop_words = "|".join(regex.sub(r"\'", '', word) for word in stopwords.words(language))
#  text = text.lower()
#  text = regex.sub(r"[?!_:\.\,\-\'\"\)\(\/\*@\n0-9]", " ", text)
#  text = regex.sub(r"\b(" + stop_words + r")\b", "", text)
#  text = regex.sub(r" +", " ", text)
#  return text

def create_dataframe(input):
  """Build a DataFrame with one row per record.

  Args:
    input: Iterable of records (e.g. the dicts returned by `load_file`).

  Returns:
    A pandas DataFrame built from the materialized list of records.
  """
  # Materialize first so generators and other one-shot iterables are accepted.
  records = list(input)
  return pd.DataFrame.from_dict(records)

def create_dataframe_from_file(path, max_size=1000, random_state=None):
  """Load a JSON Lines file into a DataFrame, subsampling large files.

  Args:
    path: Path of the JSON Lines file to read.
    max_size: Maximum number of rows to keep. Files with more records are
      randomly subsampled down to exactly this many rows.
    random_state: Optional seed (or generator) forwarded to `DataFrame.sample`
      so the subsample is reproducible. The default of None keeps the
      previous unseeded behavior.

  Returns:
    A DataFrame with at most `max_size` rows.
  """
  lines = load_file(path)
  df = create_dataframe(lines)
  if len(lines) > max_size:
    df = df.sample(n=max_size, random_state=random_state)
  return df


def load_dfs(language):
  """Load the training and held-out DataFrames for one configured dataset.

  Args:
    language: Key into the module-level `datasets` dict.

  Returns:
    A `(train_df, test_df, dev_df)` tuple. The training frame is capped at
    10000 rows and the held-out frame at 5000 rows. `test_df` and `dev_df`
    are the same DataFrame object, loaded from the "test_path" entry.
  """
  paths = datasets[language]
  train_frame = create_dataframe_from_file(paths["path"], max_size=10000)
  held_out_frame = create_dataframe_from_file(paths["test_path"], max_size=5000)
  # The same held-out frame is returned for both the test and the dev slot.
  return train_frame, held_out_frame, held_out_frame

#df, test_df, dev_df = load_dfs("daigt")
#print(dev_df)

#### Naive Bayes

In [None]:
def vocab_dictionary(df):
  """Count token occurrences over the "essay" column of `df`.

  Tokens are obtained by splitting each essay on single spaces; the empty
  strings produced by consecutive spaces are ignored.

  Args:
    df: DataFrame with an "essay" column of strings.

  Returns:
    A dict mapping each token to its number of occurrences, in order of first
    appearance.
  """
  counts = {}
  for _, row in df.iterrows():
    for token in row["essay"].split(" "):
      if token == '':
        continue
      if token in counts:
        counts[token] += 1
      else:
        counts[token] = 1
  return counts

class AIDetector:
  """Naive Bayes classifier that labels an essay as "human" or "ai".

  Tokens are the space-separated pieces of the essay. Per-class token
  likelihoods use additive smoothing and are stored as logs. The positive
  class for the metrics in `score_test` is "ai".
  """

  def train(self, df, smoothing_factor=0.01):
    """Estimate class priors and smoothed per-token log-likelihoods.

    Args:
      df: DataFrame with an "essay" column (strings) and a "label" column
        holding "human" or "ai".
      smoothing_factor: Additive smoothing constant applied to every token
        count.

    Raises:
      ZeroDivisionError: If `df` has no rows.
    """
    self.train_df = df
    self.train_vocab = vocab_dictionary(df)
    human_df = df[df['label'] == "human"]
    ai_df = df[df['label'] == "ai"]
    human_vocab = vocab_dictionary(human_df)
    ai_vocab = vocab_dictionary(ai_df)
    self.vocab_sorted = sorted(self.train_vocab.items(), key=lambda x: x[1], reverse=True)
    self.human_prior = len(human_df.index) / len(df.index)  # share of human-written essays
    self.ai_prior = len(ai_df.index) / len(df.index)  # share of AI-written essays
    self.likelihood = {}
    number_instances_human = sum(human_vocab.values())
    number_instances_ai = sum(ai_vocab.values())
    number_types = len(self.train_vocab)
    # Both denominators are loop-invariant: class token total plus smoothing mass.
    human_denominator = number_instances_human + (smoothing_factor * number_types)
    ai_denominator = number_instances_ai + (smoothing_factor * number_types)
    for word in self.train_vocab:
      human_likelihood = (human_vocab.get(word, 0) + smoothing_factor) / human_denominator
      ai_likelihood = (ai_vocab.get(word, 0) + smoothing_factor) / ai_denominator
      self.likelihood[word] = {"human": math.log(human_likelihood), "ai": math.log(ai_likelihood)}

  @staticmethod
  def _log_prior(prior):
    """Return log(prior), or -inf for a class absent from the training data."""
    return math.log(prior) if prior > 0 else float("-inf")

  def classify_essay(self, essay):
    """Classify one essay.

    Args:
      essay: Essay text. Tokens never seen during training are ignored.

    Returns:
      A `(predicted_author, scores)` tuple where `predicted_author` is
      "human" or "ai" (ties go to "human") and `scores` maps both labels to
      their log-scores.
    """
    tokens = [word for word in essay.split(" ") if word in self.likelihood]
    log_score_human = self._log_prior(self.human_prior) + sum(self.likelihood[word]["human"] for word in tokens)
    log_score_ai = self._log_prior(self.ai_prior) + sum(self.likelihood[word]["ai"] for word in tokens)
    predicted_author = "human" if log_score_human >= log_score_ai else "ai"
    return predicted_author, {'human': log_score_human, 'ai': log_score_ai}

  def score_test(self, test_df):
    """Compute the confusion matrix and precision/recall/F1 with "ai" as positive.

    Args:
      test_df: DataFrame with "essay" and "label" columns.

    Returns:
      A dict with the four confusion-matrix counts plus "precision",
      "recall" and "f1_score". A metric whose denominator is zero is
      reported as 0.0.
    """
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0

    for _, row in test_df.iterrows():
      true_label = row['label']
      predicted_author, _scores = self.classify_essay(row['essay'])
      if true_label == "human" and predicted_author == "human":
        true_negative += 1
      elif true_label == "ai" and predicted_author == "ai":
        true_positive += 1
      elif true_label == "human" and predicted_author == "ai":
        # A human essay flagged as AI is a false alarm, i.e. a false positive.
        false_positive += 1
      elif true_label == "ai" and predicted_author == "human":
        # An AI essay that slipped through is a miss, i.e. a false negative.
        false_negative += 1

    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative
    precision = true_positive / predicted_positive if predicted_positive else 0.0
    recall = true_positive / actual_positive if actual_positive else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return {
        "true_negative": true_negative,
        "false_negative": false_negative,
        "true_positive": true_positive,
        "false_positive": false_positive,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
      }

def evaluate_bayes(opts):
  """Train the Naive Bayes detector and score it on the held-out split.

  Args:
    opts: Options dict; only `opts["language"]` is read, to select the dataset.

  Returns:
    The metrics dict produced by `AIDetector.score_test` on the test frame.
  """
  train_frame, test_frame, _ = load_dfs(opts["language"])
  detector = AIDetector()
  detector.train(train_frame)
  return detector.score_test(test_frame)

#### Data Loading

In [None]:
# !pip install transformers==3.5
# Mapping from the dataset's string labels to integer class ids; id 1 ("ai") is
# the class the evaluate_* functions count as positive.
label2idx = {'human': 0, 'ai': 1}
print(label2idx)
# Checkpoint location. Checkpoint reuse is switched off: model_saved is hard-coded
# to False (the existence check is commented out), and neither name is read by the
# code visible in this notebook.
model_path = "drive/MyDrive/GenAIContentDetection/model.pth"
model_saved = False #os.path.exists(model_path)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

{'human': 0, 'ai': 1}


In [None]:
#logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
#                    datefmt = '%m/%d/%Y %H:%M:%S',
#                    level = logging.INFO)
#logger = logging.getLogger(__name__)

# Number of token ids every example is truncated or zero-padded to.
# BertCNNClassification's Linear(320, ...) layer is tied to this value:
# Conv1d(kernel 3) gives 98 positions, MaxPool1d(3) gives 32, times 10 channels = 320.
MAX_SEQ_LENGTH=100

class BertInputItem:
    """Plain record holding one tokenized example for BERT fine-tuning.

    Attributes are stored exactly as given: the raw `text`, its `input_ids`,
    the `input_mask` and `segment_ids` that accompany those ids, and the
    integer `label_id`.
    """

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text, self.input_ids = text, input_ids
        self.input_mask, self.segment_ids = input_mask, segment_ids
        self.label_id = label_id

def convert_examples_to_inputs(example_texts, example_labels, label2idx, max_seq_length, tokenizer, verbose=0):
      """Tokenize texts into fixed-length `BertInputItem` objects.

      Args:
          example_texts: Iterable of raw text strings.
          example_labels: Iterable of labels aligned with `example_texts`; each
              must be a key of `label2idx`.
          label2idx: Mapping from label to integer class id.
          max_seq_length: Length every id/mask/segment list is truncated or
              zero-padded to.
          tokenizer: Hugging Face tokenizer exposing `encode`.
          verbose: Unused; kept for call compatibility.

      Returns:
          A list with one `BertInputItem` per (text, label) pair.
      """
      input_items = []
      for text, label in zip(example_texts, example_labels):
          # Let the tokenizer add [CLS]/[SEP] itself and truncate to the target
          # length. Writing the markers into the text as well duplicated them,
          # and slicing the id list afterwards dropped the closing [SEP].
          input_ids = list(tokenizer.encode(text,
                                            add_special_tokens=True,
                                            max_length=max_seq_length,
                                            truncation=True))
          # All our tokens are in the first input segment (id 0).
          segment_ids = [0] * len(input_ids)
          # The mask has 1 for real tokens and 0 for padding tokens. Only real
          # tokens are attended to.
          input_mask = [1] * len(input_ids)
          # Zero-pad up to the sequence length.
          padding = [0] * (max_seq_length - len(input_ids))
          input_ids += padding
          input_mask += padding
          segment_ids += padding
          assert len(input_ids) == max_seq_length
          assert len(input_mask) == max_seq_length
          assert len(segment_ids) == max_seq_length
          input_items.append(
              BertInputItem(text=text,
                            input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            label_id=label2idx[label]))
      return input_items

def get_features(opts, tokenizer):
  """Tokenize the train, dev and test splits of the configured dataset.

  Args:
    opts: Options dict; `opts["language"]` selects the dataset.
    tokenizer: Tokenizer handed to `convert_examples_to_inputs`.

  Returns:
    A `(train_features, dev_features, test_features)` tuple of feature lists.
  """
  train_frame, test_frame, dev_frame = load_dfs(opts["language"])

  def featurize(frame):
    # Essays go in as a plain list; labels stay a pandas Series.
    return convert_examples_to_inputs(frame['essay'].tolist(), frame['label'], label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)

  features = featurize(train_frame)
  dev_features = featurize(dev_frame)
  test_features = featurize(test_frame)
  return features, dev_features, test_features

def get_features_from_file(path, tokenizer):
    """Load one JSON Lines file and tokenize its essays.

    Args:
        path: Path handed to `create_dataframe_from_file` (with its default
            row cap).
        tokenizer: Tokenizer handed to `convert_examples_to_inputs`.

    Returns:
        The list of features built from the file's "essay" and "label" columns.
    """
    frame = create_dataframe_from_file(path)
    essays = frame['essay'].tolist()
    labels = frame['label']
    return convert_examples_to_inputs(essays, labels, label2idx, MAX_SEQ_LENGTH, tokenizer, verbose=0)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True, max_length=1000):
    """Wrap a list of features in a `DataLoader` of long tensors.

    Args:
        features: Sequence of objects exposing `input_ids`, `input_mask`,
            `segment_ids` and `label_id`.
        max_seq_length: Unused; kept for call compatibility.
        batch_size: Number of examples per batch.
        shuffle: Whether the loader reshuffles the data every epoch.
        max_length: Unused; kept for call compatibility.

    Returns:
        A `DataLoader` yielding `(input_ids, input_mask, segment_ids, label_ids)`
        batches.
    """
    field_names = ("input_ids", "input_mask", "segment_ids", "label_id")
    # One tensor per field, stacked over all examples in their original order.
    columns = [
        torch.tensor([getattr(item, field) for item in features], dtype=torch.long)
        for field in field_names
    ]
    dataset = TensorDataset(*columns)
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)

def get_data_loader_from_file(path, tokenizer, max_seq_length, batch_size, shuffle=True):
  """Build a `DataLoader` straight from a JSON Lines file.

  Args:
    path: File to load and tokenize.
    tokenizer: Tokenizer used to build the features.
    max_seq_length: Forwarded to `get_data_loader`.
    batch_size: Number of examples per batch.
    shuffle: Whether the loader shuffles the examples. Previously this
      argument was ignored and shuffling was always on.

  Returns:
    The `DataLoader` built by `get_data_loader`.
  """
  features = get_features_from_file(path, tokenizer)
  # Honor the caller's choice instead of hard-coding shuffle=True.
  return get_data_loader(features, max_seq_length, batch_size, shuffle=shuffle)

def get_data_loaders(opts, tokenizer):
  """Create the train, test and dev data loaders for the configured dataset.

  Args:
    opts: Options dict; reads `opts["language"]` (through `get_features`) and
      `opts["batch_size"]`.
    tokenizer: Tokenizer used to build the features.

  Returns:
    A `(train_dataloader, test_dataloader, dev_dataloader)` tuple. All three
    loaders shuffle their data.
  """
  features, dev_features, test_features = get_features(opts, tokenizer)
  # Built in train, test, dev order -- the order they are returned in.
  train_dataloader, test_dataloader, dev_dataloader = [
      get_data_loader(split, MAX_SEQ_LENGTH, opts["batch_size"], shuffle=True)
      for split in (features, test_features, dev_features)
  ]
  return train_dataloader, test_dataloader, dev_dataloader


#### Bert

In [None]:
def evaluate_bert(model, dataloader):
    """Evaluate a sequence-classification model, with label id 1 as the positive class.

    Args:
        model: Model whose output exposes `.logits` when called with
            `input_ids`, `attention_mask`, `token_type_ids` and `labels`.
        dataloader: Iterable of `(input_ids, input_mask, segment_ids, label_ids)`
            tensor batches.

    Returns:
        A dict with the confusion-matrix counts (plain ints) and "precision",
        "recall" and "f1_score". A metric whose denominator is zero is
        reported as 0.0 rather than NaN.
    """
    model.eval()
    predicted_labels, correct_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask,
                            token_type_ids=segment_ids, labels=label_ids)
        # Take the arg-max in torch and move plain Python ints to the host.
        predicted_labels.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
        correct_labels.extend(label_ids.cpu().tolist())

    correct = np.array(correct_labels)
    predicted = np.array(predicted_labels)
    true_positive = int(np.sum((predicted == 1) & (correct == 1)))
    true_negative = int(np.sum((predicted == 0) & (correct == 0)))
    false_positive = int(np.sum((predicted == 1) & (correct == 0)))
    false_negative = int(np.sum((predicted == 0) & (correct == 1)))

    # Guard every ratio: a model that never predicts the positive class would
    # otherwise produce NaN metrics.
    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative
    precision = true_positive / predicted_positive if predicted_positive else 0.0
    recall = true_positive / actual_positive if actual_positive else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return {
        "true_positive": true_positive,
        "true_negative": true_negative,
        "false_positive": false_positive,
        "false_negative": false_negative,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }

In [None]:
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support


def train_bert(opts, train_dataloader):
  """Fine-tune `BertForSequenceClassification` on the training batches.

  Args:
    opts: Options dict; reads `opts["language"]` (selects the checkpoint and
      the output file name) and `opts["epochs"]`.
    train_dataloader: Iterable of `(input_ids, input_mask, segment_ids, label_ids)`
      tensor batches.

  Returns:
    The trained model. Its state dict is also saved to
    `BASE_DIR + "model_ffnn_" + language + ".bin"`.
  """
  bert_model = BertForSequenceClassification.from_pretrained(datasets[opts["language"]]["bert_model"], num_labels = len(label2idx))
  bert_model.to(device)
  #
  LEARNING_RATE = 5e-5
  MAX_GRAD_NORM = 1
  #
  optimizer = torch.optim.AdamW(bert_model.parameters(), lr=LEARNING_RATE)

  for _ in trange(int(opts["epochs"]), desc="Epoch"):
      bert_model.train()
      tr_loss = 0
      for batch in tqdm(train_dataloader, desc="Training iteration"):
          input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)

          outputs = bert_model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
          loss = outputs[0]
          loss.backward()
          tr_loss += loss.item()

          # Clip before stepping so one bad batch cannot blow up the weights.
          torch.nn.utils.clip_grad_norm_(bert_model.parameters(), MAX_GRAD_NORM)

          optimizer.step()
          optimizer.zero_grad()

      # Summed (not averaged) batch loss for the epoch.
      print(tr_loss)
  torch.save(bert_model.state_dict(), BASE_DIR + "model_ffnn_" + opts["language"] +  ".bin")
  return bert_model

#### CNN

In [None]:
class BertCNNClassification(BertPreTrainedModel):
    """BERT encoder followed by a 1-D convolution, max-pooling and a linear classifier.

    The linear layer's input size (320) is tied to a sequence length of 100:
    Conv1d(kernel 3) gives 98 positions, MaxPool1d(3) gives 32, times 10
    channels = 320. Other sequence lengths will fail at `self.fc`.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        # Input channels follow the encoder width instead of a hard-coded 768.
        self.cnn = torch.nn.Conv1d(config.hidden_size, 10, 3)
        self.pool = torch.nn.MaxPool1d(3)
        self.dropout = torch.nn.Dropout(0.2)
        self.fc = torch.nn.Linear(320, self.num_labels)

        # Kept only so the attribute still exists; it has no parameters and is
        # no longer applied in forward().
        self.sigmoid = torch.nn.Sigmoid()

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids = None,
        attention_mask = None,
        token_type_ids = None,
        labels = None,

    ):
        """Run the model.

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.
            token_type_ids: Segment id tensor passed to the BERT encoder.
            labels: Optional class-id tensor; when given, the cross-entropy
                loss is computed.

        Returns:
            A two-element list `[loss, logits]`. `loss` is None when `labels`
            is None. `logits` are the raw per-class scores, so arg-max
            predictions are the same as with the previous sigmoid output.
        """
        # Only the final layer is used, so all hidden states are not requested.
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        x = outputs.last_hidden_state
        # Conv1d convolves over the last axis, so move the token axis there.
        x = x.permute(0, 2, 1)
        x = self.cnn(x)
        x = self.pool(x)
        x = torch.flatten(x, start_dim=1)
        x = self.dropout(x)
        # CrossEntropyLoss applies log-softmax itself and expects raw logits,
        # so the scores are not squashed through a sigmoid first.
        x = self.fc(x)

        loss = None
        if labels is not None:
          loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
          loss = loss_fct(x, labels)

        return [loss, x]



In [None]:
def evaluate_cnn(model, dataloader):
    """Evaluate a model returning `[loss, scores]`, with label id 1 as the positive class.

    Args:
        model: Model whose call returns a sequence with the per-class scores
            at index 1.
        dataloader: Iterable of `(input_ids, input_mask, segment_ids, label_ids)`
            tensor batches.

    Returns:
        A dict with the confusion-matrix counts (plain ints) and "precision",
        "recall" and "f1_score". A metric whose denominator is zero is
        reported as 0.0 rather than NaN.
    """
    model.eval()
    predicted_labels, correct_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask,
                            token_type_ids=segment_ids, labels=label_ids)
        # The per-batch debug dump of `outputs` was removed; it flooded the cell output.
        predicted_labels.extend(torch.argmax(outputs[1], dim=1).cpu().tolist())
        correct_labels.extend(label_ids.cpu().tolist())

    correct = np.array(correct_labels)
    predicted = np.array(predicted_labels)
    true_positive = int(np.sum((predicted == 1) & (correct == 1)))
    true_negative = int(np.sum((predicted == 0) & (correct == 0)))
    false_positive = int(np.sum((predicted == 1) & (correct == 0)))
    false_negative = int(np.sum((predicted == 0) & (correct == 1)))

    # Guard every ratio: a model that never predicts the positive class would
    # otherwise produce NaN metrics.
    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative
    precision = true_positive / predicted_positive if predicted_positive else 0.0
    recall = true_positive / actual_positive if actual_positive else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return {
        "true_positive": true_positive,
        "true_negative": true_negative,
        "false_positive": false_positive,
        "false_negative": false_negative,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }

In [None]:
def train_cnn(opts, train_dataloader):
  """Fine-tune `BertCNNClassification` on the training batches.

  Args:
    opts: Options dict; reads `opts["language"]` (selects the checkpoint and
      the output file name) and `opts["epochs"]`.
    train_dataloader: Iterable of `(input_ids, input_mask, segment_ids, label_ids)`
      tensor batches.

  Returns:
    The trained model. Its state dict is also saved to
    `BASE_DIR + "model_cnn_" + language + ".bin"`.
  """
  cnn_model = BertCNNClassification.from_pretrained(datasets[opts["language"]]["bert_model"], num_labels = len(label2idx))
  # Explicitly initialize the convolution and classifier layers.
  # NOTE(review): uniform(-1, 1) is a very wide range for these layers --
  # confirm it is intended.
  for parameter in (cnn_model.cnn.weight, cnn_model.cnn.bias, cnn_model.fc.weight, cnn_model.fc.bias):
    torch.nn.init.uniform_(parameter, -1, 1)
  cnn_model.to(device)
  #
  # NOTE(review): 5e-8 is three orders of magnitude below the 5e-5 used by
  # train_bert -- confirm it is intended.
  LEARNING_RATE = 5e-8
  MAX_GRAD_NORM = 1
  #
  optimizer = torch.optim.Adam(cnn_model.parameters(), lr=LEARNING_RATE)

  # A stray `bert_model` token inside this trange(...) call used to make the
  # whole cell a SyntaxError.
  for _ in trange(int(opts["epochs"]), desc="Epoch"):
    cnn_model.train()
    tr_loss = 0
    for batch in tqdm(train_dataloader, desc="Training iteration"):
      input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)

      outputs = cnn_model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
      loss = outputs[0]
      loss.backward()
      # Clip before stepping so one bad batch cannot blow up the weights.
      torch.nn.utils.clip_grad_norm_(cnn_model.parameters(), MAX_GRAD_NORM)
      tr_loss += loss.item()

      optimizer.step()
      optimizer.zero_grad()
    # Summed (not averaged) batch loss for the epoch.
    print(tr_loss)
  torch.save(cnn_model.state_dict(), BASE_DIR + "model_cnn_" + opts["language"] +  ".bin")
  return cnn_model




#### RNN

In [None]:
class BertRNNClassification(BertPreTrainedModel):
    """BERT encoder followed by a single-layer LSTM and a linear classifier."""

    def __init__(self, config):
        super().__init__(config)
        self.layers = 1
        # LSTM input width follows the encoder width instead of a hard-coded 768.
        self.embed_dim = config.hidden_size
        self.hidden_size = 5
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(0.2)
        # batch_first=True: the encoder output has the batch on axis 0. Without
        # it the LSTM treated the batch axis as time and recurred across
        # different essays instead of across tokens. Parameter shapes are
        # unchanged, so existing checkpoints still load.
        self.rnn = torch.nn.LSTM(self.embed_dim, self.hidden_size, self.layers, batch_first=True)
        self.fc = torch.nn.Linear(self.hidden_size, self.num_labels)
        # Kept only so the attribute still exists; it has no parameters and is
        # no longer applied in forward().
        self.sigmoid = torch.nn.Sigmoid()

        # NOTE(review): the recorded run of this notebook lists the LSTM and fc
        # weights as "newly initialized" and prints a `nan` loss every epoch;
        # whether those weights receive a valid initialization under
        # from_pretrained should be confirmed.

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids = None,
        attention_mask = None,
        token_type_ids = None,
        labels = None,

    ):
        """Run the model.

        Args:
            input_ids: Token id tensor passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.
            token_type_ids: Segment id tensor passed to the BERT encoder.
            labels: Optional class-id tensor; when given, the cross-entropy
                loss is computed.

        Returns:
            A two-element list `[loss, logits]`. `loss` is None when `labels`
            is None. `logits` are the raw per-class scores computed from the
            LSTM output at the last sequence position.
        """
        # Only the final layer is used, so all hidden states are not requested.
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        x = outputs.last_hidden_state
        # Initial states are sized by the batch (axis 0) and created on the
        # same device and dtype as the activations, not on the global `device`.
        h0 = x.new_zeros(self.layers, x.size(0), self.hidden_size)
        c0 = x.new_zeros(self.layers, x.size(0), self.hidden_size)
        x = self.dropout(x)
        x = self.rnn(x, (h0, c0))[0]
        # CrossEntropyLoss applies log-softmax itself and expects raw logits,
        # so the scores are not squashed through a sigmoid first.
        x = self.fc(x[:, -1, :])
        loss = None
        if labels is not None:
          loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
          loss = loss_fct(x, labels)

        return [loss, x]




In [None]:
def train_rnn(opts, train_dataloader):
  """Fine-tune `BertRNNClassification` on the training batches.

  Args:
    opts: Options dict; reads `opts["language"]` (selects the checkpoint and
      the output file name) and `opts["epochs"]`.
    train_dataloader: Iterable of `(input_ids, input_mask, segment_ids, label_ids)`
      tensor batches.

  Returns:
    The trained model. Its state dict is also saved to
    `BASE_DIR + "model_rnn_" + language + ".bin"`.
  """
  rnn_model = BertRNNClassification.from_pretrained(datasets[opts["language"]]["bert_model"], num_labels = len(label2idx))
  rnn_model.to(device)
  # NOTE(review): 5e-8 is three orders of magnitude below the 5e-5 used by
  # train_bert -- confirm it is intended.
  LEARNING_RATE = 5e-8
  MAX_GRAD_NORM = 1
  #
  optimizer = torch.optim.AdamW(rnn_model.parameters(), lr=LEARNING_RATE)

  for _ in trange(int(opts["epochs"]), desc="Epoch"):
    rnn_model.train()
    tr_loss = 0
    for batch in tqdm(train_dataloader, desc="Training iteration"):
      input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)

      outputs = rnn_model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
      loss = outputs[0]

      loss.backward()
      # Clip before stepping so one bad batch cannot blow up the weights.
      torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), MAX_GRAD_NORM)
      tr_loss += loss.item()

      optimizer.step()
      optimizer.zero_grad()
    # Summed (not averaged) batch loss for the epoch. The recorded run of this
    # notebook printed `nan` here for every epoch.
    print(tr_loss)
  torch.save(rnn_model.state_dict(), BASE_DIR + "model_rnn_" + opts["language"] +  ".bin")
  return rnn_model

In [None]:
def evaluate_rnn(model, dataloader):
    """Evaluate a model returning `[loss, scores]`, with label id 1 as the positive class.

    Args:
        model: Model whose call returns a sequence with the per-class scores
            at index 1.
        dataloader: Iterable of `(input_ids, input_mask, segment_ids, label_ids)`
            tensor batches.

    Returns:
        A dict with the confusion-matrix counts (plain ints) and "precision",
        "recall" and "f1_score". A metric whose denominator is zero is
        reported as 0.0 rather than NaN.
    """
    model.eval()
    predicted_labels, correct_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        input_ids, input_mask, segment_ids, label_ids = (t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask,
                            token_type_ids=segment_ids, labels=label_ids)
        # Take the arg-max in torch and move plain Python ints to the host.
        predicted_labels.extend(torch.argmax(outputs[1], dim=1).cpu().tolist())
        correct_labels.extend(label_ids.cpu().tolist())

    correct = np.array(correct_labels)
    predicted = np.array(predicted_labels)
    true_positive = int(np.sum((predicted == 1) & (correct == 1)))
    true_negative = int(np.sum((predicted == 0) & (correct == 0)))
    false_positive = int(np.sum((predicted == 1) & (correct == 0)))
    false_negative = int(np.sum((predicted == 0) & (correct == 1)))

    # Guard every ratio: a model that never predicts the positive class (for
    # example after a NaN training loss) would otherwise produce NaN metrics.
    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative
    precision = true_positive / predicted_positive if predicted_positive else 0.0
    recall = true_positive / actual_positive if actual_positive else 0.0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return {
        "true_positive": true_positive,
        "true_negative": true_negative,
        "false_positive": false_positive,
        "false_negative": false_negative,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }


In [None]:
# Experiment configuration:
#   "language":   key into `datasets` (selects data files and BERT checkpoint)
#   "batch_size": batch size of every data loader
#   "epochs":     number of training epochs
#   "model":      which branch below runs: "bayes", "bert", "cnn" or "rnn"
opts = {
    "language": "english",
    "batch_size": 16,
    "epochs": 15,
    "model": "rnn"
}
tokenizer = BertTokenizer.from_pretrained(datasets[opts["language"]]["bert_model"])
# Note: `model` holds the model *name* (a string), not a model object.
model = opts["model"]

# The BERT data loaders are built for every choice, including "bayes", which does not use them.
train_dataloader, test_dataloader, dev_dataloader = get_data_loaders(opts, tokenizer)
# Stays None when opts["model"] matches none of the branches below.
scores = None
if model == "bayes":
  scores = evaluate_bayes(opts)
elif model == "bert":
  bert_model = train_bert(opts, train_dataloader)
  scores = evaluate_bert(bert_model, dev_dataloader)
elif model == "cnn":
  cnn_model = train_cnn(opts, train_dataloader)
  scores = evaluate_cnn(cnn_model, dev_dataloader)
elif model == "rnn":
  rnn_model = train_rnn(opts, train_dataloader)
  scores = evaluate_rnn(rnn_model, dev_dataloader)
print(scores)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
Some weights of BertRNNClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['fc.bias', 'fc.weight', 'rnn.bias_hh_l0', 'rnn.bias_ih_l0', 'rnn.weight_hh_l0', 'rnn.weight_ih_l0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Training iteration:   0%|          | 0/131 [00:00<?, ?it/s]

Epoch:   7%|▋         | 1/15 [00:31<07:19, 31.36s/it]

nan


Training iteration:   0%|          | 0/131 [00:00<?, ?it/s]

Epoch:  13%|█▎        | 2/15 [01:01<06:40, 30.83s/it]

nan


Training iteration:   0%|          | 0/131 [00:00<?, ?it/s]

Epoch:  20%|██        | 3/15 [01:32<06:10, 30.92s/it]

nan


Training iteration:   0%|          | 0/131 [00:00<?, ?it/s]

Epoch:  27%|██▋       | 4/15 [02:05<05:46, 31.53s/it]

nan


Training iteration:   0%|          | 0/131 [00:00<?, ?it/s]