<a href="https://colab.research.google.com/github/GiovanniSorice/Hate_Speech_Detection/blob/main/AlBerto_with_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AlBERTo Hate Speech Classifier 

In [3]:
!pip install ekphrasis
!pip install transformers
import datetime
import sys
import warnings
warnings.filterwarnings("ignore")

#for code working
import tensorflow as tf
import re
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import numpy as np
import pandas as pd 
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from transformers import BertModel, Trainer, TrainingArguments, BertTokenizer, glue_convert_examples_to_features, AutoModel, AlbertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
#Prepare and import BERT modules
import subprocess
subprocess.call(["git", "clone", "https://github.com/google-research/bert","bert_repo"])

if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']

# import python modules defined by BERT
import modeling
import tokenization
from sklearn.model_selection import StratifiedKFold

Collecting ekphrasis
[?25l  Downloading https://files.pythonhosted.org/packages/92/e6/37c59d65e78c3a2aaf662df58faca7250eb6b36c559b912a39a7ca204cfb/ekphrasis-0.5.1.tar.gz (80kB)
[K     |████                            | 10kB 19.1MB/s eta 0:00:01[K     |████████▏                       | 20kB 20.4MB/s eta 0:00:01[K     |████████████▎                   | 30kB 10.5MB/s eta 0:00:01[K     |████████████████▍               | 40kB 5.7MB/s eta 0:00:01[K     |████████████████████▌           | 51kB 6.9MB/s eta 0:00:01[K     |████████████████████████▌       | 61kB 8.1MB/s eta 0:00:01[K     |████████████████████████████▋   | 71kB 8.7MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.3MB/s 
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting ujson
[?25l  Downloading https://files.pythonhosted.org/packages/17/4e/50e8e4cf5f00b537095711c2c

In [4]:
from google.colab import drive
# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


## Load the dataset 

In [5]:
# directory name 
input_dir = '/content/drive/My Drive/HLT/clean_dataset_training/' 
AlBERTo_path = '/content/drive/MyDrive/HLT/alberto_uncased_L-12_H-768_A-12_italian_ckpt/'
input_dir_not_clean = '/content/drive/My Drive/HLT/dataset_training/' 

# Spec
pd.set_option("display.max_colwidth", None)

In [6]:
raw_tsv_file = open(input_dir_not_clean+"haspeede2_dev_taskAB.tsv")
raw_dataset = pd.read_csv(raw_tsv_file,sep='\t')
raw_dataset.rename(columns={"text ": "text"}, inplace=True)

In [7]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag"},
    fix_html=True,  # fix HTML tokens

    unpack_hashtags=True,  # perform word segmentation on hashtags

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

class AlBERTo_Preprocessing(object):
    def __init__(self, do_lower_case=True, **kwargs):
        self.do_lower_case = do_lower_case

    def preprocess(self, text):
        if self.do_lower_case:
            text = text.lower()
        text = str(" ".join(text_processor.pre_process_doc(text)))
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)
        return text

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading english - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/english/counts_1grams.txt
Reading english - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/english/counts_2grams.txt
Reading english - 1grams ...


In [8]:
AlBERTo_Preprocess = AlBERTo_Preprocessing(do_lower_case=True)
def preproc(text):
    return AlBERTo_Preprocess.preprocess(text)

In [9]:
raw_dataset['text_1'] = raw_dataset['text'].apply(preproc)

In [10]:
tsv_file = open(input_dir+"training_dataset.csv")

dataset = pd.read_csv(tsv_file,sep=',')

In [None]:
import pprint

try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
  raise BaseException('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())
  

# Prepare sentences to be converted 

In [10]:
X_train, X_valid, Y_train, Y_valid = train_test_split(raw_dataset['text_1'], raw_dataset['hs'], test_size=0.2, random_state=128)

In [11]:
X_train_values=list(X_train.values)
X_valid_values=list(X_valid.values)

# Tokenizer 

In [21]:
import tensorflow as tf
tokenizer = BertTokenizer.from_pretrained('m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1108715.0, style=ProgressStyle(descript…




In [13]:
encoding_train = tokenizer(X_train_values, padding=True, truncation=True, max_length=64)
encoding_valid = tokenizer(X_valid_values, padding=True, truncation=True, max_length=64)

In [14]:
input_ids_train = torch.tensor(encoding_train['input_ids'])
attention_mask_train = torch.tensor(encoding_train['attention_mask'])
input_ids_valid = torch.tensor(encoding_valid['input_ids'])
attention_mask_valid = torch.tensor(encoding_valid['attention_mask'])

In [15]:
# Convert other data types to torch.Tensor
train_labels = torch.tensor(Y_train.values)
val_labels = torch.tensor(Y_valid.values)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(input_ids_train, attention_mask_train, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(input_ids_valid, attention_mask_valid, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Load Pre-Trained Model 

In [11]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [12]:
# Create the BertClassfier class
import torch.nn as nn
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, Hidden_1, Hidden_2, D_out = 768, 256, 64, 2

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, Hidden_2),
            nn.ReLU(),
            #nn.Dropout(0.5),
            #nn.Linear(Hidden_1, Hidden_2),
            #nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(Hidden_2, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits

    def freeze(self, freeze_bert=False):
        # Freeze or unfreeze the BERT model
        for param in self.bert.parameters():
            param.requires_grad = not freeze_bert


In [13]:
OUTPUT_DIR = AlBERTo_path + 'output'
#Inizialize AlBERTo
INIT_CHECKPOINT = AlBERTo_path + 'alberto_model.ckpt'
#SET THE PARAMETERS
PREDICT_BATCH_SIZE = 8
MAX_SEQ_LENGTH = 128
LEARNING_RATE = 2e-5
label_list = [0, 1]
#BERT_CONFIG= modeling.BertConfig.from_json_file(AlBERTo_path + "config.json")

#SET THE PARAMETERS FOR TRAINING 
TRAIN_BATCH_SIZE = 128
PREDICT_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 512
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
MAX_SEQ_LENGTH = 64
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500
 

In [14]:
#inizialize parameters
num_train_steps = int(len(input_ids_train) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)+1
num_warmup_steps = int(NUM_TRAIN_EPOCHS * WARMUP_PROPORTION)
print(num_train_steps)
print(num_warmup_steps)

NameError: ignored

In [43]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(len_data, epochs=4, num_warmup_steps = 0):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=2e-5,  
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len_data * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [46]:
import random
import time
from sklearn.metrics import f1_score
from torch import tensor
# Specify loss function
loss_fn = nn.CrossEntropyLoss()
sig = nn.Sigmoid()

def f1_score_func(preds, labels):
    preds_copy = torch.tensor(preds)
    preds_flat = np.argmax(preds_copy.cpu(), axis=1).flatten()
    labels_flat = labels.cpu().flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def set_seed(seed_value=128, random_seed = False):
    """Set seed for reproducibility.
    """
    if not random_seed:
      random.seed(seed_value)
      np.random.seed(seed_value)
      torch.manual_seed(seed_value)
      torch.cuda.manual_seed_all(seed_value)
    else:
      random.seed()
      np.random.seed()
      torch.manual_seed(random.getrandbits(32))
      torch.cuda.manual_seed_all(random.getrandbits(32))


def train(model, train_dataloader, optimizer, scheduler, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.
    """
    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'F1 Train':^9} | {'Val Loss':^10} | {'Val Acc':^9} | {'F1 Val':^9} | {'Elapsed':^9}")
        print("-"*90)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts, f1_value_train_batch, f1_value_train_tot  = 0, 0, 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to GPU
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            #b_labels = b_labels *1.0
            #b_labels = b_labels.unsqueeze(1)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            f1_value_train_batch+= f1_score_func(logits, b_labels) 
            f1_value_train_tot+= f1_score_func(logits, b_labels) 

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed for 20 batches
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {f1_value_train_batch / batch_counts:^9.2f} | {'-':^10} | {'-':^9} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts, f1_value_train_batch = 0, 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        avg_f1_value = f1_value_train_tot / len(train_dataloader)

        print("-"*90)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation == True:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy, f1_value_validation = evaluate(model, val_dataloader)

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {avg_f1_value:^9.2f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {f1_value_validation:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*90)
        print("\n")
    
    print("Training complete!")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_value = []
    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        #b_labels = b_labels *1.0
        #b_labels = b_labels.unsqueeze(1)
        # print("logits: {} b_labels: {}".format(sig(logits), b_labels))
        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()
        #print("logits: {0} b_labels: {1} preds: {2}".format(sig(logits), b_labels, preds))
        #print("preds: {0}".format( preds))
        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)
        f1_value.append(f1_score_func(logits, b_labels))

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    f1_value = np.mean(f1_value) 
    return val_loss, val_accuracy, f1_value

def evaluate_for_kfold(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()
    preds = []
    # Tracking variables
    val_accuracy = []
    val_loss = []
    f1_value = []
    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        #b_labels = b_labels *1.0
        #b_labels = b_labels.unsqueeze(1)
        # print("logits: {} b_labels: {}".format(sig(logits), b_labels))
        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds+=(torch.argmax(logits, dim=1).flatten().tolist())
    
    print(preds)
    print(type(preds))
    return preds

In [15]:
set_seed(128)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=30, num_warmup_steps = num_warmup_steps)

NameError: ignored

In [None]:
bert_classifier.freeze(True)

In [None]:
train(bert_classifier, train_dataloader, val_dataloader, epochs=3, evaluation=True)

In [79]:
evaluate(bert_classifier, train_dataloader)

(0.366147374331254, 84.53191167574109, 0.8458032125421598)

In [80]:
evaluate(bert_classifier, val_dataloader)

(0.47239005392373995, 78.2218992248062, 0.7840113212449744)

In [16]:
input_test_dir = "/content/drive/My Drive/HLT/dataset_test_evalita/"

csv_test_tweets_file = open(input_test_dir+"haspeede2_reference_taskAB-tweets.tsv")

testset_tweets = pd.read_csv(csv_test_tweets_file,sep='\t', header=None)
testset_tweets.rename(columns={0: "id"}, inplace=True)
testset_tweets.rename(columns={1: "text"}, inplace=True)
testset_tweets.rename(columns={2: "hs"}, inplace=True)
testset_tweets.rename(columns={3: "stereotype"}, inplace=True)

csv_test_news_file = open(input_test_dir+"haspeede2_reference_taskAB-news.tsv")

testset_news = pd.read_csv(csv_test_news_file,sep='\t', header=None)
testset_news.rename(columns={0: "id"}, inplace=True)
testset_news.rename(columns={1: "text"}, inplace=True)
testset_news.rename(columns={2: "hs"}, inplace=True)
testset_news.rename(columns={3: "stereotype"}, inplace=True)


In [17]:
testset_tweets['text_1'] = testset_tweets['text'].apply(preproc)
testset_news['text_1'] = testset_news['text'].apply(preproc)

In [22]:
encoding_test_tweets = tokenizer(list(testset_tweets["text_1"].values), padding=True, truncation=True, max_length=64)
encoding_test_news = tokenizer(list(testset_news["text_1"].values), padding=True, truncation=True, max_length=64)

In [23]:
input_ids_test_tweets = torch.tensor(encoding_test_tweets['input_ids'])
attention_mask_test_tweets = torch.tensor(encoding_test_tweets['attention_mask'])
input_ids_test_news = torch.tensor(encoding_test_news['input_ids'])
attention_mask_test_news = torch.tensor(encoding_test_news['attention_mask'])

In [24]:
# Convert other data types to torch.Tensor
test_tweets_labels = torch.tensor(list(testset_tweets["hs"].values))
test_news_labels = torch.tensor(list(testset_news["hs"].values))

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
test_tweets_data = TensorDataset(input_ids_test_tweets, attention_mask_test_tweets, test_tweets_labels)
test_tweets_sampler = RandomSampler(test_tweets_data)
test_tweets_dataloader = DataLoader(test_tweets_data, sampler=test_tweets_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
test_news_data = TensorDataset(input_ids_test_news, attention_mask_test_news, test_news_labels)
test_news_sampler = RandomSampler(test_news_data)
test_news_dataloader = DataLoader(test_news_data, sampler=test_news_sampler, batch_size=batch_size)

In [102]:
evaluate(bert_classifier, test_tweets_dataloader)

(0.692391487210989, 74.88020833333333, 0.7478516687101389)

In [103]:
evaluate(bert_classifier, test_news_dataloader)

(1.1188670955598354, 70.8203125, 0.6650061870995638)

In [25]:
X_train_kfold_values = raw_dataset['text_1'].values

In [26]:
encoding_dataset_kfold = tokenizer(list(X_train_kfold_values), padding=True, truncation=True, max_length=64)

In [27]:
input_ids_dataset_kfold = torch.tensor(encoding_dataset_kfold['input_ids'])
attention_mask_dataset_kfold  = torch.tensor(encoding_dataset_kfold['attention_mask'])

In [28]:
# Convert other data types to torch.Tensor
dataset_kfold_labels = torch.tensor(raw_dataset['hs'].values)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

In [48]:
def train_test_model_with_kfold(hparams):
  number_of_splits = 5
  cv_kfold = StratifiedKFold(n_splits=number_of_splits, shuffle=True, random_state=100)
  models = []
  
  for train_index, validation_index in cv_kfold.split(input_ids_dataset_kfold, dataset_kfold_labels):
    set_seed(random_seed=True)
    num_train_steps = int(len(input_ids_dataset_kfold) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)+1
    num_warmup_steps = int(NUM_TRAIN_EPOCHS * WARMUP_PROPORTION)
    bert_classifier, optimizer, scheduler = initialize_model(epochs=30, num_warmup_steps= num_warmup_steps, len_data = len(input_ids_dataset_kfold))

    # Create the DataLoader for our training set
    train_data_kfold = TensorDataset(input_ids_dataset_kfold[train_index], attention_mask_dataset_kfold[train_index], dataset_kfold_labels[train_index])
    train_sampler_kfold = RandomSampler(train_data_kfold)
    train_dataloader_kfold = DataLoader(train_data_kfold, sampler=train_sampler_kfold, batch_size=batch_size)

    # Create the DataLoader for our validation set
    val_data_kfold = TensorDataset(input_ids_dataset_kfold[validation_index], attention_mask_dataset_kfold[validation_index], dataset_kfold_labels[validation_index])
    val_sampler_kfold = SequentialSampler(val_data_kfold)
    val_dataloader_kfold = DataLoader(val_data_kfold, sampler=val_sampler_kfold, batch_size=batch_size)

    train(bert_classifier, train_dataloader_kfold, optimizer, scheduler,val_dataloader_kfold, epochs=1, evaluation=True,)

    #bert_classifier.freeze(True)

    #train(bert_classifier, train_dataloader_kfold, val_dataloader_kfold, epochs=7, evaluation=True)

    models.append(bert_classifier)

  return models

In [30]:
def predict_with_ensemble(models, test_dataloader):
  # make predictions
  results = []
  y_predict = [np.squeeze(evaluate_for_kfold(model, test_dataloader)) for model in models]
  # sum across ensemble members
  y_predict = np.array(y_predict)

  for i in range(y_predict.shape[1]):
    counts = np.bincount(y_predict[:,i])
    results.append(np.argmax(counts))
  # argmax across classes
  return results

In [31]:
def run_with_kfold(hparams = None):

    models = train_test_model_with_kfold(hparams)
    y_test_pred_tweets = predict_with_ensemble(models, test_tweets_dataloader)
    y_test_pred_news = predict_with_ensemble(models, test_news_dataloader)

    print("f1_score test tweets: {}".format(f1_score(y_test_tweets, y_test_pred_tweets,average="macro")))
    print("f1_score test news: {}".format(f1_score(y_test_news, y_test_pred_news,average="macro")))
    return models

In [49]:
models = run_with_kfold()

Start training...

 Epoch  |  Batch  |  Train Loss  | F1 Train  |  Val Loss  |  Val Acc  |  F1 Val   |  Elapsed 
------------------------------------------------------------------------------------------
   1    |   20    |   0.615713   |   0.58    |     -      |     -     |     -     |   8.20   
   1    |   40    |   0.540363   |   0.70    |     -      |     -     |     -     |   7.89   
   1    |   60    |   0.459854   |   0.79    |     -      |     -     |     -     |   7.99   
   1    |   80    |   0.464728   |   0.77    |     -      |     -     |     -     |   8.12   
   1    |   100   |   0.485464   |   0.77    |     -      |     -     |     -     |   8.19   
   1    |   120   |   0.427275   |   0.78    |     -      |     -     |     -     |   8.24   
   1    |   140   |   0.447037   |   0.79    |     -      |     -     |     -     |   8.16   
   1    |   160   |   0.455519   |   0.78    |     -      |     -     |     -     |   8.09   
   1    |   170   |   0.493397   |   0.76   

RuntimeError: ignored

In [50]:
!nvidia-smi

Thu Mar 18 12:16:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    32W /  70W |  15098MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces