In [1]:
! pip install nltk
! pip install pandas
! pip install scikit-learn




In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from string import punctuation
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

[nltk_data] Downloading package stopwords to /Users/guna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/guna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/guna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/guna/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [452]:
train_df = pd.read_csv('Dataset/train.csv')

In [453]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [480]:
%%capture
def data_cleanup(train_df):
    train_df['text'] = train_df['text'].str.lower()
    train_df['text'] = train_df['text'].str.strip()
    train_df['text'] = train_df['text'].replace(to_replace ='http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='\?*', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='(RT|rt)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='@[a-z,_]*', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([0-9]*:[0-9]*)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([0-9]*\.[0-9]*)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='(utc|gmt)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='_[\S]', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='&amp;?', value = 'and', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='&lt;', value = '<', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='&gt;', value = '>', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='[ ]{2, }', value = ' ', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([^\w\d ]+)', value = '', regex = True)
    return train_df['text']

In [481]:
%%capture
train_df = pd.read_csv('Dataset/train.csv')
train_df['text'] = data_cleanup(train_df)

In [477]:
train_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this eahquake may ...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,the out of control wild fires in california ...,1
7610,10871,,,m 5km s of volcano hawaii,1
7611,10872,,,police investigating after an ebike collided w...,1


In [482]:
#Split training dataset
tweet_texts = train_df['text']
class_labels = train_df['target']
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [492]:
#Text Preprocessor
def preprocessing(text):
   word_lemma = []
   tweet_tokenize = TweetTokenizer()
   tokens = tweet_tokenize.tokenize((text).lower())
   tokens = [w for w in tokens if w not in punctuation and not w.isdigit() and not len(w) < 3]
   stop_words = stopwords.words ('english')
   tweet_without_stopwords = [t for t in tokens if t not in stop_words]
   text = " ".join (tweet_without_stopwords)
   word_lemma = [WordNetLemmatizer().lemmatize(t) for t in tweet_tokenize.tokenize(text)]
   pp_text = " ".join (word_lemma)
   return pp_text

In [8]:
def get_performance_score(self, actual_label : list, predicted_label : list):
    '''Function to calculate the performance metric using sklearn.
    
    Parameters
    ----------
    actual_label : list
      Actual(Ground Truth) class label from the dataset.
    predicted_label : pd.DataFrame
      Class label predicted by the model
    
    Return
    ------
    f1_score : float
    accuracy : float
    precision : float
    recall : float
    AUROC : float
    '''
    precision = metrics.precision_score(actual_label, predicted_label, pos_label=1)
    recall = metrics.recall_score(actual_label, predicted_label,pos_label=1)
    AUROC = metrics.roc_auc_score(actual_label, predicted_label)
    accuracy = metrics.accuracy_score(actual_label, predicted_label)
    f1_score = metrics.f1_score(actual_label, predicted_label,pos_label=1)
    metrics_list = [f1_score, accuracy, precision, recall, AUROC]
    metrics_list = pd.DataFrame(metrics_list).T
    metrics_df = metrics_list.rename(columns={0:'F1',1:'Accuracy',2:'Precision',3:'Recall',4:'AUROC'})
    return metrics_df

## Variable definitions
 - train_tweets - Preprocessed tweets for training
 - test_tweets - Preprocessed tweets for testing
 - train_labels - class label for training tweets
 - test_labels - class label for test tweets

## Baseline
1. Implement traditional model(MultinomialNB, LogisticRegression, SVC, KNeighborsClassifier) from sklearn
2. Train and test the default model without tuning hyperparameter values
3. Use grid search(GridSearchCV) from sklearn to identify best values for hyperparameters
4. Train the model with best hypermeter values and test it on test set(test_tweets)

## BERTweet

In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install evaluate
!pip install numpy
!pip install accelerate
!pip install emoji==0.6.0

In [204]:
from transformers import BertForSequenceClassification
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import BertweetTokenizer
from transformers import AlbertTokenizer, AlbertModel
from transformers import AutoModel
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer

In [10]:
#Split training dataset
tweet_texts = train_tweets
class_labels = train_labels
train_tweets, dev_tweets, train_labels, dev_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [None]:
train_cols = [pd.Series(train_tweets, name='text'), pd.Series(train_labels, name='labels')]
train_df = pd.concat(train_cols, axis = 1)
dev_cols = [pd.Series(dev_tweets, name='text'), pd.Series(dev_labels, name='labels')]
dev_df = pd.concat(dev_cols, axis = 1)
test_cols = [pd.Series(test_tweets, name='text'), pd.Series(test_labels, name='labels')]
test_df = pd.concat(test_cols,axis = 1)

In [11]:
model_name = "vinai/bertweet-base"

In [12]:
id2text = {0: "not_disaster", 1: "disaster"}
text2id = {"not_disaster": 0, "disaster": 1}

In [61]:
len(train_df)

4872

In [144]:
import torch
mps_device = torch.device("mps")

In [212]:
import torch
from torch.utils.data import TensorDataset
max_length = 32
trucate = True
padding='max_length'
BATCH_SIZE = 32
def tweet_tokenize(tweet_text):
    #  return tokenizer(input['text'], padding=padding, max_length=max_length, truncation=trucate)
    tokenizer = BertweetTokenizer.from_pretrained(model_name)
    IDs = tokenizer.encode_plus(
                            tweet_text, 
                            add_special_tokens = True, 
                            max_length = max_length, 
                            padding = padding,
                            truncation=trucate,
                            return_attention_mask = True,
                            return_tensors = 'pt'
                            )
    return IDs
def mapping(input_ids, attention_masks, token_type_ids, label):
    map_dict = {}
    map_dict['input_ids'] = input_ids
    map_dict['token_type_ids'] = token_type_ids
    map_dict['attention_mask'] = attention_masks
    return map_dict, label 

def build_ds(input_df):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
        
    # for text, label in input_df:
    for index, row in input_df.iterrows():
        text = row['text']
        label = row['labels']
        classifier_input = tweet_tokenize(text)
        input_ids_list.append(classifier_input['input_ids'])
        token_type_ids_list.append(classifier_input['token_type_ids'])
        attention_mask_list.append(classifier_input['attention_mask'])
        label_list.append([label])
    input_ids = torch.cat(input_ids_list, dim=0)
    attention_masks = torch.cat(attention_mask_list, dim=0)
    token_type_ids = torch.cat(token_type_ids_list, dim=0)
    labels = torch.tensor(label_list)
    mapped_dataset = TensorDataset(input_ids, attention_masks,token_type_ids, labels)
    return mapped_dataset

train_dataset_mapped = build_ds(train_df.head(n=1000))

dev_dataset_mapped = build_ds(dev_df.head(n=1000))

In [None]:
!pip install torch torchvision torchaudio

In [210]:
# classifier = AutoModelForSequenceClassification.from_pretrained(
#     model_name, num_labels=2, id2label=id2label, label2id=label2id
# )

classifier = BertForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2text, label2id=text2id
)

classifier = classifier.to(mps_device)

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.la

In [194]:
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup
optimizer = AdamW(classifier.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [195]:
classifier.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05,

In [99]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)

padding = True
max_length = 128
trucate = True

tokenizer = BertweetTokenizer.from_pretrained(model_name)

def preprocessor(input):
     token_dict = tokenizer.encode_plus(input['text'], padding='max_length', max_length=max_length, truncation=trucate,return_attention_mask = True)
     return token_dict

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
!pip install tensorflow

In [100]:
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
train_map = train_dataset.map(preprocessor)
dev_map = eval_dataset.map(preprocessor)
test_map = test_dataset.map(preprocessor)

Map: 100%|██████████| 4872/4872 [00:01<00:00, 4635.84 examples/s]
Map: 100%|██████████| 1218/1218 [00:00<00:00, 4155.05 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 4428.56 examples/s]


In [76]:
train_dataset

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 4872
})

In [17]:
f1 = evaluate.load("f1")
def calculate_f1(labels):
    predicted, actual = labels
    predicted = np.argmax(predicted, axis=1)
    return f1.compute(predictions=predicted, references=actual)

In [196]:
training_args = TrainingArguments(
    output_dir="trainer_cache",
    overwrite_output_dir=True,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy="no",
    save_strategy="no",
    load_best_model_at_end=True,
    greater_is_better=True,
    num_train_epochs=1,
    learning_rate = 1e-5,
    adam_epsilon = 1e-5,
    weight_decay = 1e-5,
    adafactor = False,
    use_mps_device=True

)

trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_dataset_mapped,
    eval_dataset=dev_dataset_mapped,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_f1,
)



In [176]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split

In [213]:
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset_mapped,  # The training samples.
            sampler = RandomSampler(train_dataset_mapped), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            dev_dataset_mapped, # The validation samples.
            sampler = SequentialSampler(dev_dataset_mapped), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [214]:
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [215]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [209]:
import time
import datetime
import gc
import random

In [181]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [216]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    classifier.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(mps_device)
        b_input_mask = batch[1].to(mps_device)
        b_token_type_ids = batch[2].to(mps_device)
        b_labels = batch[3].to(mps_device)
        optimizer.zero_grad()
        output = classifier(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask,
                             labels=b_labels)        
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(classifier.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    classifier.eval()
    # Tracking variables 
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(mps_device)
        b_input_mask = batch[1].to(mps_device)
        b_token_type_ids = batch[2].to(mps_device)
        b_labels = batch[3].to(mps_device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        
            output= classifier(b_input_ids, 
                                   token_type_ids=b_token_type_ids, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(classifier, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.72
  Training epcoh took: 0:00:12

Running Validation...
  Accuracy: 0.42

Training...

  Average training loss: 0.73
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.42

Training...

  Average training loss: 0.73
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.42

Training...

  Average training loss: 0.73
  Training epcoh took: 0:00:09

Running Validation...
  Accuracy: 0.42

Training complete!
Total training took 0:00:51 (h:mm:ss)


In [27]:
from sklearn import metrics
predictions = None
predictions = []
actual_label = test_df['labels']
predictions_prob = trainer.predict(test_map)
predictions =  predictions_prob.predictions
# print(predictions)
# predictions = (predictions_prob > 0.5).int()
predictions = np.argmax(predictions,axis=1)
print(predictions)
predictions_lables = np.array(predictions, dtype = int) #predict categories for the test data using the above trained classifier
actual_label = np.array(actual_label, dtype = int)
bert_F1 = round(metrics.f1_score(actual_label, predictions_lables),3) #calculates F1 metric for naivebayes classifier
bert_Acc = round(metrics.accuracy_score(actual_label, predictions_lables),3)
bert_AUROC = round(metrics.roc_auc_score(actual_label, predictions_lables),3)
bert_Precision = round(metrics.precision_score(actual_label, predictions_lables),3)
bert_Recall = round(metrics.recall_score(actual_label, predictions_lables),3)
cm = metrics.confusion_matrix(actual_label, predictions_lables) #confusion matrix
print("\n")
print("confusion matric for learning rate: " + str(3e-5))
print("\n",cm)
print("\n")
print("Labelwise performance metrics for learning rate: " +str(3e-5))
print("\n",metrics.classification_report(actual_label, predictions_lables, target_names=["Student","LLM"]))
outfile = pd.DataFrame()
outfile.insert(0, "Tweet text", test_dataset)
outfile.insert(1, "Actual Label", actual_label)
outfile.insert(2, "Predicted Label", predictions_lables)
filename = "Task_A"
outfile.to_csv("Predicted_output_"+filename+"_tabel_output_class.csv", index=False, header=False)
bert_F1, bert_AUROC,bert_Acc, bert_Precision,bert_Recall

100%|██████████| 305/305 [00:16<00:00, 18.50it/s]


[1 1 1 ... 1 0 0]


confusion matric for learning rate: 3e-05

 [[710 159]
 [221 433]]


Labelwise performance metrics for learning rate: 3e-05

               precision    recall  f1-score   support

     Student       0.76      0.82      0.79       869
         LLM       0.73      0.66      0.70       654

    accuracy                           0.75      1523
   macro avg       0.75      0.74      0.74      1523
weighted avg       0.75      0.75      0.75      1523



(0.695, 0.74, 0.75, 0.731, 0.662)

In [18]:
classifier.train()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(130, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [23]:
import torch
train_label = dict(train_df['labels'].value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor([label_max/train_label[i] for i in range(len(train_label))])
criterion = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')