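Predict whether a sentence contains a claim, a question, a personal experience, or an experience based on claims. In this notebook all four label types are merged into a single `Arg` class; every other sentence is labelled `O`.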

In [None]:
! pip install bert-for-sequence-classification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-sequence-classification
  Downloading bert_for_sequence_classification-0.0.4-py3-none-any.whl (14 kB)
Collecting transformers>=4.2.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 13.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 58.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, bert-for-sequence-classification
Successfully installed bert-for-sequence-classification-0.0.4 huggingface-hub-0.10.1 tokenizers-0.13.2 transformers-4.24.0


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import json

from transformers import AutoModel, AutoTokenizer

from bert_clf import BertCLF, train_evaluate, predict_metrics, prepare_data_notebook, prepare_dataset
from bert_clf.utils import set_global_seed

In [None]:
# Mount Google Drive (Colab only) and load the sentence-level training data.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
TRAIN_CSV_PATH = '/content/drive/My Drive/UP_w22/PM/task 8/data/st1_data_train_sent.csv'
# Alternative file name noted by the original author: str1_sample_train_sent.csv

drive.mount(DRIVE_MOUNT_POINT)

# One row per sentence; the cells below use the 'Sentence' and 'Label' columns.
df = pd.read_csv(TRAIN_CSV_PATH)
df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,post_id,subreddit_id,Sentence,Label,Components
0,0,s1jpia,t5_2s23e,De-Nial\nI wrote this a few years ago and just...,none,{}
1,1,s1jpia,t5_2s23e,I thought I'd share...\n\n&#x200B;\n\nWhen I w...,none,{}
2,2,s1jpia,t5_2s23e,Like the opening line of the old Frank Sanatra...,none,{}
3,3,s1jpia,t5_2s23e,"I was getting ready to graduate high school, I...",none,{}
4,4,s1jpia,t5_2s23e,Growing up I was an only child with older pare...,none,{}
...,...,...,...,...,...,...
127354,127354,ri83g1,t5_2r876,Just how effective these drugs are and how do ...,question,Just how effective these drugs are and how do ...
127355,127355,ri83g1,t5_2r876,Edit2: Thank you.,none,{}
127356,127356,ri83g1,t5_2r876,It is wonderful that there are drugs that can ...,none,{}
127357,127357,ri83g1,t5_2r876,I am glad that Trikafta exists NOW.,none,{}


In [None]:

# Collapse the fine-grained annotation labels into a binary scheme:
# every argumentative label becomes 'Arg', 'none' becomes 'O'.
#
# A single exact-match mapping is used instead of chained substring
# replacements, so the result does not depend on the order of the rules
# (e.g. 'claim' being a substring of 'claim_per_exp') or on the pandas
# default for `str.replace(regex=...)`. Values that are not in the mapping
# are left untouched, which also keeps this cell safe to re-run.
LABEL_MAP = {
    'claim_per_exp': 'Arg',
    'claim': 'Arg',
    'per_exp': 'Arg',
    'question': 'Arg',
    'none': 'O',
}

df['Label'] = df['Label'].replace(LABEL_MAP)

# Fail loudly if the CSV contains a label the mapping does not cover.
unexpected_labels = set(df['Label'].unique()) - {'Arg', 'O'}
assert not unexpected_labels, f"unmapped labels: {unexpected_labels}"

df

Unnamed: 0.1,Unnamed: 0,post_id,subreddit_id,Sentence,Label,Components
0,0,s1jpia,t5_2s23e,De-Nial\nI wrote this a few years ago and just...,O,{}
1,1,s1jpia,t5_2s23e,I thought I'd share...\n\n&#x200B;\n\nWhen I w...,O,{}
2,2,s1jpia,t5_2s23e,Like the opening line of the old Frank Sanatra...,O,{}
3,3,s1jpia,t5_2s23e,"I was getting ready to graduate high school, I...",O,{}
4,4,s1jpia,t5_2s23e,Growing up I was an only child with older pare...,O,{}
...,...,...,...,...,...,...
127354,127354,ri83g1,t5_2r876,Just how effective these drugs are and how do ...,Arg,Just how effective these drugs are and how do ...
127355,127355,ri83g1,t5_2r876,Edit2: Thank you.,O,{}
127356,127356,ri83g1,t5_2r876,It is wonderful that there are drugs that can ...,O,{}
127357,127357,ri83g1,t5_2r876,I am glad that Trikafta exists NOW.,O,{}


In [None]:
# Row-order 80/20 split of the (Sentence, Label) pairs.
#
# The split point is derived from the actual frame length instead of a
# hard-coded row count, and the frames are built by slicing rather than by
# appending one row at a time (`DataFrame.append` in a loop is quadratic and
# was removed in pandas 2.0).
TRAIN_FRACTION = 0.8
split = int(len(df) * TRAIN_FRACTION)

sentence_labels = df[['Sentence', 'Label']]

# First `split` rows -> training set; remaining rows -> held-out test set.
df_train = sentence_labels.iloc[:split].reset_index(drop=True)
df_test = sentence_labels.iloc[split:].reset_index(drop=True)

# df_value holds EVERY row (train + test), exactly as in the original cell.
# NOTE(review): it is later passed as the validation/test frame
# (`test_df = df_value`), so the evaluation metrics reported during training
# are computed mostly on sentences the model was trained on. Consider a
# dedicated validation slice instead.
df_value = sentence_labels.reset_index(drop=True)

assert len(df_train) + len(df_test) == len(df_value)

print(df_train.shape, df_value.shape, df_test.shape)

(101887, 2) (127359, 2) (25472, 2)


In [None]:
# Row index at which the data is cut: rows with index < split go to df_train,
# the rest to df_test.
split

101887

In [None]:
# Class balance of the held-out test split (rows with index >= split).
df_test['Label'].value_counts()

O      21594
Arg     3878
Name: Label, dtype: int64

In [None]:
# Class balance of the training split (rows with index < split).
df_train['Label'].value_counts()

O      86087
Arg    15800
Name: Label, dtype: int64

In [None]:
# Class balance of df_value, which contains every row of the dataset
# (training rows and test rows together).
df_value['Label'].value_counts()

O      107681
Arg     19678
Name: Label, dtype: int64

## Transformer Language Model

In [None]:
# Single configuration dict shared by the bert_clf helpers
# (prepare_data_notebook, prepare_dataset, train_evaluate) and by the
# model/optimizer set-up cells below.
config = dict(
    transformer_model = dict(
        # Hugging Face checkpoint loaded with AutoTokenizer / AutoModel.
        model = "chkla/roberta-argument",
        path_to_state_dict = False,
        # Use the GPU when one is available, otherwise fall back to the CPU
        # so the notebook still runs on a CPU-only runtime.
        device = 'cuda' if torch.cuda.is_available() else 'cpu',
        dropout = 0.2,          # passed to BertCLF
        learning_rate = 2e-5,   # passed to the Adam optimizer
        # batch_size / shuffle / maxlen are presumably read by
        # prepare_dataset -- TODO confirm against the bert_clf sources.
        batch_size = 16,
        shuffle = True,
        maxlen = 128,
    ),
    data = dict(
        # Despite the "*_path" key names, these hold in-memory DataFrames.
        train_data_path = df_train,
        test_data_path = df_value,
        text_column = "Sentence",
        target_column = "Label",
        random_state = 52,      # also used as the global seed
        test_size = 0.3,
        stratify=True
    ),
    training = dict(
        save_state_dict = False, # if False the model will be saved using torch.save()
            # and should be loaded like this: model = torch.load()
            # you will have to install the library to do so
        early_stopping = True,
        delta = 0.001,
        # NOTE(review): if patience is counted in epochs, early stopping can
        # never trigger with num_epochs = 2 -- confirm the intended values.
        patience = 7,
        num_epochs = 2,
        average_f1 = 'macro',
        other_metrics = ['micro', 'weighted'],
        output_dir = "../results/",
        class_weight = True
    )
)

In [None]:
# Seed the run with the value from the data config so it is repeatable.
global_seed = config['data']['random_state']
set_global_seed(seed=global_seed)

# Make sure the results directory exists before training starts.
results_dir = config['training']['output_dir']
os.makedirs(results_dir, exist_ok=True)

In [None]:
# Resolve the target device and load the tokenizer plus the bare encoder
# (AutoModel, i.e. without a classification head) for the configured checkpoint.
transformer_cfg = config['transformer_model']
checkpoint_name = transformer_cfg["model"]

device = torch.device(transformer_cfg['device'])

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint_name)

model_bert = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint_name)
model_bert = model_bert.to(device)

# Optional (disabled): freeze the encoder so only the classifier head trains.
#for param in model_bert.parameters():
    #param.requires_grad = False

Some weights of the model checkpoint at chkla/roberta-argument were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at chkla/roberta-argument and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the label mapping and the train/validation text and target lists
# from the two DataFrames.
# NOTE(review): df_value is the full dataset (it includes every row of
# df_train), so the "validation" data overlaps the training data -- confirm
# this is intended.
id2label, train_texts, valid_texts, train_targets, valid_targets = prepare_data_notebook(
    config=config, train_df = df_train, test_df = df_value
)

In [None]:
# Inspect the class-id -> label-name mapping returned by prepare_data_notebook.
id2label

{0: 'O', 1: 'Arg'}

In [None]:
# Wrap the pretrained encoder in the bert_clf classifier, using the dropout
# rate from the config and the label mapping built from the data.
classifier_dropout = config['transformer_model']['dropout']

model = BertCLF(
    pretrained_model=model_bert,
    tokenizer=tokenizer,
    id2label=id2label,
    dropout=classifier_dropout,
    device=device,
)

In [None]:
# Move the whole classifier (encoder + head) to the configured device.
model = model.to(device)

In [None]:
# Adam over all classifier parameters, with the learning rate from the config.
learning_rate = float(config['transformer_model']['learning_rate'])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Negative log-likelihood loss.
# NOTE(review): NLLLoss expects log-probabilities -- presumably BertCLF
# outputs log-softmax scores; verify. It is also created without per-class
# weights although config['training']['class_weight'] is True -- confirm
# whether train_evaluate applies the weighting itself.
criterion = nn.NLLLoss()

# Batched data generators for the training and validation texts.
training_generator, valid_generator = prepare_dataset(
    tokenizer=tokenizer,
    train_texts=train_texts,
    train_targets=train_targets,
    valid_texts=valid_texts,
    valid_targets=valid_targets,
    config=config,
)

In [None]:
# Run the training / evaluation loop; the epoch count and the F1 averaging
# mode come from the training section of the config.
training_cfg = config['training']

model = train_evaluate(
    model=model,
    training_generator=training_generator,
    valid_generator=valid_generator,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=training_cfg['num_epochs'],
    average=training_cfg['average_f1'],
    config=config,
)

==== Epoch 1 out of 2 ====


Training loop: 100%|██████████| 6368/6368 [35:57<00:00,  2.95it/s]
Evaluating loop: 100%|██████████| 7960/7960 [13:46<00:00,  9.63it/s]


Train F1: 0.48963719872176925
Eval F1: 0.49204893958682233

Train F1 micro: 0.8447890494137353
Eval F1 micro: 0.8458212939698493

Train F1 weighted: 0.7769388191822826
Eval F1 weighted: 0.7783980411929774

==== Epoch 2 out of 2 ====


Training loop: 100%|██████████| 6368/6368 [36:03<00:00,  2.94it/s]
Evaluating loop: 100%|██████████| 7960/7960 [13:42<00:00,  9.68it/s]


Train F1: 0.4914240807642932
Eval F1: 0.4905305359408732

Train F1 micro: 0.8447491363065327
Eval F1 micro: 0.8454920435510888

Train F1 weighted: 0.777291114547064
Eval F1 weighted: 0.7773853267782201




Computing final metrics...: 100%|██████████| 7960/7960 [13:31<00:00,  9.80it/s]


              precision    recall  f1-score   support

         Arg       0.00      0.00      0.00     19678
           O       0.85      1.00      0.92    107681

    accuracy                           0.85    127359
   macro avg       0.42      0.50      0.46    127359
weighted avg       0.71      0.85      0.77    127359



In [None]:
# Move the trained classifier to the CPU before sentence-by-sentence prediction.
# The trailing ';' suppresses the cell output, which would otherwise be the
# full module repr (hundreds of lines).
# NOTE(review): BertCLF was constructed with device=device -- confirm that
# predict() does not still send its inputs to the GPU.
model.to('cpu');

BertCLF(
  (pretrained_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [None]:
# Predict every held-out sentence one at a time, keeping
# [predicted label, gold label, sentence] together for later inspection.
preds = [
    [model.predict(sentence), gold_label, sentence]
    for sentence, gold_label in zip(df_test['Sentence'], df_test['Label'])
]

In [None]:
# Unpack the prediction records into parallel lists for the metrics call:
# element 0 of each record is the predicted label, element 1 the gold label.
pred = [record[0] for record in preds]
true = [record[1] for record in preds]

In [None]:
from sklearn.metrics import classification_report
# roberta-argument on sentence level, held-out test split.
# NOTE(review): the original note here said "USElecDeb corpus, task 1", while
# the data loaded above is st1_data_train_sent.csv -- confirm which is correct.

# Report per-class metrics under the real label names. The previous
# target_names=['class 0', 'class 1'] were applied to the alphabetically
# sorted labels ('Arg', 'O'), so "class 0" was actually 'Arg' -- the opposite
# of id2label ({0: 'O', 1: 'Arg'}).
# zero_division=0 keeps the same numbers as the default but silences the
# repeated warning for a class that is never predicted.
print(classification_report(true, pred, digits=3, zero_division=0))

              precision    recall  f1-score   support

     class 0      0.000     0.000     0.000      3878
     class 1      0.848     1.000     0.918     21594

    accuracy                          0.848     25472
   macro avg      0.424     0.500     0.459     25472
weighted avg      0.719     0.848     0.778     25472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Label counts over the full dataset (df_value), for reference:
#   O      107681
#   Arg     19678