predict whether a sentence contains claim, question, experience, experience based on claims

In [1]:
! pip install bert-for-sequence-classification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-sequence-classification
  Downloading bert_for_sequence_classification-0.0.4-py3-none-any.whl (14 kB)
Collecting transformers>=4.2.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers, bert-for-

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import json

from transformers import AutoModel, AutoTokenizer

from bert_clf import BertCLF, train_evaluate, predict_metrics, prepare_data_notebook, prepare_dataset
from bert_clf.utils import set_global_seed

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')
# just use df = pd.read_csv('st1_data_train_sent.csv') when running in the repo
df = pd.read_csv('/content/drive/My Drive/UP_w22/PM/task 8/data/st1_data_train_sent.csv')
df

Unnamed: 0.1,Unnamed: 0,post_id,subreddit_id,Sentence,Label,Components
0,0,s1jpia,t5_2s23e,De-Nial\nI wrote this a few years ago and just...,none,{}
1,1,s1jpia,t5_2s23e,I thought I'd share...\n\n&#x200B;\n\nWhen I w...,none,{}
2,2,s1jpia,t5_2s23e,Like the opening line of the old Frank Sanatra...,none,{}
3,3,s1jpia,t5_2s23e,"I was getting ready to graduate high school, I...",none,{}
4,4,s1jpia,t5_2s23e,Growing up I was an only child with older pare...,none,{}
...,...,...,...,...,...,...
127354,127354,ri83g1,t5_2r876,Just how effective these drugs are and how do ...,question,Just how effective these drugs are and how do ...
127355,127355,ri83g1,t5_2r876,Edit2: Thank you.,none,{}
127356,127356,ri83g1,t5_2r876,It is wonderful that there are drugs that can ...,none,{}
127357,127357,ri83g1,t5_2r876,I am glad that Trikafta exists NOW.,none,{}


In [5]:
# use this for combined binary model
labels = ['claim_per_exp', 'claim','per_exp','question']

i = 0
for idx, row in df.iterrows(): 
  if df.loc[idx, 'Label'] in labels:
    df.loc[idx, 'Label'] = 'Arg'
  else:
    df.loc[idx, 'Label'] = 'O'

# use this for separated binary models 
# target = 'per_exp' #'claim_per_exp', 'claim','per_exp','question','none'
# labels = ['claim_per_exp', 'claim','per_exp','question','none']

# for lab in labels:
#   if target != lab:
#     df['Label'] = df['Label'].str.replace(lab,'O')
#   else:
#     df['Label'] = df['Label'].str.replace(lab,'Arg')

# df

Unnamed: 0.1,Unnamed: 0,post_id,subreddit_id,Sentence,Label,Components
0,0,s1jpia,t5_2s23e,De-Nial\nI wrote this a few years ago and just...,O,{}
1,1,s1jpia,t5_2s23e,I thought I'd share...\n\n&#x200B;\n\nWhen I w...,O,{}
2,2,s1jpia,t5_2s23e,Like the opening line of the old Frank Sanatra...,O,{}
3,3,s1jpia,t5_2s23e,"I was getting ready to graduate high school, I...",O,{}
4,4,s1jpia,t5_2s23e,Growing up I was an only child with older pare...,O,{}
...,...,...,...,...,...,...
127354,127354,ri83g1,t5_2r876,Just how effective these drugs are and how do ...,O,Just how effective these drugs are and how do ...
127355,127355,ri83g1,t5_2r876,Edit2: Thank you.,O,{}
127356,127356,ri83g1,t5_2r876,It is wonderful that there are drugs that can ...,O,{}
127357,127357,ri83g1,t5_2r876,I am glad that Trikafta exists NOW.,O,{}


In [6]:
df_arg = df[df["Label"]=='Arg']
print(len(df_arg))
df_o = df[df["Label"]=='O']
print(len(df_o))

10207
117152


In [7]:
# can ignore this part if imbalanced data is needed

df_arg = df[df["Label"]=='Arg']
df_o = df[df["Label"]=='O']

df_o_top = df_o[:df_arg.shape[0]]

df_balanced = pd.concat([df_arg, df_o_top])
df_balanced = df_balanced.reset_index(drop=True)

df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)
df_balanced

Unnamed: 0.1,Unnamed: 0,post_id,subreddit_id,Sentence,Label,Components
0,5264,r8odjm,t5_2syer,Ive been reading a lot about gout and the symp...,O,{}
1,10378,ri4gtr,t5_2qlaa,Taking L-theanine + Magnesium for panic as of ...,O,{}
2,43143,sdw3cv,t5_2saq9,im typing this with my arms up lightly above m...,Arg,. im typing this with my arms up lightly above...
3,10148,p6h1s6,t5_2r876,Please be diligent that you don't have molds g...,O,{}
4,29153,ppx0mb,t5_2syer,With both of my current gout issues the skin o...,Arg,With both of my current gout issues the skin o...
...,...,...,...,...,...,...
20409,4144,smi5pz,t5_2s3g1,This is my first post here.,O,{}
20410,864,p61l9a,t5_2syer,I've been drinking soymilk/oatmilk and rarely ...,O,{}
20411,77028,qcpiu7,t5_2rtve,I have a strange cycle of neuropathic symptoms...,Arg,I have a strange cycle of neuropathic symptoms...
20412,3799,s2opd4,t5_2s23e,I feel like I cant breathe while Im typing this.,O,{}


In [8]:
df_balanced["Label"].describe()

count     20414
unique        2
top           O
freq      10207
Name: Label, dtype: object

In [9]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_balanced, test_size=0.2)
df_value = df_balanced
print(df_train.shape, df_value.shape, df_test.shape)

(16331, 6) (20414, 6) (4083, 6)


In [11]:
df_test['Label'].value_counts()

O      2053
Arg    2030
Name: Label, dtype: int64

In [12]:
df_test['Label'].value_counts()
df_train['Label'].value_counts()

Arg    8177
O      8154
Name: Label, dtype: int64

In [13]:
df_test['Label'].value_counts()
df_train['Label'].value_counts()
df_value['Label'].value_counts()

O      10207
Arg    10207
Name: Label, dtype: int64

## Transformer Language Model

In [14]:
config = dict(
    transformer_model = dict(
        model = "chkla/roberta-argument",
        path_to_state_dict = False,
        device = 'cuda',
        dropout = 0.2,
        learning_rate = 2e-5,
        batch_size = 16,
        shuffle = True,
        maxlen = 128,
    ),
    data = dict(
        train_data_path = df_train,
        test_data_path = df_value,
        text_column = "Sentence",
        target_column = "Label",
        random_state = 52,
        test_size = 0.3,
        stratify=True
    ),
    training = dict (
    save_state_dict = False,
    early_stopping = True,
    delta = 0.001,
    patience = 7,
    num_epochs = 2,
    average_f1 = 'macro',
    other_metrics = ['micro', 'weighted'],
    output_dir = "../results/",
    class_weight = True
    )
)

In [15]:
set_global_seed(seed=config['data']['random_state'])
os.makedirs(config['training']['output_dir'], exist_ok=True)

In [16]:
device = torch.device(config['transformer_model']['device'])
tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=config['transformer_model']["model"]
    )
model_bert = AutoModel.from_pretrained(
    pretrained_model_name_or_path=config['transformer_model']["model"]
).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at chkla/roberta-argument were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at chkla/roberta-argument and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
id2label, train_texts, valid_texts, train_targets, valid_targets = prepare_data_notebook(
    config=config, train_df = df_train, test_df = df_value
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train.dropna(inplace=True)


In [18]:
id2label

{0: 'Arg', 1: 'O'}

In [19]:
model = BertCLF(
    pretrained_model=model_bert,
    tokenizer=tokenizer,
    id2label=id2label,
    dropout=config['transformer_model']['dropout'],
    device=device     
    )

In [20]:
model = model.to(device)

In [21]:

optimizer = optim.Adam(model.parameters(), lr=float(config['transformer_model']['learning_rate']))
criterion = nn.NLLLoss()

training_generator, valid_generator = prepare_dataset(
    tokenizer=tokenizer,
    train_texts=train_texts,
    train_targets=train_targets,
    valid_texts=valid_texts,
    valid_targets=valid_targets,
    config=config
)

In [22]:
model = train_evaluate(
    model=model,
    training_generator=training_generator,
    valid_generator=valid_generator,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=config['training']['num_epochs'],
    average=config['training']['average_f1'],
    config=config
)

==== Epoch 1 out of 2 ====


Training loop: 100%|██████████| 1021/1021 [01:39<00:00, 10.26it/s]
Evaluating loop: 100%|██████████| 1276/1276 [00:39<00:00, 32.18it/s]


Train F1: 0.6861194598987732
Eval F1: 0.7770953280728632

Train F1 micro: 0.703604977294987
Eval F1 micro: 0.7888840685176892

Train F1 weighted: 0.6989423772245509
Eval F1 weighted: 0.7886624214997021

==== Epoch 2 out of 2 ====


Training loop: 100%|██████████| 1021/1021 [01:36<00:00, 10.59it/s]
Evaluating loop: 100%|██████████| 1276/1276 [00:39<00:00, 32.44it/s]


Train F1: 0.7793255965047192
Eval F1: 0.8609923939663957

Train F1 micro: 0.7920265337013622
Eval F1 micro: 0.8691082624272279

Train F1 weighted: 0.7906802129156768
Eval F1 weighted: 0.8688967660114555




Computing final metrics...: 100%|██████████| 1276/1276 [00:34<00:00, 36.47it/s]


              precision    recall  f1-score   support

         Arg       0.88      0.85      0.87     10207
           O       0.86      0.88      0.87     10207

    accuracy                           0.87     20414
   macro avg       0.87      0.87      0.87     20414
weighted avg       0.87      0.87      0.87     20414



In [23]:
model.to('cpu')

BertCLF(
  (pretrained_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [24]:
preds = []
for i,j in zip(df_test['Sentence'], df_test['Label']):
    preds.append([model.predict(i), j, i])

In [25]:

pred = []
for i in preds:
    pred.append(i[0])

true = []
for m in preds:
    true.append(m[1])

In [26]:
from sklearn.metrics import classification_report

target_names = ['class 0', 'class 1']
print(classification_report(true, pred, target_names=target_names, digits=3))

              precision    recall  f1-score   support

     class 0      0.823     0.769     0.795      2030
     class 1      0.785     0.836     0.810      2053

    accuracy                          0.803      4083
   macro avg      0.804     0.803     0.803      4083
weighted avg      0.804     0.803     0.803      4083

