In [1]:
import os 

os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Load kialo data from scratch 
#### (scroll down if want to use already processed kialo data)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

In [4]:
kialo_ds_path = '/mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation/multi-taks-counter-argument-generation/kialo_data/'

In [6]:
train_kialo_df = pd.read_pickle(kialo_ds_path + '/kialo_train_df.pkl')
valid_kialo_df = pd.read_pickle(kialo_ds_path + '/kialo_valid_df.pkl')
test_kialo_df = pd.read_pickle(kialo_ds_path + '/kialo_test_df.pkl')

In [5]:
def create_df(df):
    
    df = df.groupby('conclusion_text').agg({
        'premises': lambda x: list(x)[0],
        'counter' : lambda x: list(x)
    }).reset_index()
    
    output_data = []

    for idx, row in df.iterrows():
        for premise in row['premises']:
            num_tokens = len(premise.split())
            if  num_tokens <= 200 and num_tokens > 3:
                output_data.append((row['conclusion_text'], premise, 0))

        for counter in row['counter']:
            num_tokens = len(counter.split())
            if  num_tokens <= 200 and num_tokens > 3:
                output_data.append((row['conclusion_text'], counter, 1))

    output_df = pd.DataFrame(output_data, columns=['claim1', 'claim2', 'label'])
    
    #Balancing the dataframe
    g = output_df.groupby('label')
    output_df = pd.DataFrame(g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True)))
    
    return output_df

In [8]:
train_df = create_df(train_kialo_df)
valid_df = create_df(valid_kialo_df)
test_df  = create_df(test_kialo_df)

In [9]:
train_df.label.value_counts()

1    47832
0    47832
Name: label, dtype: int64

In [10]:
valid_df.label.value_counts()

1    3858
0    3858
Name: label, dtype: int64

In [11]:
test_df.label.value_counts()

1    11227
0    11227
Name: label, dtype: int64

In [12]:
train_df.to_csv('../data/kialo_stance_classification_training_data.csv', index=False)
test_df.to_csv('../data/kialo_stance_classification_test_data.csv', index=False)
valid_df.to_csv('../data/kialo_stance_classification_valid_data.csv', index=False)

## Load already processed kialo data for tokenization and training for model

In [6]:
from datasets import Dataset
from transformers import TrainingArguments, RobertaTokenizer, RobertaForSequenceClassification, TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AutoTokenizer, TextClassificationPipeline

import torch
from transformers import Trainer

In [35]:
import pandas as pd
train_df = pd.read_csv('../data/kialo_stance_classification_training_data.csv')
test_df  = pd.read_csv('../data/kialo_stance_classification_test_data.csv')
valid_df = pd.read_csv('../data/kialo_stance_classification_valid_data.csv')

### convert df into dataset

In [36]:
train_df['input_txt'] = train_df.apply(lambda x: x['claim1'] + ' </s> ' + x['claim2'], axis=1)
valid_df['input_txt'] = valid_df.apply(lambda x: x['claim1'] + ' </s> ' + x['claim2'], axis=1)


counter label is 1
support label is 0

In [37]:
train_df.sample(10).head(n=5)

Unnamed: 0,claim1,claim2,label,input_txt
87038,There is no grounds to assume evil would not be present in the best of all worlds. Best is relative.,"Most believers in an omniscient God also believe that God mandates objective moral standards of good and bad. So claiming that ""best"" is relative is not a viable defense for most believers in God.",1,"There is no grounds to assume evil would not be present in the best of all worlds. Best is relative. </s> Most believers in an omniscient God also believe that God mandates objective moral standards of good and bad. So claiming that ""best"" is relative is not a viable defense for most believers in God."
45013,The creation of a strengthened monopoly of power could harm those who have differing or opposing views to the central government. This is underlined by today's EU behavior.,The President of the European Commission Jean-Claude Juncker told European citizens that Britain will be treated as “deserters following a vote to leave the European Union. This is intimidation.,0,The creation of a strengthened monopoly of power could harm those who have differing or opposing views to the central government. This is underlined by today's EU behavior. </s> The President of the European Commission Jean-Claude Juncker told European citizens that Britain will be treated as “deserters following a vote to leave the European Union. This is intimidation.
38495,Difficulties with the formation of a religious identity may cause children of interfaith marriages to be pushed away from religion in general.,"A 2006 survey showed that 37% of those raised by parents of different religions reported weekly attendance at religious services, compared with 42% of those raised by parents with the same faith.",0,"Difficulties with the formation of a religious identity may cause children of interfaith marriages to be pushed away from religion in general. </s> A 2006 survey showed that 37% of those raised by parents of different religions reported weekly attendance at religious services, compared with 42% of those raised by parents with the same faith."
91872,Gambling causes problems for the individual.,The casino owners are also individuals whom are participating in gambling during their daily operations. These people benefit from gambling.,1,Gambling causes problems for the individual. </s> The casino owners are also individuals whom are participating in gambling during their daily operations. These people benefit from gambling.
92051,Religion is good for the psycho-social wellness of its followers.,"Religion usually requires prayer, and this is the part that has shown psychological usefulness. In reducing prayer to a psychological technique of objective visualisation, you remove the need for god. This kind of prayer can work without the use of god, for example the placebo effect, or sports visualisation techniques. So these ideas show that we don't need to pray to God in order for the the techniques to work.",1,"Religion is good for the psycho-social wellness of its followers. </s> Religion usually requires prayer, and this is the part that has shown psychological usefulness. In reducing prayer to a psychological technique of objective visualisation, you remove the need for god. This kind of prayer can work without the use of god, for example the placebo effect, or sports visualisation techniques. So these ideas show that we don't need to pray to God in order for the the techniques to work."


In [9]:
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df.sample(1000))

## Apply Roberta model

In [7]:
import numpy as np
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
metric = load_metric("accuracy")

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = metric.compute(predictions=preds, references=labels)
    return {
        'accuracy': acc['accuracy'],
        'f1': f1,
    }

In [11]:
# tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# model = RobertaForSequenceClassification.from_pretrained('roberta-large').cuda()
model = AutoModelForSequenceClassification.from_pretrained('roberta-base',num_labels=2).cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [12]:
tokenized_train = train_dataset.map(lambda a: tokenizer(a['input_txt'], padding='max_length', max_length=256, truncation=True),batched=True)
tokenized_valid = valid_dataset.map(lambda a: tokenizer(a['input_txt'], padding='max_length', max_length=256, truncation=True),batched=True)

  0%|          | 0/96 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [22]:
# training_args = TrainingArguments('../data/output/stance_classification', 
#                                   evaluation_strategy="epoch", 
#                                   eval_steps=1000,
#                                   save_steps=4000,
#                                   learning_rate=2e-5,
#                                   weight_decay=0.01,
#                                   save_total_limit=5,
#                                   num_train_epochs=10 , 
#                                   per_device_train_batch_size=8)


args = TrainingArguments(
    # output_dir: directory where the model checkpoints will be saved.
    output_dir='../data/output/stance_classification',
    # evaluation_strategy (default "no"):
    # Possible values are:
    # "no": No evaluation is done during training.
    # "steps": Evaluation is done (and logged) every eval_steps.
    # "epoch": Evaluation is done at the end of each epoch.
    evaluation_strategy="steps",
    # eval_steps: Number of update steps between two evaluations if
    # evaluation_strategy="steps". Will default to the same value as
    # logging_steps if not set.
    eval_steps=200,
    # logging_strategy (default: "steps"): The logging strategy to adopt during
    # training (used to log training loss for example). Possible values are:
    # "no": No logging is done during training.
    # "epoch": Logging is done at the end of each epoch.
    # "steps": Logging is done every logging_steps.
    logging_strategy="steps",
    # logging_steps (default 500): Number of update steps between two logs if
    # logging_strategy="steps".
    logging_steps=200,
    # save_strategy (default "steps"):
    # The checkpoint save strategy to adopt during training. Possible values are:
    # "no": No save is done during training.
    # "epoch": Save is done at the end of each epoch.
    # "steps": Save is done every save_steps (default 500).
    save_strategy="steps",
    # save_steps (default: 500): Number of updates steps before two checkpoint
    # saves if save_strategy="steps".
    save_steps=600,
    # learning_rate (default 5e-5): The initial learning rate for AdamW optimizer.
    # Adam algorithm with weight decay fix as introduced in the paper
    # Decoupled Weight Decay Regularization.
    learning_rate=2e-5,
    # per_device_train_batch_size: The batch size per GPU/TPU core/CPU for training.
    per_device_train_batch_size=64,
    # per_device_eval_batch_size: The batch size per GPU/TPU core/CPU for evaluation.
    per_device_eval_batch_size=64,
    # num_train_epochs (default 3.0): Total number of training epochs to perform
    # (if not an integer, will perform the decimal part percents of the last epoch
    # before stopping training).
    num_train_epochs=3,
    # load_best_model_at_end (default False): Whether or not to load the best model
    # found during training at the end of training.
    load_best_model_at_end=True,
    # metric_for_best_model:
    # Use in conjunction with load_best_model_at_end to specify the metric to use
    # to compare two different models. Must be the name of a metric returned by
    # the evaluation with or without the prefix "eval_".
    metric_for_best_model="f1",
    # report_to:
    # The list of integrations to report the results and logs to. Supported
    # platforms are "azure_ml", "comet_ml", "mlflow", "tensorboard" and "wandb".
    # Use "all" to report to all integrations installed, "none" for no integrations.
#     report_to="tensorboard"
)


# trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_train, eval_dataset=tokenized_valid, compute_metrics=compute_metrics)



trainer = Trainer(
    # Function that returns the model to train. It's useful to use a function
    # instead of directly the model to make sure that we are always training
    # an untrained model from scratch.
    model=model,
    # The training arguments.
    args=args,
    # The training dataset.
    train_dataset=tokenized_train,
    # The evaluation dataset. We use a small subset of the validation set
    # composed of 150 samples to speed up computations...
    eval_dataset=tokenized_valid.shuffle(42),#.select(range(150)),
    # Even though the training set and evaluation set are already tokenized, the
    # tokenizer is needed to pad the "input_ids" and "attention_mask" tensors
    # to the length managed by the model. It does so one batch at a time, to
    # use less memory as possible.
    tokenizer=tokenizer,
    # Function that will be called at the end of each evaluation phase on the whole
    # arrays of predictions/labels to produce metrics.
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: claim2, claim1, input_txt. If claim2, claim1, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 95664
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 4485


Step,Training Loss,Validation Loss,Accuracy,F1
200,0.1871,0.744419,0.78,0.779116


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: claim2, claim1, input_txt, __index_level_0__. If claim2, claim1, input_txt, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


In [None]:
trainer.save_model('../data/output/stance_classification/best_model')

In [25]:
model = AutoModelForSequenceClassification.from_pretrained('../data/output/stance_classification/best_model').cuda()

loading configuration file ../data/output/stance_classification/best_model/config.json
Model config RobertaConfig {
  "_name_or_path": "../data/output/stance_classification/best_model",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file ../data/output/stance_classification/best_model/pytorch_model.bin
All model checkpoint weights

In [26]:
test_df['input_txt'] = test_df.apply(lambda x: x['claim1'] + ' </s> ' + x['claim2'], axis=1)
test_dataset = Dataset.from_pandas(test_df)
tokenized_test = test_dataset.map(lambda a: tokenizer(a['input_txt'], padding='max_length', max_length=256, truncation=True),batched=True)
pred=trainer.predict(tokenized_test)

  0%|          | 0/23 [00:00<?, ?ba/s]

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: claim2, claim1, input_txt. If claim2, claim1, input_txt are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 22454
  Batch size = 64


In [27]:
scores = np.argmax(pred.predictions, axis=1)

In [29]:
precision_recall_fscore_support(tokenized_test['label'], scores, average='binary')

(0.8248618784530387, 0.7978979246459428, 0.8111558835514103, None)

### Testing on Reddit:

In [8]:
#Loading the model
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('../data/output/stance_classification/best_model').cuda()
arg_stance_pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, framework='pt', task='ArgQ', device=0)

In [9]:
#Loading the data
data_fold = '../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/'
train_df = pd.read_pickle(data_fold+'/reddit_data/conclusion_and_ca_generation/preprocessed_train_conclusion_all.pkl')

In [16]:
#create testing sample
sample_df = train_df.sample(1000)

In [17]:
opposing_claims = [(x, 1) for x in sample_df.apply(lambda x: x['title'] + ' </s> ' + x['counter'], axis=1).tolist()]
supporting_claims = [(x, 0) for x in sample_df.apply(lambda x: x['title'] + ' </s> ' + ' '.join(x['post']), axis=1).tolist()]

In [18]:
all_claims = opposing_claims + supporting_claims
all_claims_texts, all_claims_stances = zip(*all_claims)

In [19]:
all_claims_pred_stances = arg_stance_pipeline(list(all_claims_texts), truncation=True)
all_claims_pred_stances = [int(x['label'].split('_')[-1]) for x in all_claims_pred_stances]

In [20]:
p, r, f, _ = precision_recall_fscore_support(all_claims_stances, all_claims_pred_stances, labels=[0,1], average='binary')

In [21]:
print('Prec: {}'.format(p))
print('Rec: {}'.format(r))
print('F1: {}'.format(f))

Prec: 0.6292388847023361
Rec: 0.835
F1: 0.7176622260421144
