In [6]:
import json
import pandas as pd
import numpy as np

# Do not truncate long string cells when DataFrames are rendered.
pd.options.display.max_colwidth = None

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer, pipeline, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, TextClassificationPipeline, Seq2SeqTrainer, BertForSequenceClassification
from datasets import Dataset
from tqdm import tqdm
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
# Read the Task A training CSV; later cells use its Premise, Conclusion and label columns.
taska_training_df = pd.read_csv(filepath_or_buffer='../data/TaskA_train.csv')

In [4]:
# Checkpoint directory shared by the tokenizer and the seq2seq model; kept in a single
# constant so the two loads cannot drift apart.
CONCLUSION_GEN_MODEL_DIR = "../../data-ceph/arguana/arg-generation/conclusion-generation-models/dbart"

conclusion_gen_tokenizer = AutoTokenizer.from_pretrained(CONCLUSION_GEN_MODEL_DIR)
# The model is moved to `device` once here; generate_conclusion moves each input batch
# to the same device before calling generate().
conclusion_gen_model = AutoModelForSeq2SeqLM.from_pretrained(CONCLUSION_GEN_MODEL_DIR).to(device)

In [17]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
        
def generate_conclusion(premises, gen_kwargs, batch_size=16):
    """Generate conclusions for a collection of premises.

    Relies on the module-level ``conclusion_gen_tokenizer``, ``conclusion_gen_model``
    and ``device``.

    Args:
        premises: Sequence of premises. Each item is either a string or a list/tuple
            of strings; the latter is joined with single spaces into one input text.
        gen_kwargs: Keyword arguments forwarded unchanged to
            ``conclusion_gen_model.generate``.
        batch_size: Number of premises per DataLoader batch.

    Returns:
        A flat list of decoded strings (special tokens stripped), in input order.
        How many strings are produced per premise is determined by ``gen_kwargs``.
        An empty ``premises`` yields an empty list.
    """
    # Nothing to tokenize or generate for empty input (previously an IndexError on
    # premises[0]).
    if len(premises) == 0:
        return []

    # Decide per item whether joining is needed, instead of inspecting only the first
    # element; a mixed list no longer ends up character-joining plain strings.
    premises = [' '.join(p) if isinstance(p, (list, tuple)) else p for p in premises]

    ds = Dataset.from_dict({'premises': premises})
    ds = ds.map(
        lambda x: conclusion_gen_tokenizer(x['premises'], max_length=512, truncation=True, padding='max_length'),
        batched=True,
    )
    ds.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    dataloader = torch.utils.data.DataLoader(ds, batch_size=batch_size)

    generated_conclusion = []

    conclusion_gen_model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            generated_tokens = conclusion_gen_model.generate(
                input_ids,
                attention_mask=attention_mask,
                **gen_kwargs
            )

            # Unwrap before converting: a tuple has no .cpu(), so performing this
            # check after .cpu().numpy() (as before) could never take effect.
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]

            generated_tokens = generated_tokens.cpu().numpy()

            decoded_preds = conclusion_gen_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            generated_conclusion.extend(decoded_preds)

    return generated_conclusion

In [18]:
# Diverse (group) beam search: 20 beams in 5 groups, 5 returned sequences per premise.
# Decoding is non-sampling (do_sample=False); top_p / top_k are sampling-only options
# in Hugging Face `generate`, so they are not passed here.
gen_kwargs = {
    "do_sample": False,
    "num_beams": 20,
    "num_beam_groups": 5,
    "diversity_penalty": 0.3,
    "num_return_sequences": 5,
}

conclusions = generate_conclusion(taska_training_df.Premise.tolist(), gen_kwargs, batch_size=8)

# Regroup the flat output per premise using the same count that was requested from the
# model, so the grouping cannot silently go out of sync with gen_kwargs.
n_per_premise = gen_kwargs["num_return_sequences"]
assert len(conclusions) == n_per_premise * len(taska_training_df), (
    "expected num_return_sequences generated conclusions per premise"
)
taska_training_df['gen_conclusions'] = list(chunks(conclusions, n_per_premise))

  0%|          | 0/1 [00:00<?, ?ba/s]

  next_indices = next_tokens // vocab_size
  num_beams * (beam_idx // group_size) + group_start_idx + (beam_idx % group_size)
100%|██████████| 94/94 [01:21<00:00,  1.15it/s]


In [20]:
# Preview five random rows; the fixed seed makes the preview reproducible on re-run.
taska_training_df.sample(n=5, random_state=0)

Unnamed: 0,topic,Premise,Conclusion,Validity,Validity-Confidence,Novelty,Novelty-Confidence,gen_conclusions
233,Trying terrorist suspects in civilian courts,"Which brings us to the idea that allowing Mr. Mohammed to take the stand will give him a soapbox. The truth is, if the trial provides a propaganda platform for anybody, it will be for our side. [...] First, federal courts do not permit TV cameras in the courtroom, so the opportunity for “real time” jihadist propagandizing won’t exist.",There will be no TV cameras in trials of terror suspects.,1,majority,-1,very confident,"[No TV cameras should be allowed in trials of terror suspects, No TV cameras should be allowed in trials of terror suspects., No TV cameras can be allowed in the courtroom for terror suspects., No TV cameras should be allowed in the courtroom for terror suspects., No TV cameras should be allowed in the courtroom for the terror suspects.]"
96,Trying 9/11 terror suspects in NYC courts,"The politics of the trial stems from Obama’s decision to announce the closing of Guantanamo before he had lined up takers for its prisoners. Now with unemployment at 10.2 percent, he could hold an auction for the officials in Illinois, Montana, Michigan and Colorado clamoring for the detainees to fill their underused prisons. The politician who wins the detainees isn’t going to get booted from office. He’s going to get a parade.",Detainees can be sold to fill empty prisons,1,very confident,-1,majority,"[The detainees will have nothing to do with politics at all., The detainees will have nothing to do with politics., The prisoner will have nothing to do with politics or the prison system., The detainees will have nothing to do with politics or the economy., The prisoner will have nothing to do with politics or the economy.]"
13,TV viewing is harmful to children,"Television has become a temple of mass production, shallow values and stereotypes that have a great influence on modern society. This negative effect spreads with the growing popularity of TV, especially among young people and children. It defiantly changes our society for the worse, trivialising culture and making us all conform to a bland, ""Hollywood"" model of entertainment in which regional traditions and diversity are lost.",Television is a temple of shallow values,1,very confident,-1,very confident,"[Television is mostly directed to films and shows that have commercial purposes but little cultural value:, Television is mostly directed to films and shows that have commercial purposes but little cultural value., Television is mostly directed to films and shows that have commercial purposes but little cultural value:, Television is mostly directed to films and shows which have commercial purposes but little cultural value:, Television is mostly directed to films and shows that have commercial purposes but little cultural value.]"
30,Torture,"Torture puts the torturer in a position of dominance and abuse that has a brutalizing effect. This brutalizing effect is dehumanizing, or at least it defeats the virtues of compassion, empathy, and dignity that define a good human being, perhaps in God's image.","Torture has a brutalizing, dehumanizing effect",1,very confident,-1,very confident,"[Torture is ineffective at getting prisoners to reveal desired information., A painful execution, which can be regarded as torture, damages the nation's international reputation., It is morally acceptable for prisons to focus on rehabilitation., Torture is immoral because it violates human rights and dignity., A painful execution, which can be regarded as torture, damages the nation's international international reputation.]"
131,Trying 9/11 terror suspects in NYC courts,"As for emotional pain, one of the advances in the U.S. criminal justice system is giving victims a place at trial and a chance to be heard. The families wouldn’t get that in Guantanamo.",Obama administration consulted with Congress on Libya,-1,very confident,-1,very confident,"[Torture is ineffective in getting prisoners to reveal desired information., Torture can cause psychological, material and sometimes physical pain for the detainee., Torture can cause psychological, material and sometimes physical pain for victims., Torture can cause psychological, material and sometimes physical pain for the prisoners., Torture can cause psychological, material and sometimes physical pain for victims.]"


In [23]:
# Build the augmented training table: each annotated row is kept as is and followed by
# one extra row per generated conclusion for the same topic/premise.
label_columns = ['topic', 'Premise', 'Conclusion', 'Validity', 'Validity-Confidence', 'Novelty', 'Novelty-Confidence']

output_data = []
for row_idx, source_row in taska_training_df.iterrows():
    # Original, human-annotated example.
    output_data.append(tuple(source_row[col] for col in label_columns))
    # Generated conclusions get fixed labels: Validity=1, Novelty=-1, both confidences 'Low'.
    output_data.extend(
        (source_row['topic'], source_row['Premise'], generated, 1, 'Low', -1, 'Low')
        for generated in source_row['gen_conclusions']
    )

new_df = pd.DataFrame(output_data, columns=label_columns)

In [24]:
# First rows of the augmented table: an annotated row followed by its generated variants.
new_df.head(n=5)

Unnamed: 0,topic,Premise,Conclusion,Validity,Validity-Confidence,Novelty,Novelty-Confidence
0,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",Depression is a well-known psychological problem of modern society that is partly caused by TV watching:,1,confident,1,confident
1,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",This phenomenon is partly caused by TV watching:,1,Low,-1,Low
2,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",This phenomenon is partly caused by TV watching:,1,Low,-1,Low
3,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",This phenomenon is partly caused by TV watching:,1,Low,-1,Low
4,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",The constant comparisons of different cultures and events in popular media can [exacerbate]( these contradictions.,1,Low,-1,Low


In [29]:
# Validity label distribution of the original training data.
taska_training_df['Validity'].value_counts()

 1    401
-1    320
 0     29
Name: Validity, dtype: int64

In [28]:
# Validity label distribution after augmentation; generated rows are all added with Validity=1.
new_df['Validity'].value_counts()

 1    4151
-1     320
 0      29
Name: Validity, dtype: int64

In [30]:
# Persist the augmented training table as a pickle file.
new_df.to_pickle(path='../data/TaskA_train_with_extra_conclusions.pkl')