In [1]:
import json
import pandas as pd
import numpy as np

# Show full cell contents (premises and conclusions are long strings) instead of truncating them.
pd.options.display.max_colwidth = None

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer, pipeline, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, TextClassificationPipeline, Seq2SeqTrainer, BertForSequenceClassification
from datasets import Dataset
from tqdm import tqdm
import torch
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Training split; later cells read its topic, Premise, Conclusion, Validity and Novelty columns.
taska_training_df = pd.read_csv(filepath_or_buffer='../data/TaskA_train.csv')

In [4]:
# Single source of truth for the checkpoint directory, so the tokenizer and the
# model can never be loaded from two different locations by accident.
CONCLUSION_GEN_MODEL_DIR = "../../data-ceph/arguana/arg-generation/conclusion-generation-models/dbart"

conclusion_gen_tokenizer = AutoTokenizer.from_pretrained(CONCLUSION_GEN_MODEL_DIR)
# The model is moved to `device` once here; inputs are moved per batch at generation time.
conclusion_gen_model = AutoModelForSeq2SeqLM.from_pretrained(CONCLUSION_GEN_MODEL_DIR).to(device)

In [5]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Slices are produced in order; the last one is shorter when ``len(lst)``
    is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)
        
def generate_conclusion(premises, gen_kwargs, batch_size=16):
    """Generate conclusions for a list of premises with the seq2seq model.

    Uses the notebook-level ``conclusion_gen_tokenizer``,
    ``conclusion_gen_model`` and ``device``.

    Args:
        premises: List of premise strings, or a list whose items are
            lists/tuples of strings; nested items are joined with a single
            space before tokenization.
        gen_kwargs: Keyword arguments forwarded unchanged to
            ``conclusion_gen_model.generate``.
        batch_size: Number of premises fed to the model per batch.

    Returns:
        A flat list of decoded strings (special tokens stripped), in batch
        order. Returns ``[]`` for empty input. When ``gen_kwargs`` requests
        several sequences per input, the list holds all of them flattened
        (presumably grouped per premise -- the caller relies on this; confirm
        against the ``generate`` documentation).
    """
    # Guard: the original indexed premises[0] unconditionally and crashed on empty input.
    if len(premises) == 0:
        return []

    if isinstance(premises[0], (list, tuple)):
        premises = [' '.join(x) for x in premises]

    ds = Dataset.from_dict({'premises': premises})
    # Every example is truncated/padded to exactly 512 tokens so batches stack into tensors.
    ds = ds.map(
        lambda x: conclusion_gen_tokenizer(x['premises'], max_length=512, truncation=True, padding='max_length'),
        batched=True,
    )
    ds.set_format(type='torch', columns=['input_ids', 'attention_mask'])
    dataloader = torch.utils.data.DataLoader(ds, batch_size=batch_size)

    generated_conclusion = []

    conclusion_gen_model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            generated_tokens = conclusion_gen_model.generate(
                input_ids,
                attention_mask=attention_mask,
                **gen_kwargs
            )

            # Unpack BEFORE touching tensor methods: if generate() hands back a
            # tuple (or an indexable output object, e.g. when
            # return_dict_in_generate=True is passed) the token ids are its
            # first element. Previously this check ran after .cpu() and could
            # never trigger.
            if not torch.is_tensor(generated_tokens):
                generated_tokens = generated_tokens[0]

            generated_tokens = generated_tokens.cpu().numpy()

            decoded_preds = conclusion_gen_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            generated_conclusion += decoded_preds

    return generated_conclusion

In [6]:
# Beam-search settings: 20 beams in 5 groups with a diversity penalty,
# returning 5 candidate conclusions per premise.
gen_kwargs = {
    "do_sample": False,
    #"min_length":20,
    # NOTE(review): top_p / top_k are sampling parameters; with do_sample=False
    # they presumably have no effect -- confirm and drop if so.
    "top_p": 0.90,
    "top_k": 50,
    "num_beams": 20,
    "num_beam_groups": 5,
    "diversity_penalty": 0.3,
    "num_return_sequences": 5,
}

conclusions = generate_conclusion(taska_training_df.Premise.tolist(), gen_kwargs, batch_size=8)

# Derive the group size from the generation config instead of repeating the
# literal 5, so changing num_return_sequences cannot silently misalign rows.
num_candidates = gen_kwargs["num_return_sequences"]
assert len(conclusions) == num_candidates * len(taska_training_df), (
    f"expected {num_candidates} conclusions per premise, "
    f"got {len(conclusions)} for {len(taska_training_df)} premises"
)
taska_training_df['gen_conclusions'] = list(chunks(conclusions, num_candidates))



  0%|          | 0/1 [00:00<?, ?ba/s]

  next_indices = next_tokens // vocab_size
  num_beams * (beam_idx // group_size) + group_start_idx + (beam_idx % group_size)
100%|██████████| 94/94 [01:21<00:00,  1.15it/s]


In [7]:
# Spot-check five random rows. Seeded so the preview is stable across
# Restart & Run All (previously 10 rows were drawn and 5 discarded, unseeded).
taska_training_df.sample(n=5, random_state=42)

Unnamed: 0,topic,Premise,Conclusion,Validity,Validity-Confidence,Novelty,Novelty-Confidence,gen_conclusions
211,Trying terrorist suspects in civilian courts,"Sen. John McCain, R-Ariz., compared the decision to try accused 9/11 mastermind Khalid Sheikh Mohammed and other terrorists in New York City to trying Nazi war criminal Hermann Goering in San Francisco. ""It's ridiculous. These are war criminals and terrorists and they should not be privy to regular courtroom procedures.",Justice Department Defense 1/27/06,-1,very confident,-1,very confident,"[War criminals should not be tried in civilian courts., War criminals should not be tried in civilian courts., War criminals should not be tried in civilian courts, War criminals should not have to trial in civilian courts., War criminals should not have to trial in civilian courts.]"
680,Unilateral US military strike inside Pakistan,"Pakistan has their own version of the PAL system that the U.S. uses to secure weapons. In fact the Pakistani warheads are most likely safer than the Russian nuclear missiles. Several government officials, and generals have reported from Pakistan firsthand that the weapons are secured.",Pakistan has a better nuclear weapon system than Russia,1,confident,0,defeasible,"[Pakistan has its own nuclear weapons. The US should not be worried about them., Pakistan has its own nuclear weapons and is prepared for a war against Russia., Pakistan has its own nuclear weapons and is therefore capable of defending itself against the US., Pakistan has its own nuclear weapons and is therefore capable of defending itself against the United States., Pakistan has its own nuclear weapons. The US should not be concerned about its nuclear capabilities.]"
632,US-Indian nuclear deal,"Blackwill said, ""Some experts say this apparent U.S. favoritism toward India could increase the nuclear rivalry between the intensely competitive nations, and potentially raise tensions in the already dangerous region. 'My impression is that [the Pakistanis] are worried this will feed the Indian nuclear weapons program and therefore weaken deterrence.' Other experts say the two countries, both admittedly now nuclear, could be forced to deal more cautiously with each other.",US-Indian nuclear deal could weaken Pakistan's nuclear program,1,majority,-1,majority,"[US-India nuclear relations will weaken US-Nuclear deterrence, US-India nuclear relations will weaken US-Nuclear deterrence., US-India nuclear relations will weaken US-Nuclear deterrence:, US-India nuclear relations will weaken US-India deterrence, US-India nuclear relations will weaken US-India deterrence.]"
493,US health care reform,No bill this unpopular has ever before passed on a straight party-line vote. We're in a new political world. I'm not sure I understand it.,Two-party systems are more stable:,-1,very confident,-1,very confident,"[I don't think Obamacare is that bad., I don't think Obamacare is that bad., I don't think Obamacare is that big of a deal., I don't think Obamacare is that bad., I don't think Obamacare is that bad,]"
171,Trying terrorist suspects in civilian courts,"there is no longer much difference between a military and civilian trial. After the U.S. Supreme Court weighed in on the shortcomings of George W. Bush’s tribunals, Congress and the Obama administration granted more rights to the accused."" Therefore, trying terrorists in civilian courts is little different, and not more risky.",Terrorist suspects can be tried in civilian courts,1,majority,-1,very confident,"[Trying terrorists in civilian courts is no different than civilian trials, Trying terrorists in civilian courts is no different than civilian trials., Trying terrorists in civilian courts is no worse than trying terrorists in military courts, Trying terrorists in civilian courts is no worse than trying terrorists in military cases, Trying terrorists in civilian courts is no different than trying terrorists in military cases]"


In [8]:
# Expand each annotated example into 1 + len(gen_conclusions) rows: the
# original row with its own labels, followed by one row per generated
# conclusion labelled Validity=1 / Novelty=-1, both with confidence 'Low'.
output_data = []
for _, example in taska_training_df.iterrows():
    topic, premise = example['topic'], example['Premise']
    annotated = (
        topic,
        premise,
        example['Conclusion'],
        example['Validity'],
        example['Validity-Confidence'],
        example['Novelty'],
        example['Novelty-Confidence'],
    )
    generated = [
        (topic, premise, generated_conclusion, 1, 'Low', -1, 'Low')
        for generated_conclusion in example['gen_conclusions']
    ]
    # Keep the annotated row immediately ahead of its generated variants.
    output_data += [annotated, *generated]

new_df = pd.DataFrame(
    output_data,
    columns=[
        'topic',
        'Premise',
        'Conclusion',
        'Validity',
        'Validity-Confidence',
        'Novelty',
        'Novelty-Confidence',
    ],
)

In [9]:
# First rows: one annotated example followed by its generated conclusions.
new_df.head(n=5)

Unnamed: 0,topic,Premise,Conclusion,Validity,Validity-Confidence,Novelty,Novelty-Confidence
0,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",Depression is a well-known psychological problem of modern society that is partly caused by TV watching:,1,confident,1,confident
1,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",This phenomenon is partly caused by TV watching:,1,Low,-1,Low
2,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",This phenomenon is partly caused by TV watching:,1,Low,-1,Low
3,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",This phenomenon is partly caused by TV watching:,1,Low,-1,Low
4,TV viewing is harmful to children,"The popularity of TV watching is among the reasons of this phenomenon. Violence, aggression, crimes and wars are broadcast through the daily news as well as in movies, showing dark pictures that encourage psychological tension, pessimism and negative emotions.",The constant comparisons of different cultures and events in popular media can [exacerbate]( these contradictions.,1,Low,-1,Low


In [10]:
# Validity label distribution before adding generated conclusions.
taska_training_df['Validity'].value_counts()

 1    401
-1    320
 0     29
Name: Validity, dtype: int64

In [11]:
# Validity label distribution after augmentation (every generated row carries Validity=1).
new_df['Validity'].value_counts()

 1    4151
-1     320
 0      29
Name: Validity, dtype: int64

In [12]:
# Persist the augmented training set as a pickle file.
new_df.to_pickle(path='../data/TaskA_train_with_extra_conclusions.pkl')