# Install Requirements

In [None]:
!pip install transformers==4.28.1 datasets

# Connect to Drive

In [2]:
from google.colab import drive

# Mount Google Drive so that the corpus files under /content/drive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import glob
from datasets import load_dataset
from datasets import Dataset

# Load Corpus

In [4]:
import pickle

# Load the pickled corpus from Drive into `dataset`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by a trusted source.
# The `with` block closes the file even when unpickling raises.
with open("/content/drive/MyDrive/Corpus/CG_Corpus/cg_3to1_2previous_event_selection.dat", "rb") as f:
  dataset = pickle.load(f)

In [5]:
# Display the loaded object to check its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 970
    })
    test: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 325
    })
})

# Augment the corpus by translating events with FLAN-T5


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Model and tokenizer are both loaded from the same checkpoint name.
checkpoint = "google/flan-t5-base"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Move the model to the GPU. The cells below also send the tokenized inputs
# to 'cuda', so a CUDA runtime is required for this notebook.
model.to('cuda')

## Example

In [8]:
# Sample sentence used to try out the translation prompts.
event = "i am a student"

In [9]:
# Translate the sample sentence into each target language and print the
# decoded result, one list per language, in the order listed here.
for target_language in ("French", "German", "Spanish", "Italian"):
  prompt = f"translate English to {target_language}: {event}"
  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
  outputs = model.generate(**inputs, max_length=256)
  print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Je suis un étudiant']
['Ich bin ein Student']
['yo soy un estudiante']
['io è un studente']


## Paraphrasing on minority classes (CT-, PS, NB)

### old prompt

In [10]:
# Augment the training split for records whose Bel(A) label is 2.
# For every matching record the Event text is translated into each target
# language; each translation is appended as a new row that copies all other
# fields of the source record unchanged.
#
# NOTE: the appended Event texts are translations (not English paraphrases).
# NOTE: this cell is not idempotent. The appended rows keep the same Bel(A)
# label, so running the cell a second time would also translate the rows that
# were added by the first run. Reload the corpus before re-running.
target_label = 2
# Italian was disabled (commented out) in the original version of this cell.
target_languages = ("French", "German", "Spanish")

train_split = dataset["train"]
augmented_records = []
sum_paraphrased = 0
for record in train_split:
  if record['Bel(A)'] != target_label:
    continue
  # `event` replaces the former `input` variable, which shadowed the builtin.
  event = str(record['Event']).strip()
  for language in target_languages:
    prompt_text = f"""translate English to {language}: {event}"""
    inputs = tokenizer.encode_plus(prompt_text, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512, num_beams=4, early_stopping=True)
    new_record = record.copy()
    new_record['Event'] = tokenizer.decode(outputs[0], skip_special_tokens=True)
    augmented_records.append(new_record)
  # Counts source sentences, not generated rows.
  sum_paraphrased += 1

# Rebuild the train split once, after all translations have been generated,
# instead of converting the whole split to a dict and back for every new row.
# New rows are appended in generation order (per record: French, German, Spanish).
if augmented_records:
  dataset_dict = train_split.to_dict()
  for new_record in augmented_records:
    for key in new_record:
      dataset_dict[key].append(new_record[key])
  dataset["train"] = Dataset.from_dict(dataset_dict)

print(f"Paraphrased sentences: {sum_paraphrased}")

Paraphrased sentences: 54


In [11]:
# Augment the training split for records whose Bel(A) label is 3.
# For every matching record the Event text is translated into each target
# language; each translation is appended as a new row that copies all other
# fields of the source record unchanged.
#
# NOTE: the appended Event texts are translations (not English paraphrases).
# NOTE: this cell is not idempotent. The appended rows keep the same Bel(A)
# label, so running the cell a second time would also translate the rows that
# were added by the first run. Reload the corpus before re-running.
target_label = 3
# Italian was disabled (commented out) in the original version of this cell.
target_languages = ("French", "German", "Spanish")

train_split = dataset["train"]
augmented_records = []
sum_paraphrased = 0
for record in train_split:
  if record['Bel(A)'] != target_label:
    continue
  # `event` replaces the former `input` variable, which shadowed the builtin.
  event = str(record['Event']).strip()
  for language in target_languages:
    prompt_text = f"""translate English to {language}: {event}"""
    inputs = tokenizer.encode_plus(prompt_text, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512, num_beams=4, early_stopping=True)
    new_record = record.copy()
    new_record['Event'] = tokenizer.decode(outputs[0], skip_special_tokens=True)
    augmented_records.append(new_record)
  # Counts source sentences, not generated rows.
  sum_paraphrased += 1

# Rebuild the train split once, after all translations have been generated,
# instead of converting the whole split to a dict and back for every new row.
# New rows are appended in generation order (per record: French, German, Spanish).
if augmented_records:
  dataset_dict = train_split.to_dict()
  for new_record in augmented_records:
    for key in new_record:
      dataset_dict[key].append(new_record[key])
  dataset["train"] = Dataset.from_dict(dataset_dict)

print(f"Paraphrased sentences: {sum_paraphrased}")

Paraphrased sentences: 78


In [12]:
# Augment the training split for records whose Bel(A) label is 4.
# For every matching record the Event text is translated into each target
# language; each translation is appended as a new row that copies all other
# fields of the source record unchanged.
#
# NOTE: the appended Event texts are translations (not English paraphrases).
# NOTE: this cell is not idempotent. The appended rows keep the same Bel(A)
# label, so running the cell a second time would also translate the rows that
# were added by the first run. Reload the corpus before re-running.
target_label = 4
# Italian was disabled (commented out) in the original version of this cell.
target_languages = ("French", "German", "Spanish")

train_split = dataset["train"]
augmented_records = []
sum_paraphrased = 0
for record in train_split:
  if record['Bel(A)'] != target_label:
    continue
  # `event` replaces the former `input` variable, which shadowed the builtin.
  event = str(record['Event']).strip()
  for language in target_languages:
    prompt_text = f"""translate English to {language}: {event}"""
    inputs = tokenizer.encode_plus(prompt_text, padding='max_length', max_length=512, return_tensors='pt').to('cuda')
    outputs = model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512, num_beams=4, early_stopping=True)
    new_record = record.copy()
    new_record['Event'] = tokenizer.decode(outputs[0], skip_special_tokens=True)
    augmented_records.append(new_record)
  # Counts source sentences, not generated rows.
  sum_paraphrased += 1

# Rebuild the train split once, after all translations have been generated,
# instead of converting the whole split to a dict and back for every new row.
# New rows are appended in generation order (per record: French, German, Spanish).
if augmented_records:
  dataset_dict = train_split.to_dict()
  for new_record in augmented_records:
    for key in new_record:
      dataset_dict[key].append(new_record[key])
  dataset["train"] = Dataset.from_dict(dataset_dict)

print(f"Paraphrased sentences: {sum_paraphrased}")

Paraphrased sentences: 38


# Save Augmented Corpus with Translation

In [13]:
# Display the dataset after augmentation to check the new train row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 1480
    })
    test: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 325
    })
})

In [14]:
import pickle

# Write the augmented dataset to Drive.
# The `with` block closes (and therefore flushes) the file as soon as the dump
# finishes; previously the handle was never closed, so the data on disk could
# be incomplete when the file is read back.
with open("/content/drive/MyDrive/Corpus/CG_Corpus/cg_3to1_2previous_event_selection_aug.dat", "wb") as f:
  pickle.dump(dataset, f)

In [15]:
import pickle

# Read the augmented corpus back from Drive to verify that the saved file loads.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that were produced by a trusted source.
# The `with` block closes the file even when unpickling raises.
with open("/content/drive/MyDrive/Corpus/CG_Corpus/cg_3to1_2previous_event_selection_aug.dat", "rb") as f:
  dataset = pickle.load(f)

In [16]:
# Display the reloaded dataset; its row counts should match the saved one.
dataset

DatasetDict({
    train: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 1480
    })
    test: Dataset({
        features: ['Speaker', 'Sentence_Number', 'Sentence', 'Event', 'Target_Event', 'Bel(A)', 'Bel(B)', 'CG(A)', 'CG(B)'],
        num_rows: 325
    })
})

In [None]:
# cg_3to1_2previous_event_selection_aug
# cg_3to1_previous_speaker_base_event_selection_aug