In [None]:
!pip install simpletransformers

In [None]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Select the 'file_system' sharing strategy for torch.multiprocessing.
torch.multiprocessing.set_sharing_strategy('file_system')

# Only the last expression of a cell is echoed, so the device and the memory
# figure are printed explicitly instead of being evaluated and discarded.
print(f'device: {device}')
if device.type == 'cuda':
    # The CUDA memory counter is only queried when a CUDA device was selected.
    print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')

In [None]:
# Interactive run configuration. These notebook-level variables are read by
# the prediction functions defined in the cells below.
model_dataset = input('Dataset to train model (mix, twit0.825 or combined):  ')
training_type = input('Regular or limited training (regular or limited): ')
epochs = int(input('Number of training epochs: '))

testing_set = input('Dataset for prediction generation (mix, twit0.825, or combined): ')

# 'regular' evaluates on the 'testing' split, 'limited' swaps the two splits.
if training_type.lower() == 'regular':
    train_set = 'training'
    eval_set = 'testing'
elif training_type.lower() == 'limited':
    train_set = 'testing'
    eval_set = 'training'
else:
    # Fail here rather than letting a later cell hit an undefined (or stale)
    # `eval_set` after the model has already been loaded.
    raise ValueError(
        f"Please enter a valid training type ('regular' or 'limited'), got {training_type!r}"
    )
    
# Keys processed by the driver loops in the cells below. 't5', 'bart' and
# 'gpt' select the corresponding pretrained base model; every other key is
# used as the filter part of a fine-tuned model name and of the evaluation
# file name. Uncomment an entry to include it in the run.
emo_filter_list = [
    # 't5',
    'bart',
    # 'gpt',
    # 'nil',
    # 'sid',
    # 'sid_rg',
    'emo',
    'emo_ge',
    # 'emo_nn',
    # 'emo_sid',
    # 'emo_sid_nn',
    # 'emo_sid_tg',
    # 'emo_sid_tg_nn',
    # 'emo_sid_tg_ge',
    # 'emo_sid_tg_nn_ge',
]

In [None]:
# Generate predictions with fine-tuned T5 models 
import pandas as pd
import os
from simpletransformers.t5 import T5Model

def genPredictions(model_emo_filter, eval_emo_filter):
    """Generate T5 predictions for one evaluation file and save them as a TSV.

    Reads
    ``emotion-labeled-data/{model_dataset}/{testing_set}-{eval_emo_filter}-{eval_set}_t5.tsv``,
    sends ``"<prefix>: <input_text>"`` for every row to the model, keeps the
    first returned sequence per row and writes the frame, with an added
    ``predictions`` column, to
    ``predictions-data/{model_name}/{testing_set}-{eval_emo_filter}-predictions.tsv``.

    Relies on the notebook-level variables ``training_type``,
    ``model_dataset``, ``epochs``, ``testing_set`` and ``eval_set``.

    NOTE: a later cell of this notebook defines a BART function under the
    same name; whichever definition was executed last is the one that runs.

    Args:
        model_emo_filter: ``'t5'`` selects the pretrained ``t5-base``
            checkpoint. Any other value is inserted into the fine-tuned model
            name ``{training_type}-{model_dataset}-{model_emo_filter}-{epochs}epochs``.
        eval_emo_filter: Filter key that selects the evaluation TSV and names
            the output file.

    Returns:
        None. The result is written to disk.
    """
    use_base_model = model_emo_filter == 't5'
    if use_base_model:
        model_name = 't5-base'
    else:
        model_name = f"{training_type}-{model_dataset}-{model_emo_filter}-{epochs}epochs"
    print(f'---- Generating Predictions for Model: {model_name}  on Dataset: {testing_set}-{eval_emo_filter}----')

    # Sampling-based generation settings passed to the model.
    model_args = {
        "overwrite_output_dir": True,
        "max_seq_length": 196,
        "eval_batch_size": 32,
        "num_train_epochs": 1,
        "use_multiprocessing": False,
        "num_beams": None,
        "do_sample": True,
        "max_length": 50,
        "top_k": 50,
        "top_p": 0.95,
        "num_return_sequences": 3,
    }

    model = T5Model("t5", model_name, args=model_args)
    df = pd.read_csv(f'emotion-labeled-data/{model_dataset}/{testing_set}-{eval_emo_filter}-{eval_set}_t5.tsv', sep="\t").astype(str)

    # The prompt format is the same for the base and the fine-tuned models.
    to_predict = [
        prefix + ": " + str(input_text)
        for prefix, input_text in zip(df["prefix"].tolist(), df["input_text"].tolist())
    ]

    # Each prediction holds several candidates; only the first one is kept.
    preds = [pred[0] for pred in model.predict(to_predict)]

    if use_base_model:
        # Keep only the text after the last ':' of the base model's output.
        preds = [pred[pred.rfind(':')+1:] for pred in preds]

    df["predictions"] = preds
    # The saved index column is absent when the TSV was written without an
    # index; do not lose the finished predictions over it.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # makedirs also creates a missing 'predictions-data' parent directory and
    # tolerates an already existing target.
    path = f'predictions-data/{model_name}'
    os.makedirs(path, exist_ok=True)

    df.to_csv(f'predictions-data/{model_name}/{testing_set}-{eval_emo_filter}-predictions.tsv', sep="\t")
    
    
# Run the T5 prediction function for every selected key.
for emo_filter in emo_filter_list:
    if emo_filter in ('bart', 'gpt'):
        # These keys select the BART / GPT base models handled by other cells;
        # they do not name a fine-tuned T5 model or an evaluation file.
        continue
    if (emo_filter == 't5'):
        # The pretrained base model is evaluated on two filtered datasets.
#        genPredictions('t5', 'nil')
        genPredictions('t5', 'emo')
        genPredictions('t5', 'emo_ge')
#        genPredictions('nil', 'emo_ge')
    else:
        # A fine-tuned model is evaluated on the dataset with the same filter.
        genPredictions(emo_filter, emo_filter)

In [None]:
# Cross-evaluation: the model whose name is built from the 'nil' filter is run
# on the 'emo_ge' evaluation file. Uses whichever genPredictions definition
# was executed most recently in this kernel.
genPredictions('nil', 'emo_ge')

In [None]:
# (model filter, evaluation filter) pairs: each model is run on an evaluation
# file that carries a different filter than the model's own.
for model_filter, eval_filter in (
    ('emo', 'emo_sid_tg_nn'),
    ('emo_ge', 'emo_sid_tg_nn_ge'),
):
    genPredictions(model_filter, eval_filter)

In [None]:
# (model filter, evaluation filter) pairs, run in this order.
for model_filter, eval_filter in (
    ('emo', 'emo'),
    ('emo_ge', 'emo_ge'),
    ('t5', 'emo'),
    ('t5', 'emo_ge'),
):
    genPredictions(model_filter, eval_filter)

In [None]:
# Run the 't5' model key on the 'emo_sid_tg_nn_ge' evaluation file.
# NOTE(review): 't5' maps to the pretrained 't5-base' checkpoint only in the
# T5 definition of genPredictions; confirm that definition is the active one.
genPredictions('t5', 'emo_sid_tg_nn_ge')

In [None]:
# (model filter, evaluation filter) pairs, run in this order. Models without
# the '_ge' suffix use the 'emo_sid_tg_nn' file, the '_ge' models use
# 'emo_sid_tg_nn_ge'.
for model_filter, eval_filter in (
    ('emo', 'emo_sid_tg_nn'),
    ('emo_nn', 'emo_sid_tg_nn'),
    ('emo_sid', 'emo_sid_tg_nn'),
    ('emo_sid_nn', 'emo_sid_tg_nn'),
    ('emo_sid_tg', 'emo_sid_tg_nn'),
    ('emo_sid_tg_nn', 'emo_sid_tg_nn'),
    ('emo_sid_tg_ge', 'emo_sid_tg_nn_ge'),
    ('emo_sid_tg_nn_ge', 'emo_sid_tg_nn_ge'),
):
    genPredictions(model_filter, eval_filter)

In [None]:
# Generate predictions with fine-tuned Bart models 
import pandas as pd
import os
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

def genPredictions(model_emo_filter, eval_emo_filter):
    """Generate BART predictions for one evaluation file and save them as a TSV.

    Reads
    ``emotion-labeled-data/{model_dataset}/{testing_set}-{eval_emo_filter}-{eval_set}_t5.tsv``,
    sends ``"<prefix>: <input_text>"`` for every row to the model, keeps the
    first returned sequence per row, cuts everything up to the last ``':'``
    from each prediction and writes the frame, with an added ``predictions``
    column, to
    ``predictions-data/{model_name}/{testing_set}-{eval_emo_filter}-predictions.tsv``.

    Relies on the notebook-level variables ``training_type``,
    ``model_dataset``, ``epochs``, ``testing_set`` and ``eval_set``.

    NOTE: this definition replaces the T5 function of the same name defined
    in an earlier cell of this notebook.

    Args:
        model_emo_filter: ``'bart'`` selects the pretrained
            ``facebook/bart-base`` checkpoint. Any other value is inserted
            into the fine-tuned model name
            ``{training_type}-{model_dataset}-{model_emo_filter}-{epochs}epochs-bart``.
        eval_emo_filter: Filter key that selects the evaluation TSV and names
            the output file.

    Returns:
        None. The result is written to disk.
    """
    if model_emo_filter == 'bart':
        model_name = "facebook/bart-base"
    else:
        model_name = f"{training_type}-{model_dataset}-{model_emo_filter}-{epochs}epochs-bart"
    print(f'---- Generating Predictions for Model: {model_name}  on Dataset: {testing_set}-{eval_emo_filter}----')

    # Sampling-based generation settings passed to the model.
    model_args = {
        "overwrite_output_dir": True,
        "max_seq_length": 196,
        "eval_batch_size": 32,
        "num_train_epochs": 1,
        "use_multiprocessing": False,
        "num_beams": None,
        "do_sample": True,
        "max_length": 50,
        "top_k": 50,
        "top_p": 0.95,
        "num_return_sequences": 3,
    }

    model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name=model_name,
        args=model_args,
    )
    df = pd.read_csv(f'emotion-labeled-data/{model_dataset}/{testing_set}-{eval_emo_filter}-{eval_set}_t5.tsv', sep="\t").astype(str)

    # The prompt format is the same for the base and the fine-tuned models.
    to_predict = [
        prefix + ": " + str(input_text)
        for prefix, input_text in zip(df["prefix"].tolist(), df["input_text"].tolist())
    ]

    # Each prediction holds several candidates; only the first one is kept.
    preds = [pred[0] for pred in model.predict(to_predict)]

    # Applied to every model: keep only the text after the last ':'.
    # A prediction without ':' is kept unchanged (rfind returns -1).
    preds = [pred[pred.rfind(':')+1:] for pred in preds]

    df["predictions"] = preds
    # The saved index column is absent when the TSV was written without an
    # index; do not lose the finished predictions over it.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # The base model name contains a '/', so nested directories are needed.
    path = f'predictions-data/{model_name}'
    os.makedirs(path, exist_ok=True)

    df.to_csv(f'predictions-data/{model_name}/{testing_set}-{eval_emo_filter}-predictions.tsv', sep="\t")
    
    
# Run the BART prediction function for every selected key.
for emo_filter in emo_filter_list:
    if emo_filter in ('t5', 'gpt'):
        # These keys select the T5 / GPT base models handled by other cells;
        # they do not name a fine-tuned BART model or an evaluation file.
        continue
    if (emo_filter == 'bart'):
        # The pretrained base model is evaluated on two filtered datasets.
#       genPredictions('bart', 'nil')
        genPredictions('bart', 'emo')
        genPredictions('bart', 'emo_ge')
#       genPredictions('nil', 'emo_ge')
    else:
        # A fine-tuned model is evaluated on the dataset with the same filter.
        genPredictions(emo_filter, emo_filter)

In [None]:
# Generate predictions with fine-tuned GPT2 models 
import pandas as pd
import os
from transformers import pipeline, AutoModelWithLMHead, AutoTokenizer

def clean_paraphrased(prefix, input_text):
    """Generate one paraphrase with the global ``generator`` pipeline.

    Builds the prompt ``<s>{prefix}: {input_text}</s>>>>><p>``, requests two
    sequences including the prompt text, and returns the part of the first
    sequence that lies between the prompt separator and the first ``</p>``.

    Args:
        prefix: Task prefix placed before the input text.
        input_text: Text to paraphrase.

    Returns:
        The extracted paraphrase string.
    """
    separator = '</s>>>>><p>'
    prompt = '<s>'+ prefix +': ' + input_text + separator
    candidates = generator(prompt, max_length=100, num_return_sequences=2, return_full_text=True)
    # Only the first of the two returned candidates is used.
    first_text = candidates[0]['generated_text']
    after_prompt = first_text.split(separator)[1]
    return after_prompt.split('</p>')[0]

def genGPT(model_emo_filter, eval_emo_filter):
    """Generate GPT predictions for one evaluation file and save them as a TSV.

    Reads
    ``emotion-labeled-data/{model_dataset}/{testing_set}-{eval_emo_filter}-{eval_set}_t5.tsv``,
    calls ``clean_paraphrased(prefix, input_text)`` for every row and writes
    the frame, with an added ``predictions`` column, to
    ``predictions-data/{model_name}/{testing_set}-{eval_emo_filter}-predictions.tsv``.

    The model itself is not loaded here: ``clean_paraphrased`` uses the
    notebook-level ``generator`` pipeline, which must be set up by the caller.
    Also relies on the notebook-level variables ``training_type``,
    ``model_dataset``, ``epochs``, ``testing_set`` and ``eval_set``.

    Args:
        model_emo_filter: ``'gpt'`` gives the output directory name ``gpt``.
            Any other value is inserted into
            ``{training_type}-{model_dataset}-{model_emo_filter}-{epochs}epochs-gpt``.
        eval_emo_filter: Filter key that selects the evaluation TSV and names
            the output file.

    Returns:
        None. The result is written to disk.
    """
    if model_emo_filter == 'gpt':
        model_name = "gpt"
    else:
        model_name = f"{training_type}-{model_dataset}-{model_emo_filter}-{epochs}epochs-gpt"
    print(f'---- Generating Predictions for Model: {model_name}  on Dataset: {testing_set}-{eval_emo_filter}----')
    df = pd.read_csv(f'emotion-labeled-data/{model_dataset}/{testing_set}-{eval_emo_filter}-{eval_set}_t5.tsv', encoding='utf-8', sep="\t")

    predictions = []

    for index, row in df.iterrows():
        # Progress marker every 1000 rows.
        if index % 1000 == 0:
            print('Row: ' + str(index))

        # Cells that pandas parsed as non-strings (empty -> NaN, numeric text)
        # would break the string concatenation in clean_paraphrased; coerce
        # them the same way the T5/BART cells do with astype(str).
        prefix = str(row.prefix)
        input_text = str(row.input_text)
        predictions.append(clean_paraphrased(prefix, input_text))

    df["predictions"] = predictions
    # The saved index column is absent when the TSV was written without an
    # index; do not lose the finished predictions over it.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    # makedirs also creates a missing 'predictions-data' parent directory and
    # tolerates an already existing target.
    path = f'predictions-data/{model_name}'
    os.makedirs(path, exist_ok=True)

    df.to_csv(f'predictions-data/{model_name}/{testing_set}-{eval_emo_filter}-predictions.tsv', sep="\t")
    
# Run GPT generation for every selected key.
for emo_filter in emo_filter_list:
    if emo_filter in ('t5', 'bart'):
        # These keys select the T5 / BART base models handled by other cells;
        # they do not name a fine-tuned GPT model or an evaluation file.
        continue

    if (emo_filter == 'gpt'):
        # The pretrained base model is evaluated on two filtered datasets.
        checkpoint = "gpt2"
        eval_filters = ['emo', 'emo_ge']
    else:
        # A fine-tuned model is evaluated on the dataset with the same filter.
        checkpoint = f"{training_type}-{model_dataset}-{emo_filter}-{epochs}epochs-gpt"
        eval_filters = [emo_filter]

    model = AutoModelWithLMHead.from_pretrained(checkpoint)
    # The stock "gpt2" tokenizer is used for every checkpoint.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # `generator` is read as a global by clean_paraphrased.
    # NOTE(review): device=0 assumes a CUDA GPU is available - confirm.
    generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer, device=0)
    for eval_filter in eval_filters:
        genGPT(emo_filter, eval_filter)