In [1]:
# Installing simpletransformers here 
# because of conflicting transformer version needed by GoEmotions
!pip install simpletransformers





[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Selects the compute device (CUDA GPU when available, otherwise CPU) and
# reports it together with the GPU memory PyTorch currently has allocated.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Printed explicitly: a notebook cell only displays its *last* expression, so bare
# `device` / `str(...)` expressions in the middle of the cell would be discarded.
print(f"Using device: {device}")
if device.type == "cuda":
    # Reported in units of 1,000,000 bytes, matching the original 'M' suffix.
    print("GPU memory allocated: " + str(torch.cuda.memory_allocated(device) / 1000000) + "M")

# Share tensors between processes through the file system.
# NOTE(review): presumably chosen to avoid open-file-descriptor limits — confirm.
torch.multiprocessing.set_sharing_strategy('file_system')

In [3]:
# Allows user input for selecting the model used for paraphrase generation based on:
# 1. Data set used to train the model
# 2. Training type of the model
# 3. Epochs used to train the model
# Allows user to specify the evaluation data set
model_dataset = input('Dataset used to train model (mix, twit0.825, or combined): ')
training_type = input('Training type of model (regular or limited): ')
epochs = int(input('Number of epochs trained on model: '))  # raises ValueError on non-integer input
# Name of the model; later cells pass it to the model loader and use it as the output sub-directory
model_name = f"{training_type}-{model_dataset}-{epochs}epochs"

testing_set = input('Dataset for prediction generation (mix, twit0.825, or combined): ')

# Maps the training type to the split the model was trained on (train_set)
# and the opposite split that predictions are generated for (eval_set).
if training_type.lower() == 'regular':
    train_set = 'training'
    eval_set = 'testing'
elif training_type.lower() == 'limited':
    train_set = 'testing'
    eval_set = 'training'
else:
    # Fail loudly: merely printing a message would leave train_set/eval_set undefined
    # (or stale from a previous run), making a later cell fail or use the wrong split.
    raise ValueError(
        f"Please enter a valid training type ('regular' or 'limited'), got {training_type!r}"
    )

Dataset used to train model (mix, twit0.825, or combined): mix
Training type of model (regular or limited): regular
Number of epochs trained on model: 3
Dataset for prediction generation (mix, twit0.825, or combined): mix


In [None]:
# Prediction generation using the selected T5 model and evaluation set
# Type of evaluation set (training or test) is the opposite of the type used to train the selected model
import os

import pandas as pd
import torch
from simpletransformers.t5 import T5Model

print(f'---- Generating Predictions for Model: {model_name}  on Dataset: {testing_set}----')

# General parameters for prediction generation
# (sampling-based decoding: top-k / top-p, three candidates per input)
model_args = {
    "overwrite_output_dir": True,
    "max_seq_length": 196,
    "eval_batch_size": 32,
    "num_train_epochs": 1,
    "use_multiprocessing": False,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

# Imports model and evaluation data set from current directory.
# use_cuda defaults to True in simpletransformers and raises when no GPU is present,
# so it is set from the actual CUDA availability to keep the cell runnable on CPU.
model = T5Model("t5", model_name, args=model_args, use_cuda=torch.cuda.is_available())
# NOTE(review): the directory comes from model_dataset while the file prefix comes from
# testing_set — confirm this layout is intended when the two differ.
df = pd.read_csv(f'emotion-labeled-data/{model_dataset}/{testing_set}-{eval_set}_t5.tsv', sep="\t").astype(str)

# Generates inputs used for predictions in the "<prefix>: <input_text>" format
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df["prefix"].tolist(), df["input_text"].tolist())
]

# Generates the predictions and adds them to the dataframe.
# Only the first of the returned candidates per input is kept.
preds = model.predict(to_predict)
preds = [pred[0] for pred in preds]
df["predictions"] = preds
# Drops the leftover index column if present; errors='ignore' avoids a KeyError
# when the input file was written without one.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Ensures the directory exists for saving the data set.
# makedirs also creates the parent 'predictions-data' directory when it is missing
# and does not fail if the directory already exists.
path = f'predictions-data/{model_name}'
os.makedirs(path, exist_ok=True)

# Saves the file with predictions to a new directory for the scoring stage
df.to_csv(f'predictions-data/{model_name}/{testing_set}-predictions.tsv', sep="\t")

In [4]:
# Prediction generation using the selected BART model and evaluation set
# Type of evaluation set (training or test) is the opposite of the type used to train the selected model
import os

import pandas as pd
import torch
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

# The BART model's name is the base model name with a "-bart" suffix.
# Guarded so that re-running this cell does not keep appending the suffix
# (which would otherwise produce "...-bart-bart").
if not model_name.endswith("-bart"):
    model_name = model_name + "-bart"

print(f'---- Generating Predictions for Model: {model_name}  on Dataset: {testing_set}----')

# General parameters for prediction generation
# (sampling-based decoding: top-k / top-p, three candidates per input)
model_args = {
    "overwrite_output_dir": True,
    "max_seq_length": 196,
    "eval_batch_size": 32,
    "num_train_epochs": 1,
    "use_multiprocessing": False,
    "num_beams": None,
    "do_sample": True,
    "max_length": 50,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 3,
}

# Imports model and evaluation data set from current directory.
# use_cuda defaults to True in simpletransformers and raises when no GPU is present,
# so it is set from the actual CUDA availability to keep the cell runnable on CPU.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name=model_name,
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)
# The same "_t5.tsv" evaluation file is read here as in the T5 cell.
df = pd.read_csv(f'emotion-labeled-data/{model_dataset}/{testing_set}-{eval_set}_t5.tsv', sep="\t").astype(str)

# Generates inputs used for predictions in the "<prefix>: <input_text>" format
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(df["prefix"].tolist(), df["input_text"].tolist())
]

# Generates the predictions and adds them to the dataframe.
# Only the first of the returned candidates per input is kept.
preds = model.predict(to_predict)
preds = [pred[0] for pred in preds]
df["predictions"] = preds
# Drops the leftover index column if present; errors='ignore' avoids a KeyError
# when the input file was written without one.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Ensures the directory exists for saving the data set.
# makedirs also creates the parent 'predictions-data' directory when it is missing
# and does not fail if the directory already exists.
path = f'predictions-data/{model_name}'
os.makedirs(path, exist_ok=True)

# Saves the file with predictions to a new directory for the scoring stage
df.to_csv(f'predictions-data/{model_name}/{testing_set}-predictions.tsv', sep="\t")

---- Generating Predictions for Model: regular-mix-3epochs-bart  on Dataset: mix----


Generating outputs:   0%|          | 0/82 [00:00<?, ?it/s]