In [None]:
!pip install simpletransformers

In [None]:
import torch

# Pick the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A notebook cell only echoes its final expression, so the device and the
# memory figure are printed explicitly instead of being left as bare
# expressions whose values would be silently discarded.
print(f"Using device: {device}")
# memory_allocated() is a CUDA query, so report 0 when running on the CPU.
allocated_bytes = torch.cuda.memory_allocated(device) if device.type == "cuda" else 0
print(str(allocated_bytes / 1000000) + 'M')

# Use the 'file_system' strategy for sharing tensors between processes.
torch.multiprocessing.set_sharing_strategy('file_system')

In [None]:
# Interactive run configuration; the training cells below read these names.
dataset_name = input('Dataset to train model (google, msr, quora, mix, twit0.825 or combined):  ')
training_type = input('Regular or limited training (regular or limited): ')
epochs = int(input('Number of training epochs: '))

# 'regular' fine-tunes on the training split and evaluates on the testing split;
# 'limited' swaps the two splits.
split_choice = training_type.lower()
if split_choice == 'regular':
    train_set, eval_set = 'training', 'testing'
elif split_choice == 'limited':
    train_set, eval_set = 'testing', 'training'
else:
    # Fail here rather than just printing: continuing would leave train_set and
    # eval_set undefined (or stale from a previous run) and the training cells
    # would break later with a far less helpful error.
    raise ValueError(
        f'Please enter a valid training type (regular or limited), got {training_type!r}'
    )
    
# Data variants the training loops iterate over. Each name is substituted into
# the TSV file names and the output directory names used by the cells below.
# Uncomment an entry to include that variant in the run.
emo_filter_list = [
    't5',
    # 'bart',
    # 'gpt',
    # 'nil',
    # 'sid',
    # 'sid_rg',
    'emo',
    'emo_ge',
    # 'emo_nn',
    # 'emo_sid',
    # 'emo_sid_nn',
    # 'emo_sid_tg',
    # 'emo_sid_tg_nn',
    # 'emo_sid_tg_ge',
    # 'emo_sid_tg_nn_ge',
]

In [None]:
# Fine-tune T5 on the dataset, training type, and number of epochs specified above
import pandas as pd
from simpletransformers.t5 import T5Model

def trainModels(emo_filter):
    """Fine-tune ``t5-base`` on one variant of the selected dataset.

    Relies on the notebook-level names ``dataset_name``, ``train_set``,
    ``eval_set``, ``training_type`` and ``epochs``.

    Args:
        emo_filter: Name of the data variant; it appears in both TSV file
            names and in the output directory name.

    Returns:
        None. Results go to the directory configured as ``output_dir``.
    """
    # Every column is cast to str before being handed to the model.
    train_frame = pd.read_csv(
        f'emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-{train_set}_t5.tsv',
        sep="\t",
    ).astype(str)
    eval_frame = pd.read_csv(
        f'emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-{eval_set}_t5.tsv',
        sep="\t",
    ).astype(str)

    t5_args = dict(
        # Sequence length, batch sizes and training length
        max_seq_length=196,
        train_batch_size=6,
        eval_batch_size=32,
        num_train_epochs=epochs,
        # Evaluation while training
        evaluate_during_training=True,
        evaluate_during_training_steps=15000,
        evaluate_during_training_verbose=True,
        # Runtime switches
        use_multiprocessing=False,
        fp16=False,
        # Checkpoint saving options are all switched off
        save_steps=-1,
        save_eval_checkpoints=False,
        save_model_every_epoch=False,
        # Rebuild features on each run and allow reuse of the output directory
        reprocess_input_data=True,
        overwrite_output_dir=True,
        # One directory per (training type, dataset, variant, epoch count)
        output_dir=f"{training_type}-{dataset_name}-{emo_filter}-{epochs}epochs",
    )

    t5_model = T5Model("t5", "t5-base", args=t5_args)
    t5_model.train_model(train_frame, eval_data=eval_frame)
    
    
# Walk through the selected variants; the 't5' entry stands for the stock
# pretrained model, so it is only announced and nothing is fine-tuned for it.
for emo_filter in emo_filter_list:
    print("Training on " + emo_filter)
    if emo_filter != 't5':
        trainModels(emo_filter)
        continue
    print('T5_Base Pretrained.')

In [None]:
# Fine-tune BART on the dataset, training type, and number of epochs specified above
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

def trainModels(emo_filter):
    """Fine-tune ``facebook/bart-base`` on one variant of the selected dataset.

    Relies on the notebook-level names ``dataset_name``, ``train_set``,
    ``eval_set``, ``training_type`` and ``epochs``. The data comes from the
    same ``*_t5.tsv`` files that the T5 cell reads.

    Note: this definition reuses the name of the T5 training function defined
    in an earlier cell, so running this cell replaces that function.

    Args:
        emo_filter: Name of the data variant; it appears in both TSV file
            names and in the output directory name.

    Returns:
        None. Results go to the directory configured as ``output_dir``.
    """
    # Every column is cast to str before being handed to the model.
    train_frame = pd.read_csv(
        f'emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-{train_set}_t5.tsv',
        sep="\t",
    ).astype(str)
    eval_frame = pd.read_csv(
        f'emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-{eval_set}_t5.tsv',
        sep="\t",
    ).astype(str)

    bart_args = dict(
        # Sequence length, batch sizes and training length
        max_seq_length=196,
        train_batch_size=4,
        eval_batch_size=32,
        num_train_epochs=epochs,
        # Evaluation while training
        evaluate_during_training=True,
        evaluate_during_training_steps=15000,
        evaluate_during_training_verbose=True,
        # Runtime switches
        use_multiprocessing=False,
        fp16=False,
        # Checkpoint saving options are all switched off
        save_steps=-1,
        save_eval_checkpoints=False,
        save_model_every_epoch=False,
        # Rebuild features on each run and allow reuse of the output directory
        reprocess_input_data=True,
        overwrite_output_dir=True,
        # One directory per run, marked with a "-bart" suffix
        output_dir=f"{training_type}-{dataset_name}-{emo_filter}-{epochs}epochs-bart",
    )

    # The checkpoint being fine-tuned is bart-base.
    bart_model = Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="facebook/bart-base",
        args=bart_args,
    )
    bart_model.train_model(train_frame, eval_data=eval_frame)
    
# Walk through the selected variants; a 'bart' entry stands for the stock
# pretrained model, so it is only announced and nothing is fine-tuned for it.
for emo_filter in emo_filter_list:
    if emo_filter == 'bart':
        print('Bart_Base Pretrained.')
        continue
    print('Training Bart Base on ' + emo_filter)
    trainModels(emo_filter)

In [None]:
# Fine-tuning GPT2 by the specified data set, training type, and number of epochs specified above
import pandas as pd
#from simpletransformers.language_modeling import LanguageModelingModel, LanguageModelingArgs
from transformers import (
    AutoModelWithLMHead,
    AutoConfig,
    Trainer,
    AutoTokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    TrainingArguments
)

import os
# Set WANDB_DISABLED for this process before any training starts.
# NOTE(review): presumably read by the transformers Weights & Biases integration
# to switch off experiment logging — confirm against the installed version.
os.environ["WANDB_DISABLED"] = "true"

def convertGPTinput(emo_filter):
    """Convert a T5-style training TSV into a plain-text file for GPT-2 training.

    Reads ``emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-training_t5.tsv``
    and writes one line per example to the sibling ``...-training_gpt.txt`` file
    in the form ``<s>prefix: input_text</s>>>>><p>target_text</p>``.

    The lines are written directly instead of through ``to_csv``: the CSV
    writer put the column header ``combined`` into the training text and
    applied CSV quoting to examples containing special characters.

    Rows missing any of the three fields are skipped, since no complete
    example can be built from them.

    NOTE(review): the ``training`` split is always used here, regardless of
    the notebook-level ``train_set`` choice — confirm that is intended.

    Args:
        emo_filter: Name of the data variant; it is part of both file names.

    Returns:
        None. The text file is written as a side effect.
    """
    base_path = f'emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-training'
    df = pd.read_csv(f'{base_path}_t5.tsv', encoding='utf-8', sep='\t')[['prefix', 'input_text', 'target_text']]

    # Drop incomplete rows, then force str so '+' concatenates even when a
    # column was parsed as a numeric dtype.
    df = df.dropna().astype(str)
    combined = '<s>' + df.prefix + ': ' + df.input_text + '</s>' + '>>>>' + '<p>' + df.target_text + '</p>'

    with open(f'{base_path}_gpt.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines(line + '\n' for line in combined)

def trainGPT(emo_filter, text_path, epochs, model='gpt2', batch_size=6, cache_dir='cache'):
    """Fine-tune a language model on a plain-text file with the transformers Trainer.

    Relies on the notebook-level names ``training_type`` and ``dataset_name``
    for the output directory name.

    Args:
        emo_filter: Name of the data variant; used in the output directory name.
        text_path: Path of the text file to train on.
        epochs: Number of training epochs; also part of the output directory name.
        model: Name or path handed to ``AutoModelWithLMHead.from_pretrained``.
        batch_size: Per-device training batch size.
        cache_dir: Accepted but not used anywhere in this function.

    Returns:
        None. The trained model is saved through ``trainer.save_model()``.
    """
    lm_network = AutoModelWithLMHead.from_pretrained(model)
    # The tokenizer is always the stock 'gpt2' one, whatever `model` names.
    gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
    # mlm=False: the collator is not set up for masked-language-model training.
    lm_collator = DataCollatorForLanguageModeling(tokenizer=gpt2_tokenizer, mlm=False)
    text_dataset = TextDataset(tokenizer=gpt2_tokenizer, file_path=text_path, block_size=256)

    trainer_settings = {
        "output_dir": f"{training_type}-{dataset_name}-{emo_filter}-{epochs}epochs-gpt",
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "warmup_steps": 500,
        "save_steps": 2000,
        "logging_steps": 500,
    }

    trainer = Trainer(
        model=lm_network,
        args=TrainingArguments(**trainer_settings),
        data_collator=lm_collator,
        train_dataset=text_dataset,
    )
    trainer.train()
    trainer.save_model()

        
# Walk through the selected variants; a 'gpt' entry stands for the stock
# pretrained model, so it is only announced and nothing is fine-tuned for it.
for emo_filter in emo_filter_list:
    if (emo_filter == 'gpt'):
        print('GPT2 Pretrained.')
    else:
        print('Training GPT2 on ' + emo_filter)
        # Build the plain-text training file from the T5-style TSV first.
        convertGPTinput(emo_filter)
        trainGPT(
            emo_filter=emo_filter,
            text_path=f'emotion-labeled-data/{dataset_name}/{dataset_name}-{emo_filter}-training_gpt.txt',
            # Use the epoch count entered in the configuration cell rather than a
            # fixed 3, so GPT-2 follows the same setting as the T5 and BART runs
            # and the output directory name reflects the value that was asked for.
            epochs=epochs
        )