In [6]:
from typing import List, Tuple

import pandas as pd
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [29]:
def prepareData(csv_filepath: str, verbose: bool = False) -> Tuple[List[str], List[str], List[int]]:
    """
    Loads paired texts and their labels from a csv file.

    Columns are read by position, not by header name: column 0 is the first
    text, column 1 is the second text and column 2 is the label.

    args:
        csv_filepath: str - the filepath of the csv file to load our authorship verification data from
        verbose: bool - when True, prints how many rows were loaded
    returns:
        first_texts: List[str] - values of the first csv column (expected to be strings)
        second_texts: List[str] - values of the second csv column (expected to be strings)
        labels: List[int] - values of the third csv column, one scalar per row; expected to be
            an integer flag for whether both texts come from the same author (a single value,
            not a one-hot vector) - TODO confirm against the dataset
    raises:
        IndexError - if the csv has fewer than three columns
    """
    # Load the dataframe
    df = pd.read_csv(csv_filepath)

    # Take each column in a single step instead of indexing cell by cell.
    # .tolist() also converts numpy scalars (e.g. numpy.int64) to native Python
    # values, so the labels really are plain ints as the annotation states.
    first_texts = df.iloc[:, 0].tolist()
    second_texts = df.iloc[:, 1].tolist()
    labels = df.iloc[:, 2].tolist()

    # Sanity check: every row must contribute exactly one entry to each list
    assert len(first_texts) == len(second_texts) == len(labels)
    if verbose:
        print("Prepared", len(df), "data points.")

    return first_texts, second_texts, labels

In [32]:
# Smoke-test the loader on the dev split with the row-count message enabled.
# NOTE(review): as the cell's last expression, the whole returned tuple is echoed as output.
prepareData(csv_filepath="../data/dev.csv", verbose=True)

Prepared 6000 data points.


(['Carol, Congratulations. Vince Carol Coats 10/12/2000 04:56 PM To: Vince J Kaminski/HOU/ECT@ECT cc: Subject: Re: Datren Williams Acceptance What a kind and thoughtful man you always are, Vince....I have always appreciated that in you! I just moved to another position this week - with Sheila Knudsen in ENA Compensation, but I will always remember how wonderful you are with the Associates and Analysts in the Program, and will miss having the opportunity to touch base with you occasionally. Thank you for being the special man you are! Carol P.S. I doubt that you know who I am, but I had worked with the program for two years, having been hired by Mike Smalling in October 1998. When I first moved to Houston',
  "This film has only ever been shown once in my neck of the woods and on a minor Sicilian TV channel at that so , despite its negative reputation , I've always wanted to see it . After all , it does have Boris Karloff playing the Baron for once ? even if , for some strange reason , 

In [9]:
# Fetch the tokenizer that ships with the pretrained "google-t5/t5-small" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")

# Encode a sample sentence containing the <extra_id_0> / <extra_id_1> placeholders,
# asking for PyTorch tensors, and keep only the token-id tensor from the encoding.
input_ids = tokenizer(
    "The <extra_id_0> walks in <extra_id_1> park",
    return_tensors="pt",
)["input_ids"]

# Show the ids and confirm the container type that came back.
print(input_ids)
print(type(input_ids))


tensor([[   37, 32099, 10681,    16, 32098,  2447,     1]])
<class 'torch.Tensor'>
