In [None]:
from datasets import load_dataset
# Load the SetFit/sst2 dataset and keep only the split that will be preprocessed.
# Swap the key for 'train' or 'test' to preprocess a different split.
SST = load_dataset("SetFit/sst2")['validation']

In [None]:
# Character vocabulary: four special tokens (ids 0-3) followed by the supported
# characters in a fixed order. The position in this list IS the token id, so the
# order must not change.
# NOTE(review): '<', '>' and uppercase letters are not part of the vocabulary;
# looking them up in char_to_id raises KeyError -- confirm this is intended.
id_to_char = ['<PAD>', '<CLS>', '<SEP>', '<MASK>'] + list(
    " !\"#$%&'()*+,-./"            # space and punctuation
    "0123456789"                   # digits
    ":;=?@"                        # punctuation
    "[\\]^_`"                      # punctuation
    "abcdefghijklmnopqrstuvwxyz"   # lowercase letters
    "{|}~"                         # punctuation
)
# Reverse lookup: token string -> id.
char_to_id = dict(zip(id_to_char, range(len(id_to_char))))

In [None]:
# Replacement table used by refine_sentence: every occurrence of rep[i] is replaced
# by tok[i], in list order. Order matters: multi-character sequences such as 'ã§'
# or 'mollã' are listed before the bare 'ã' so they are rewritten first.
rep = ['-lrb-', '-rrb-', 'ã§', 'ã¯', 'ã£', 'ã¨', 'ã»', 'ã¶', 'ã±', 'ã¢', 'ã-', 'ã¡', 'ã¦', 'ã³', 'ã©', 'ã¼', 'ü', 'û', 'ñ', 'ó', 'ô', 'ö', 'í', 'ï', 'mollã', 'jirã', 'ã', '\xad', '¼', '³', '¡', '¦', '\xa0', '¢', 'ç', '´', 'à', 'á', 'â', 'é', 'è', 'æ' ]
tok = ['('    , ')'    , 'c' , 'i' , 'a' , 'e' , 'u' , 'o' , 'n' , 'a' , 'i' , 'a' , 'ae', 'o' , 'e' , 'u' , 'u', 'u', 'n', 'o', 'o', 'o', 'i', 'i', 'molla', 'jiri', 'a', ''    , '' , '' , '' , '' , ''    , 'c', 'c', '' , 'a', 'a', 'a', 'e', 'e', 'ae']
assert(len(rep)==len(tok))

def refine_sentence(sent):
    """Normalize a raw sentence before character-level encoding.

    The sentence is lowercased, a single trailing newline (if any) is removed,
    and every pattern in `rep` is replaced by its counterpart in `tok`
    (e.g. '-lrb-' -> '(', '-rrb-' -> ')').

    Args:
        sent: The raw sentence; may be an empty string.

    Returns:
        The normalized sentence.
    """
    sent = sent.lower()
    # Drop one trailing newline; endswith() is safe on an empty string,
    # whereas indexing sent[-1] would raise IndexError.
    if sent.endswith('\n'):
        sent = sent[:-1]
    # Apply the replacements sequentially, in table order.
    for f, t in zip(rep, tok):
        sent = sent.replace(f, t)
    return sent

In [None]:
# Encode every sentence of the selected split as a fixed-length sequence of
# character ids: <CLS> + characters + <SEP> + <PAD>... (len_limit positions).
input_ids = [] # Ids for each character
encoder_mask = [] # 0 for real tokens (incl. <CLS>/<SEP>), 1 for <PAD> positions
word_idx = [] # Per-position word number (1-based); 0 for spaces, -1 for <CLS> and padding
num_words = [] # Number of words (number of spaces + 1)
num_chars = [] # Number of characters, excluding <CLS> and <SEP>
labels = []
len_limit = 256
for text, label in zip(SST['text'], SST['label']):
    text = refine_sentence(text)

    d = [char_to_id['<CLS>']] # for input_ids
    w = [-1] # for word_idx, -1 for <CLS>
    n = 1 # for num_words
    for c in text:
        # Characters outside the vocabulary raise KeyError here.
        d.append(char_to_id[c])
        if c==' ':
            w.append(0)
            n+=1
        else:
            w.append(n)

    len_d = len(d)-1 # Length except <CLS> and <SEP>
    d.append(char_to_id['<SEP>'])
    # NOTE(review): the slot aligned with <SEP> carries the last word number n,
    # not -1. This matches the data the original code produced for every
    # sentence shorter than the limit -- confirm it is what the consumer expects.
    w.append(n)
    if len(d)<=len_limit: # Only add sentences with acceptable length
        pad = len_limit-len(d)
        mask = [0]*len(d) + [1]*pad # Create mask
        d += [char_to_id['<PAD>']]*pad # PAD current sentence
        # Pad word_idx by the same amount as input_ids. Previously word_idx got
        # one extra entry, so a sentence exactly filling len_limit produced a
        # row of len_limit+1 entries.
        w += [-1]*pad
        assert len(d) == len(w) == len(mask) == len_limit

        # Append all to dataset
        input_ids.append(d)
        encoder_mask.append(mask)
        word_idx.append(w)
        num_words.append(n)
        num_chars.append(len_d)
        labels.append(label)

In [None]:
# Number of sentences kept. Sentences whose encoded length (including <CLS> and
# <SEP>) exceeds len_limit are dropped, so this can be smaller than len(SST).
print(len(input_ids))

In [None]:
# Save dataset
import pandas as pd
# Collect the encoded columns; every list has one entry per kept sentence.
dataset = {"input_ids":input_ids, "encoder_mask":encoder_mask, "label":labels, "word_idx":word_idx, "num_words":num_words, "num_chars":num_chars}
# DataFrame.to_pickle() returns None, so build the frame first; otherwise
# `dataframe` would be None and unusable for inspection afterwards.
dataframe = pd.DataFrame(dataset)
dataframe.to_pickle("-your path-")

In [None]:
# samples: print a few source sentences next to their decoded encodings.
n_show = min(10, len(input_ids)) # do not index past the kept rows
texts, gold = SST['text'], SST['label'] # read the columns once, not per iteration
# input_ids only holds sentences that passed the length filter, so row i of the
# encoded data is not necessarily row i of SST. Recover the source row of each
# kept sentence with the same criterion the encoding loop uses
# (characters + <CLS> + <SEP> must fit into len_limit).
kept_rows = (k for k, t in enumerate(texts) if len(refine_sentence(t)) + 2 <= len_limit)
cls_id, sep_id = char_to_id['<CLS>'], char_to_id['<SEP>']
for i, k in zip(range(n_show), kept_rows):
    print(texts[k])
    for j in input_ids[i]:
        if j!=cls_id: print(id_to_char[j], end='') # skip <CLS>; <SEP> is still printed
        if j==sep_id: break # stop before the padding
    # Number of <PAD> positions, encoded label, source label (the two should match).
    print('\n',sum(encoder_mask[i]), labels[i], gold[k])
    print()