# Grammarly Dataset Tokenizer

## Load Data

In [4]:
#Install
!pip install sentencepiece



In [0]:
#Packages
import pandas as pd
import sentencepiece as spm
import numpy as np
import torch
import pickle

In [5]:
# Make the Google Drive contents reachable from the Colab file system
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
# Project folder on the mounted Drive. The path is relative, so it resolves
# against the current working directory of the kernel.
root_path = 'drive/My Drive/Colab_Notebooks/deeplearning/Sentence_VAE/'
#root_path = 'drive/My Drive/Sentence_VAE/'

# --- Family/Relationships (FR) domain ---
# train: parallel formal / informal files
FR_train_formal_file = f'{root_path}GYAFC_Corpus/Family_Relationships/train/formal'
FR_train_informal_file = f'{root_path}GYAFC_Corpus/Family_Relationships/train/informal'
# test: raw source files plus their first reference file (.ref0)
FR_test_raw_formal_file = f'{root_path}GYAFC_Corpus/Family_Relationships/test/formal'
FR_test_raw_informal_file = f'{root_path}GYAFC_Corpus/Family_Relationships/test/informal'
FR_test_ref0_formal_file = f'{root_path}GYAFC_Corpus/Family_Relationships/test/formal.ref0'
FR_test_ref0_informal_file = f'{root_path}GYAFC_Corpus/Family_Relationships/test/informal.ref0'

# --- Entertainment/Music (EM) domain, same layout ---
EM_train_formal_file = f'{root_path}GYAFC_Corpus/Entertainment_Music/train/formal'
EM_train_informal_file = f'{root_path}GYAFC_Corpus/Entertainment_Music/train/informal'
EM_test_raw_formal_file = f'{root_path}GYAFC_Corpus/Entertainment_Music/test/formal'
EM_test_raw_informal_file = f'{root_path}GYAFC_Corpus/Entertainment_Music/test/informal'
EM_test_ref0_formal_file = f'{root_path}GYAFC_Corpus/Entertainment_Music/test/formal.ref0'
EM_test_ref0_informal_file = f'{root_path}GYAFC_Corpus/Entertainment_Music/test/informal.ref0'

In [0]:
# Open Files
def _read_lines(path):
    """Return the lines of the text file at `path`, trailing whitespace stripped."""
    # Decode explicitly as UTF-8: the corpus contains non-ASCII characters
    # (e.g. the cent sign), and without an explicit encoding open() falls back
    # to the locale default, which differs between machines.
    with open(path, encoding='utf-8') as f:
        return [line.rstrip() for line in f]


# Family/Relationships domain
FR_formal = _read_lines(FR_train_formal_file)
FR_informal = _read_lines(FR_train_informal_file)
FR_test_raw_formal = _read_lines(FR_test_raw_formal_file)
FR_test_raw_informal = _read_lines(FR_test_raw_informal_file)
FR_test_ref0_formal = _read_lines(FR_test_ref0_formal_file)
FR_test_ref0_informal = _read_lines(FR_test_ref0_informal_file)

# Entertainment/Music domain
EM_formal = _read_lines(EM_train_formal_file)
EM_informal = _read_lines(EM_train_informal_file)
EM_test_raw_formal = _read_lines(EM_test_raw_formal_file)
EM_test_raw_informal = _read_lines(EM_test_raw_informal_file)
EM_test_ref0_formal = _read_lines(EM_test_ref0_formal_file)
EM_test_ref0_informal = _read_lines(EM_test_ref0_informal_file)

In [0]:
##Create full dataset
#Train
# Both domains are concatenated in the same order, so index i of the formal
# list still corresponds to index i of the informal list.
full_train_formal = FR_formal + EM_formal
full_train_informal = FR_informal + EM_informal

#Test
# Keep the two test lists index-aligned, using the same pairing as the
# per-domain preprocessing cells: raw formal <-> informal.ref0 and
# formal.ref0 <-> raw informal. Concatenating raw with raw would put unrelated
# sentences at the same index.
full_test_formal = FR_test_raw_formal + FR_test_ref0_formal + EM_test_raw_formal + EM_test_ref0_formal
full_test_informal = FR_test_ref0_informal + FR_test_raw_informal + EM_test_ref0_informal + EM_test_raw_informal

In [9]:
# Number of sentences in each combined split, one value per line
# (train formal, train informal, test formal, test informal)
for _split in (full_train_formal, full_train_informal, full_test_formal, full_test_informal):
    print(len(_split))

104562
104562
4849
4849


## Preprocess Data

## Iterative explanation of processing

In [0]:
# Vocabulary size setting.
# NOTE(review): no visible cell reads this constant, and the SentencePiece model
# files loaded in this notebook are named *32000 / *16000 - confirm it is still needed.
VOCAB_SIZE = 20_000

In [0]:
# The SentencePiece model itself is created elsewhere (see the local notebook);
# this cell only loads the trained model file from the project folder.
sp = spm.SentencePieceProcessor()
sp.Load(f'{root_path}fulltraintest32000.model')

True

In [0]:
# Show the ids the loaded model reports for its special tokens
for _label, _lookup in (
    ('pad_id =', sp.pad_id),
    ('unk_id =', sp.unk_id),
    ('bos_id =', sp.bos_id),
    ('eos_id =', sp.eos_id),
):
    print(_label, _lookup())

pad_id = 0
unk_id = 1
bos_id = 2
eos_id = 3


In [0]:
#Tokenize
def _as_object_array(sequences):
    """Pack variable-length token-id lists into a 1-D object array, one list per slot."""
    # np.array(list_of_lists) on ragged input relied on implicit object-array
    # creation, which NumPy >= 1.24 rejects with a ValueError; and when all
    # lists happen to share a length it silently builds a 2-D integer array.
    # Filling a pre-allocated object array gives a 1-D array in both cases.
    packed = np.empty(len(sequences), dtype=object)
    for idx, seq in enumerate(sequences):
        packed[idx] = seq
    return packed


# One list of token ids per sentence; kept as arrays so they can be filtered
# with boolean masks later on.
tokenized_formal = _as_object_array([sp.encode_as_ids(line) for line in full_train_formal])
tokenized_informal = _as_object_array([sp.encode_as_ids(line) for line in full_train_informal])

#How to decode #sp.decode_ids(tokenized_formal[idx])

### Cutting sentence lengths

In [0]:
# Token count of every sentence, kept as arrays so they can be compared
# element-wise when filtering by length
tokenized_lens_formal = np.array(list(map(len, tokenized_formal)))
tokenized_lens_informal = np.array(list(map(len, tokenized_informal)))

# Report the longest sequence on each side
print(f"Longest formal sequence: {tokenized_lens_formal.max()}")
print(f"Longest informal sequence: {tokenized_lens_informal.max()}")

Longest formal sequence: 68
Longest informal sequence: 254


In [0]:
# Keep only sentence pairs where BOTH sides have at most SENTENCE_CUT_LEN tokens
SENTENCE_CUT_LEN = 10

# A pair survives only if its formal and its informal length are within the limit
_within_limit = np.logical_and(
    tokenized_lens_formal <= SENTENCE_CUT_LEN,
    tokenized_lens_informal <= SENTENCE_CUT_LEN,
)

# Apply the same mask to the token ids and to the raw sentences so they stay aligned
sub_tokenized_formal = tokenized_formal[_within_limit]
sub_tokenized_informal = tokenized_informal[_within_limit]
sub_formal = np.array(full_train_formal)[_within_limit]
sub_informal = np.array(full_train_informal)[_within_limit]

print('Formal data subset size :', sub_tokenized_formal.size)
print('Informal data subset size:', sub_tokenized_informal.size)

Formal data subset size : 27640
Informal data subset size: 27640


## Padding dataset

In [0]:
def pad_token_array(array, padded_len, pad_id=0, bos_id=2, eos_id=3):
    """Wrap a token-id sequence in BOS/EOS and pad it to a fixed length.

    The result is laid out as ``[bos] + tokens + padding + [eos]``: the padding
    sits between the last token and EOS, so EOS is always the final element.

    Args:
        array: Iterable of integer token ids (list, tuple or 1-D array).
        padded_len: Target number of positions between BOS and EOS.
        pad_id: Id used for padding (default 0).
        bos_id: Id placed first (default 2).
        eos_id: Id placed last (default 3).
            The defaults equal the pad/bos/eos ids printed for the
            SentencePiece model loaded in this notebook.

    Returns:
        torch.LongTensor of length ``padded_len + 2`` when the input has at
        most ``padded_len`` tokens. Longer inputs are not truncated; they get
        no padding and the result is correspondingly longer.
    """
    # Normalise to a list of Python ints so list concatenation is used even when
    # the caller passes a NumPy array (where `+` would add element-wise instead).
    tokens = [int(token) for token in array]
    # A negative repeat count yields an empty list, so over-long input gets no padding.
    padding = [pad_id] * (padded_len - len(tokens))
    return torch.LongTensor([bos_id] + tokens + padding + [eos_id])

In [0]:
# Pad every length-filtered sentence (formal first, then informal) to the same
# fixed size via pad_token_array
padded_sub_tokenized_formal, padded_sub_tokenized_informal = (
    [pad_token_array(token_ids, SENTENCE_CUT_LEN) for token_ids in subset]
    for subset in (sub_tokenized_formal, sub_tokenized_informal)
)

In [0]:
# Every padded sequence must have SENTENCE_CUT_LEN + 2 entries
# (the two extra positions are the BOS and EOS ids)
assert all(len(seq) == SENTENCE_CUT_LEN + 2 for seq in padded_sub_tokenized_formal)
assert all(len(seq) == SENTENCE_CUT_LEN + 2 for seq in padded_sub_tokenized_informal)

In [0]:
# Single training list: all padded formal sequences followed by all informal ones
padded_training_data = [*padded_sub_tokenized_formal, *padded_sub_tokenized_informal]
print('Length of padded training data : ', len(padded_training_data))

Length of padded training data :  55280


## Dumping file

In [0]:
# Write the padded training tensors to the project folder. The context manager
# closes the file once the dump finishes, so the data is fully flushed to disk
# and no handle is left open.
with open(root_path+'tokenized_training.p', 'wb') as f:
    pickle.dump(padded_training_data, f)

### Easier Way with a single function

In [0]:
## In function
def sentence_preprocessing(formal_sentences, informal_sentences, sentence_cut_len, f_sp):
  tokenized_formal = np.array([f_sp.encode_as_ids(line) for line in formal_sentences])
  tokenized_informal = np.array([f_sp.encode_as_ids(line) for line in informal_sentences])

  tokenized_lens_formal = np.array([len(element) for element in tokenized_formal])
  tokenized_lens_informal = np.array([len(element) for element in tokenized_informal])

  filtered_tokenized_formal = tokenized_formal[((tokenized_lens_formal <= sentence_cut_len) & (tokenized_lens_informal <= sentence_cut_len))]
  filtered_tokenized_informal = tokenized_informal[((tokenized_lens_formal <= sentence_cut_len) & (tokenized_lens_informal <= sentence_cut_len))]

  padded_filtered_tokenized_formal = [pad_token_array(element, sentence_cut_len) for element in filtered_tokenized_formal]
  padded_filtered_tokenized_informal = [pad_token_array(element, sentence_cut_len) for element in filtered_tokenized_informal]
  return padded_filtered_tokenized_formal, padded_filtered_tokenized_informal

#### Big data set: FR and EM data

In [0]:
# Tokenizer for the combined FR + EM data. The model file is produced elsewhere
# (see the local notebook); this cell only loads it.
sp_large = spm.SentencePieceProcessor()
sp_large.Load(f'{root_path}fulltraintest32000.model')

True

In [0]:
# Entertainment/Music: training pairs
EM_processed_train_formal, EM_processed_train_informal = sentence_preprocessing(
    formal_sentences=EM_formal,
    informal_sentences=EM_informal,
    sentence_cut_len=10,
    f_sp=sp_large,
)

# Test pairs, part 1: raw formal sentences with the informal ref0 file
EM_processed_test_formal_1, EM_processed_test_informal_1 = sentence_preprocessing(
    formal_sentences=EM_test_raw_formal,
    informal_sentences=EM_test_ref0_informal,
    sentence_cut_len=10,
    f_sp=sp_large,
)

# Test pairs, part 2: formal ref0 file with the raw informal sentences
EM_processed_test_formal_2, EM_processed_test_informal_2 = sentence_preprocessing(
    formal_sentences=EM_test_ref0_formal,
    informal_sentences=EM_test_raw_informal,
    sentence_cut_len=10,
    f_sp=sp_large,
)

# Merge both parts in the same order on each side so the pairs stay aligned
EM_processed_test_formal = [*EM_processed_test_formal_1, *EM_processed_test_formal_2]
EM_processed_test_informal = [*EM_processed_test_informal_1, *EM_processed_test_informal_2]


In [0]:
# Family/Relationships with the large tokenizer: training pairs
FR_processed_train_formal, FR_processed_train_informal = sentence_preprocessing(
    formal_sentences=FR_formal,
    informal_sentences=FR_informal,
    sentence_cut_len=10,
    f_sp=sp_large,
)

# Test pairs, part 1: raw formal sentences with the informal ref0 file
FR_processed_test_formal_1, FR_processed_test_informal_1 = sentence_preprocessing(
    formal_sentences=FR_test_raw_formal,
    informal_sentences=FR_test_ref0_informal,
    sentence_cut_len=10,
    f_sp=sp_large,
)

# Test pairs, part 2: formal ref0 file with the raw informal sentences
FR_processed_test_formal_2, FR_processed_test_informal_2 = sentence_preprocessing(
    formal_sentences=FR_test_ref0_formal,
    informal_sentences=FR_test_raw_informal,
    sentence_cut_len=10,
    f_sp=sp_large,
)

# Merge both parts in the same order on each side so the pairs stay aligned
FR_processed_test_formal = [*FR_processed_test_formal_1, *FR_processed_test_formal_2]
FR_processed_test_informal = [*FR_processed_test_informal_1, *FR_processed_test_informal_2]

In [0]:
# Nested layout: domain -> split ("train"/"test") -> register ("formal"/"informal")
EM_dict = {
    "train": {
        "formal": EM_processed_train_formal,
        "informal": EM_processed_train_informal,
    },
    "test": {
        "formal": EM_processed_test_formal,
        "informal": EM_processed_test_informal,
    },
}
FR_dict = {
    "train": {
        "formal": FR_processed_train_formal,
        "informal": FR_processed_train_informal,
    },
    "test": {
        "formal": FR_processed_test_formal,
        "informal": FR_processed_test_informal,
    },
}
large_data_dict = {"EM": EM_dict, "FR": FR_dict}



In [0]:
# Persist the combined FR + EM data set. The context manager closes the file
# once the dump finishes, so the data is fully flushed to disk and no handle is
# left open.
with open(root_path+'FREM_data_set_sub10.p', 'wb') as f:
    pickle.dump(large_data_dict, f)

#### Small data set of only FR data

In [0]:
# Tokenizer for the FR-only data set. The model file is produced elsewhere
# (see the local notebook); this cell only loads it.
sp_small = spm.SentencePieceProcessor()
sp_small.Load(f'{root_path}FRtraintest16000.model')

True

In [0]:
# Family/Relationships again, this time tokenized with the small model.
# Note: these assignments rebind the FR_processed_* names used for the large data set.
FR_processed_train_formal, FR_processed_train_informal = sentence_preprocessing(
    formal_sentences=FR_formal,
    informal_sentences=FR_informal,
    sentence_cut_len=10,
    f_sp=sp_small,
)

# Test pairs, part 1: raw formal sentences with the informal ref0 file
FR_processed_test_formal_1, FR_processed_test_informal_1 = sentence_preprocessing(
    formal_sentences=FR_test_raw_formal,
    informal_sentences=FR_test_ref0_informal,
    sentence_cut_len=10,
    f_sp=sp_small,
)

# Test pairs, part 2: formal ref0 file with the raw informal sentences
FR_processed_test_formal_2, FR_processed_test_informal_2 = sentence_preprocessing(
    formal_sentences=FR_test_ref0_formal,
    informal_sentences=FR_test_raw_informal,
    sentence_cut_len=10,
    f_sp=sp_small,
)

# Merge both parts in the same order on each side so the pairs stay aligned
FR_processed_test_formal = [*FR_processed_test_formal_1, *FR_processed_test_formal_2]
FR_processed_test_informal = [*FR_processed_test_informal_1, *FR_processed_test_informal_2]

In [0]:
# FR-only data set: split ("train"/"test") -> register ("formal"/"informal")
FR_dict2 = {
    "train": {
        "formal": FR_processed_train_formal,
        "informal": FR_processed_train_informal,
    },
    "test": {
        "formal": FR_processed_test_formal,
        "informal": FR_processed_test_informal,
    },
}

In [0]:
# Persist the FR-only data set. The context manager closes the file once the
# dump finishes, so the data is fully flushed to disk and no handle is left open.
with open(root_path+'FR_data_set_sub10.p', 'wb') as f:
    pickle.dump(FR_dict2, f)

## Visualization

In [0]:
# Side-by-side view of the training pairs: one row per sentence pair,
# formal and informal text in separate columns
train_df = pd.DataFrame(dict(Formal=full_train_formal, Informal=full_train_informal))
#test_df = pd.DataFrame({'Formal': full_test_formal, 'Informal': full_test_informal})
train_df.head()

Unnamed: 0,Formal,Informal
0,I prefer to let the guy ask me.,"Sure, it's ok, but I always have let the guy a..."
1,I suffer through verbal abuse from my wife.,"Hmmm, I'm a guy suffering from verbal abuse fr..."
2,You will have more friends than you want.,You will have more friends that you want... ;)
3,It's nice that you get to see pictures of who ...,"It's nice, you get to see pictures of who you ..."
4,I need to know what to do.,I NEED TO KNOW WHAT 2 DO


In [0]:
# Print ten randomly drawn training pairs (a different draw on every run)
for pair in train_df.sample(n=10).itertuples(index=False):
    print(f"Formal: {pair.Formal}\nInformal: {pair.Informal}\n")

Formal: It is an art.
Informal: is an art to say the lest.

Formal: These things should not be hurried. Start your lives over little by little, if you are able.
Informal: try to start your lives over little at a time, i belive these things shouldnt be rushed.

Formal: You never know what you can find for 50¢ or 25¢.
Informal: you never know what you could find...for probably around a quarter or two.

Formal: Probably because you think and act in a homosexual nature, and you like it.
Informal: probably because you think in a gay way, act in a gay way...and you like it

Formal: I feel it is too soon to jump into a relationship with someone else, as it will not last long.
Informal: and its too soon to jump into something with someone else, it wont last long.

Formal: What kind of loser are you?
Informal: what kind of loser are you????

Formal: I found the menu very unsatisfying.
Informal: it wasnot satisfied with the menu...

Formal: It seems like these men had a hard time adjusting to li