### Load data

In [7]:
import numpy as np
import pandas as pd

# train = pd.read_csv('data/train.csv', index_col='ex_id')
# dev = pd.read_csv('data/dev.csv', index_col='ex_id')
# test = pd.read_csv('data/test_no_label.csv', index_col='ex_id')

### Tokenize

In [8]:
import spacy

In [9]:
# nlp = spacy.load('en_core_web_sm')
# nlp.Defaults.stop_words

In [10]:
def tokenize(data, nlp=None):
    """Turn raw texts into lists of lowercased lemmas.

    Every text is run through a spaCy pipeline. A token is dropped when it
    is punctuation, a stop word, whitespace, or when its text contains at
    least one digit character (so mixed tokens such as "4th" are dropped
    too, not only pure numbers). Each remaining token is represented by its
    lowercased lemma.

    Parameters
    ----------
    data : iterable of str
        Texts to tokenize.
    nlp : spaCy pipeline, optional
        Pipeline used to process the texts. When omitted, the
        'en_core_web_sm' model is loaded inside this call; pass an already
        loaded pipeline to avoid reloading the model on every call.

    Returns
    -------
    list of list of str
        One token list per input text, in input order.
    """
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    def is_kept(tk):
        # discard punctuation / stopwords / whitespace
        if tk.is_punct or tk.is_stop or tk.is_space:
            return False
        # discard anything containing a digit (generator: stops at first hit)
        return not any(char.isdigit() for char in tk.text)

    # nlp.pipe streams the texts through the pipeline in batches, yielding
    # one Doc per text in the same order as the input.
    return [
        [tk.lemma_.lower() for tk in doc if is_kept(tk)]
        for doc in nlp.pipe(data)
    ]

In [11]:
def get_vocab_set(data_tokenized):
    """Return the set of distinct tokens found across all token lists.

    Parameters
    ----------
    data_tokenized : iterable of iterable of hashable
        Tokenized documents, one iterable of tokens per document.

    Returns
    -------
    set
        Union of all tokens; an empty set when there are no documents.
    """
    # Start from an empty set instance so that zero documents yields set()
    # instead of a TypeError from calling the unbound set.union with no args.
    return set().union(*data_tokenized)

### Save

In [14]:
import pickle as pkl

In [13]:
# Tokenize each split and persist the result with pickle.
# Files are opened through `with` so every handle is flushed and closed.
# NOTE(review): `train`, `dev` and `test` are not defined by any active line
# visible here (the read_csv calls near the top are commented out) - confirm
# they are loaded before this cell runs.
# NOTE(review): these files are written to the working directory, while the
# "Load Tokens" cell reads from ../data/tokens/ - confirm the files are moved.

# Train
train_data_tokens = tokenize(train.review.values)
with open("train_data_tokens.pkl", "wb") as f:
    pkl.dump(train_data_tokens, f)

# Val
val_data_tokens = tokenize(dev.review.values)
with open("val_data_tokens.pkl", "wb") as f:
    pkl.dump(val_data_tokens, f)

# Test
test_data_tokens = tokenize(test.review.values)
with open("test_data_tokens.pkl", "wb") as f:
    pkl.dump(test_data_tokens, f)

# Vocab (set of unique tokens seen in the training split)
all_train_tokens = get_vocab_set(train_data_tokens)
with open("all_train_tokens.pkl", "wb") as f:
    pkl.dump(all_train_tokens, f)

### Load Tokens

In [12]:
import pickle as pkl
from itertools import chain

In [15]:
# Then, load preprocessed train, val and test datasets.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that you produced yourself or otherwise trust.
with open("../data/tokens/train_data_tokens.pkl", "rb") as f:
    train_data_tokens = pkl.load(f)
with open("../data/tokens/val_data_tokens.pkl", "rb") as f:
    val_data_tokens = pkl.load(f)
with open("../data/tokens/test_data_tokens.pkl", "rb") as f:
    test_data_tokens = pkl.load(f)

# double checking
print ("Train dataset size is {}".format(len(train_data_tokens)))
print ("Val dataset size is {}".format(len(val_data_tokens)))
print ("Test dataset size is {}".format(len(test_data_tokens)))


# Flatten the per-document token lists into one list (duplicates kept),
# then derive the set of unique tokens from it once.
all_train_tokens = list(chain.from_iterable(train_data_tokens))
all_train_tokens_set = set(all_train_tokens)

print ("Total number of tokens in train dataset is {}".format(len(all_train_tokens)))
# Reuse the set built above rather than constructing a second one.
print ("Total number of *unique* tokens in train dataset is {}".format(len(all_train_tokens_set)))

Train dataset size is 250874
Val dataset size is 35918
Test dataset size is 72165
Total number of tokens in train dataset is 12969997
Total number of *unique* tokens in train dataset is 114051
