In [39]:
import pandas as pd
import numpy as np
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

## a pretokenizer to segment the text into words
from tokenizers.pre_tokenizers import Whitespace

from collections import Counter

# Overview

In this notebook I will explore different tokenizers. The tokenizers that I will create are:

- word tokenizer
- wordpiece tokenizer
- byte pair tokenizer

In [24]:
# Load data
train = pd.read_parquet('../data/processed/sm/train_sm.gzip')
test = pd.read_parquet('../data/processed/sm/test_sm.gzip')

In [25]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 666666 entries, 0 to 666665
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   tweet        666666 non-null  object
 1   context      666666 non-null  object
 2   label        666666 non-null  int64 
 3   tweet_clean  666666 non-null  object
dtypes: int64(1), object(3)
memory usage: 20.3+ MB


In [26]:
train.isnull().sum()

tweet          0
context        0
label          0
tweet_clean    0
dtype: int64

In [27]:
train.head(3)

Unnamed: 0,tweet,context,label,tweet_clean
0,just finished downloading fall guys!!! gonna p...,Video Game,3,just finished downloading fall guys!!! gonna p...
1,Good thread on kink &amp; pride https://t.co/m...,Holiday,5,good thread on kink &amp; pride
2,$200 | 2.8 JT • 24 hrs • rt\n\n➡️ $100 - follo...,Technology,28,$200 | 2.8 jt • 24 hrs • rt ➡️ $100 - follow ✳...


# Word Tokenizer

nltk has a tweet tokenizer that I will use.

In [28]:
from nltk.tokenize import TweetTokenizer

In [29]:
tweet_tokenizer = TweetTokenizer()

In [33]:
tweet_tokenizer.tokenize(train['tweet_clean'][0])

['just',
 'finished',
 'downloading',
 'fall',
 'guys',
 '!',
 '!',
 '!',
 'gonna',
 'practice',
 'a',
 'little',
 'before',
 'stream',
 'sniping',
 'tf',
 'out',
 'of',
 'my',
 'faves',
 'tonight',
 '😈',
 'i',
 'cant',
 'wait',
 'mwahahahaha']

In [34]:
train['tweet_clean'][0]

'just finished downloading fall guys!!! gonna practice a little before stream sniping tf out of my faves tonight 😈 i cant wait mwahahahaha'

Apply tokenizer to all clean tweets

In [35]:
train_tokenized = train['tweet_clean'].apply(tweet_tokenizer.tokenize)
train_tokenized[:5]

0    [just, finished, downloading, fall, guys, !, !...
1                   [good, thread, on, kink, &, pride]
2    [$, 200, |, 2.8, jt, •, 24, hrs, •, rt, ➡, ️, ...
3    [i, cant, wait, to, hear, yettocome, by, bts, ...
4             [123, days, ., bring, brittney, home, .]
Name: tweet_clean, dtype: object

In [36]:
train['tokens'] = train_tokenized

In [47]:
test_tokenized = test['tweet_clean'].apply(tweet_tokenizer.tokenize)
test['tokens'] = test_tokenized

Create vocabulary

In [40]:
words = np.hstack(train['tokens'].values)
words

array(['just', 'finished', 'downloading', ..., 'forget', 'that', '?'],
      dtype='<U280')

In [41]:
word_counts = Counter(words)
word_counts.most_common(10)

[('.', 500875),
 ('the', 430423),
 (',', 325667),
 ('to', 300228),
 ('and', 221122),
 ('a', 212228),
 ('of', 191563),
 ('!', 178139),
 ('in', 173671),
 ('is', 156879)]

In [42]:
word_vocabulary = {'<PAD>': 0, '<UNK>': 1}
i = 2
for word, count in word_counts.items():
    if count > 10:
        word_vocabulary[word] = i
        i += 1

In [44]:
len(word_vocabulary)

39871

Save word vocabulary as json

In [45]:
import json

In [46]:
with open('../data/processed/vocabularies/word_token_vocab.json', 'w') as f:
    f.write(json.dumps(word_vocabulary))

Create datasets for train and test

In [49]:
import torch
from torch.utils.data import Dataset

In [75]:
class TweetDataset(Dataset):
    def __init__(self, data, vocab, max_len):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        tokens = row['tokens']

        # Front paddings
        X = torch.zeros(self.max_len)
        for i, token in enumerate(tokens):
            X[self.max_len - len(tokens) + i] = self.vocab.get(token, 1)

        y = torch.tensor(row['label']).float()

        return X.long(), y

In [80]:
max_len = max([len(tokens) for tokens in train['tokens']])
print(max_len)
train_ds = TweetDataset(train, word_vocabulary, max_len)
test_ds = TweetDataset(test, word_vocabulary, max_len)

256


In [83]:
# Save datasets
torch.save(train_ds, '../data/processed/sm/datasets/train_wdtk_sm_ds.pt')
torch.save(test_ds, '../data/processed/sm/datasets/test_wdtk_sm_ds.pt')

# Byte Subword Tokenizer