# Text Data Processing with PyTorch

[Original tutorial](https://towardsdatascience.com/use-torchtext-to-load-nlp-datasets-part-i-5da6f1c89d84)

In [None]:
import os
import re
import spacy
import torch

import numpy as np
import pandas as pd

from torch.utils.data import (
    Dataset
    , DataLoader
    , random_split
)

In [None]:
# sample.csv holds a small subset of the full toxic comment dataset;
# the "id" column is used as the frame's index.
df_sample = pd.read_csv(
    "./sample.csv",
    index_col="id",
)
df_sample.head()

In [164]:
class Vocabulary(object):
    """Token <-> index mapping built from a corpus with a frequency cut-off.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Corpus tokens are numbered from 4 upwards, in the
    order in which they reach the frequency threshold while the corpus is
    scanned.
    """

    # Reserved tokens in index order:
    #   0 padding, 1 start of sentence, 2 end of sentence, 3 unknown token.
    _SPECIAL_TOKENS = ("<PAD>", "<SOS>", "<EOS>", "<UNK>")

    def __init__(self, freq_bar=5):
        """Create an empty (not yet built) vocabulary.

        Args:
            freq_bar: Number of occurrences a token needs before it is given
                its own index. Values below 1 are treated as 1 by
                ``build_vocab``.
        """
        self.is_built = False
        self.freq_bar = freq_bar
        # NOTE(review): only the token texts are read from the result (see
        # ``tokenize``). If the rest of the spaCy pipeline is not needed,
        # loading it with the unused components disabled may be faster -
        # confirm against the spaCy documentation before changing.
        self.tokenizer = spacy.load("en_core_web_sm")
        self._reset_mappings()
        return

    def __len__(self):
        """Return the number of indexed tokens, special tokens included."""
        return len(self.itos)

    def _reset_mappings(self):
        """(Re)initialise ``itos``/``stoi`` so they hold only the special tokens."""
        self.itos = dict(enumerate(self._SPECIAL_TOKENS))
        self.stoi = {token: idx for idx, token in self.itos.items()}

    def tokenize(self, text, max_num_char=2000):
        """Clean ``text`` and split it into a list of token strings.

        Args:
            text: Input to tokenize; it is converted with ``str`` first.
            max_num_char: Only the first ``max_num_char`` characters are kept.

        Returns:
            The ``.text`` of every token produced by ``self.tokenizer``,
            except tokens that are exactly a single space.
        """
        # Chop off long strings before doing any work on them
        text = str(text)[:max_num_char]

        # Replace special symbols with spaces, then collapse runs of spaces.
        # "!" is part of this character class, so no "!" survives this step
        # and no separate "!!!" -> "!" collapse is needed afterwards.
        text = re.sub(r"[\*\"“”\r\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", text)
        text = re.sub(r"[ ]+", " ", text)

        # Collapse repeated commas and question marks into a single one
        text = re.sub(r"\,+", ",", text)
        text = re.sub(r"\?+", "?", text)

        # Use the spaCy tokenizer
        return [t.text for t in self.tokenizer(text) if t.text != " "]

    def build_vocab(self, list_texts):
        """Build the token/index maps from an iterable of texts.

        Every call starts again from the special tokens only, so calling this
        method a second time replaces the previous vocabulary instead of
        leaving ``stoi`` and ``itos`` out of sync.

        Args:
            list_texts: Iterable of texts; each one is passed to ``tokenize``.
        """
        self.is_built = False
        self._reset_mappings()

        # A count can never equal a bar below 1, which would silently admit
        # no token at all; treat such a bar as "admit on first occurrence".
        threshold = max(1, self.freq_bar)

        freq_stat = {}
        for text in list_texts:
            for token in self.tokenize(text):
                freq_stat[token] = freq_stat.get(token, 0) + 1

                # Index the token at the moment it reaches the threshold.
                # The membership test keeps a corpus token that happens to
                # spell a special token from overwriting the reserved entry.
                if freq_stat[token] == threshold and token not in self.stoi:
                    idx = len(self.itos)
                    self.stoi[token] = idx
                    self.itos[idx] = token
        self.is_built = True
        return

    def text_to_num(self, text):
        """Convert ``text`` into a list of token indices.

        Args:
            text: Text to numericalize.

        Returns:
            ``[<SOS>] + indices + [<EOS>]``, where tokens that are not in the
            vocabulary map to the ``<UNK>`` index. If tokenization fails, the
            error is reported on stdout and the sequence contains only the
            ``<SOS>`` and ``<EOS>`` indices.

        Raises:
            RuntimeError: If ``build_vocab`` has not completed yet.
        """
        if not self.is_built:
            raise RuntimeError("[ ERROR ] :: Vocabulary not built")

        unk_idx = self.stoi["<UNK>"]
        try:
            ret = [self.stoi.get(token, unk_idx) for token in self.tokenize(text)]
        except Exception:
            # Best effort: report the failure and fall back to an empty token
            # sequence so that ``ret`` is always bound for the return below.
            print(f"[ ERROR ] :: Tokenization failed for < {text} >")
            ret = []
        return [self.stoi["<SOS>"]] + ret + [self.stoi["<EOS>"]]
        

In [140]:
class ToxicComtDataset(Dataset):
    """Toxic-comment CSV exposed as ``(token_ids, label)`` pairs.

    Columns read from the CSV:
        Index:             "id"
        Predictor/Factor:  "comment_text"
        Labels/Targets:    "toxic", "severe_toxic", "obscene", "threat",
                           "insult", "identity_hate"
    """

    def __init__(self, path, target_col="toxic"):
        """Load the CSV, build a vocabulary and numericalize every comment.

        Args:
            path: Location of the CSV file, passed to ``pd.read_csv``.
            target_col: Label column used as the binary classification
                target. Defaults to "toxic".
        """
        self.df = pd.read_csv(path, index_col="id")
        self.factor_raw = self.df["comment_text"]

        # Build a vocabulary from the raw comments
        self.vocab = Vocabulary()
        self.vocab.build_vocab(self.factor_raw)

        # A temporary transformer that numericalizes one comment
        def text_to_tensor(text):
            return torch.tensor(self.vocab.text_to_num(text), dtype=torch.int32)

        # Apply to all comments; the result keeps the "id" index of the frame
        self.factor = self.factor_raw.apply(text_to_tensor)

        # Select just one target for binary classification
        self.target_col = target_col
        self.target = torch.tensor(
            self.df.loc[:, target_col].values, dtype=torch.uint8
        )

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the row at position ``idx``.

        ``self.factor`` is indexed by the "id" column, so a plain
        ``self.factor[idx]`` would be a label lookup whenever the ids are
        integers. ``.iloc`` makes the access positional, matching the
        positional indexing of ``self.target``.
        """
        return self.factor.iloc[idx], self.target[idx]

In [None]:
class ToxicComtDataLoader:
    """Stub class: the constructor accepts a config and does nothing else."""

    def __init__(self, config):
        """Accept ``config``; it is currently neither stored nor read."""
        pass

## Load Tabular Dataset

In [81]:
# A two-element Series whose values are Python lists
x = pd.Series(
    [
        [1, 2, 3],
        [2, 1, 3],
    ]
)

In [165]:
# Load the sample CSV into the dataset (this also builds its vocabulary)
ds = ToxicComtDataset(path="sample.csv")

In [166]:
# One sample per batch, drawn in shuffled order; no collate_fn is supplied
lo = DataLoader(
    dataset=ds,
    batch_size=1,
    shuffle=True,
)

In [168]:
# Fetch the first batch and decode its token ids back into text.
# next(iter(...)) takes the batch directly; unpacking an enumerate() result
# into `_` would rebind the name IPython uses for the last cell output.
x, y = next(iter(lo))
s = ' '.join(ds.vocab.itos[int(i)] for i in x[0])
# len(s) is the number of characters in the decoded string, not tokens
print(s, len(s), y)

<SOS> Yes , it looks much better than before . At the top of the page , it says This article needs additional citations for <UNK> , may be <UNK> encyclopedia could help adding citations ... I <UNK> . Cheers , <EOS> 214 tensor([0], dtype=torch.uint8)


In [170]:
# Numericalize an ad-hoc sentence with the dataset's vocabulary
ds.vocab.text_to_num(
    "Hi there, I wonder what is going on here"
)

[1, 266, 105, 7, 6, 3, 79, 22, 139, 8, 176, 2]