In [18]:
import pandas as pd
import numpy as np
import random
import torch
from torch.utils.data import DataLoader

In [19]:
# Read the training CSV into a DataFrame.
train_csv_path = "Data/mal_full_offensive_train.csv"
train_data = pd.read_csv(train_csv_path)

In [20]:
# Pick one random row of the training frame and keep its "Text" value as a
# sample for the tokenizer sanity check further down.
# random.randrange excludes its upper bound, so n is always a valid positional
# index. random.randint(0, n_rows) can return n_rows itself, which makes
# .iloc raise IndexError.
n = random.randrange(train_data.shape[0])
text = train_data.iloc[n]['Text']

In [21]:
import sentencepiece as spm

In [22]:
# Collect every entry of the "Text" column into a plain Python list.
text_column = train_data["Text"]
tokenizer_dataset = text_column.tolist()

In [23]:
# Dump the collected texts to disk, one entry per line; this file is the
# input handed to the SentencePiece trainer.
with open("tokenizer_corpus.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(sentence + "\n" for sentence in tokenizer_dataset)

In [24]:
# Vocabulary size passed to the SentencePiece trainer.
vocab_size = 24_000

In [25]:
# Train a unigram SentencePiece model on the corpus file. Model files are
# written under the "malayalam_spm" prefix.
spm_training_options = dict(
    input="tokenizer_corpus.txt",
    model_prefix="malayalam_spm",
    vocab_size=vocab_size,
    model_type="unigram",
    character_coverage=1.0,
    # Fix the ids of the four special tokens: pad, unk, bos, eos.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.train(**spm_training_options)

In [26]:
# Load the trained SentencePiece model from disk.
spm_model_path = "malayalam_spm.model"
sp_tokenizer = spm.SentencePieceProcessor(model_file=spm_model_path)

In [27]:
# Show the training sample as subword pieces, as ids, and decoded back from
# those ids (a round-trip check of the tokenizer).
print("Tokens:", sp_tokenizer.encode_as_pieces(text))
sample_ids = sp_tokenizer.encode_as_ids(text)
print("IDs:", sample_ids)
print("Decoded:", sp_tokenizer.decode(sample_ids))

Tokens: ['▁Best', '▁Malayalam', '▁movie', '▁trailer', '▁I', "'", 've', '▁seen', '!']
IDs: [541, 193, 26, 15, 57, 81, 740, 1412, 178]
Decoded: Best Malayalam movie trailer I've seen!


## Check tokenizer with validation and test data

In [28]:
# Read the dev (validation) CSV into a DataFrame.
valid_csv_path = "Data/mal_full_offensive_dev.csv"
valid_data = pd.read_csv(valid_csv_path)

In [29]:
# Pick one random row of the validation frame and keep its "Text" value.
# random.randrange excludes its upper bound, so n is always a valid positional
# index. random.randint(0, n_rows) can return n_rows itself, which makes
# .iloc raise IndexError.
n = random.randrange(valid_data.shape[0])
valid_text = valid_data.iloc[n]['Text']

In [30]:
# Show the validation sample as subword pieces, as ids, and decoded back from
# those ids.
print("Tokens:", sp_tokenizer.encode_as_pieces(valid_text))
valid_ids = sp_tokenizer.encode_as_ids(valid_text)
print("IDs:", valid_ids)
print("Decoded:", sp_tokenizer.decode(valid_ids))

Tokens: ['▁Ikka', 's', '▁padakkam', '▁number', '▁0', '4', '▁of', '▁20', '18', '▁loading']
IDs: [99, 27, 3853, 1814, 14437, 1105, 149, 822, 5418, 627]
Decoded: Ikkas padakkam number 04 of 2018 loading


In [31]:
# Read the labelled test CSV into a DataFrame.
test_csv_path = "Data/mal_offensive_test_with_labels.csv"
test_data = pd.read_csv(test_csv_path)

In [32]:
# Pick one random row of the test frame and keep its "Text" value.
# random.randrange excludes its upper bound, so n is always a valid positional
# index. random.randint(0, n_rows) can return n_rows itself, which makes
# .iloc raise IndexError.
n = random.randrange(test_data.shape[0])
test_text = test_data.iloc[n]['Text']

In [33]:
# Show the test sample as subword pieces, as ids, and decoded back from those
# ids.
print("Tokens:", sp_tokenizer.encode_as_pieces(test_text))
test_ids = sp_tokenizer.encode_as_ids(test_text)
print("IDs:", test_ids)
print("Decoded:", sp_tokenizer.decode(test_ids))

Tokens: ['▁raju', 'ne', '▁kannan', '▁vendii', '▁veendum', '▁trailer', '▁kandavar', '▁ind', 'oo', '??']
IDs: [1111, 258, 1316, 17225, 229, 15, 267, 813, 272, 331]
Decoded: rajune kannan vendii veendum trailer kandavar indoo??
