In [152]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss
from nltk.tokenize import word_tokenize
import nltk
import sentencepiece as spm
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [153]:
from data_reader import MultiLangDataset, SplitSet
from data_reader import ns_dataset, as_dataset
from data_reader import Languages

In [154]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [156]:
# 3. Custom Transformer using SentencePiece
class SentencePieceVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model_file='yoruba.model', vocab_size=200):
        self.model_file = model_file
        self.vocab_size = vocab_size
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(self.model_file)
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        rows, cols, data = [], [], []
        for i, text in enumerate(X):
            ids = self.sp.encode(text, out_type=int)
            for idx in ids:
                rows.append(i)
                cols.append(idx)
                data.append(1)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocab_size))

In [157]:



def analyze_text(lang: Languages):

    def compare_results(normal_result: dict, subword_result: dict):
        # Convert to DataFrames
        df = pd.DataFrame(normal_result).transpose()
        subword_df = pd.DataFrame(subword_result).transpose()

        print(f'Results for {lang} Language:')
        print("Normal Tokenization Results:")
        print(df.round(3))
        print("--------------------------------------------------")
        print("Subword Tokenization Results:")
        print(subword_df.round(3))
        print("--------------------------------------------------")

    def delete_files():
        import os
        for filename in ['tweets.txt', 'lang_model.model', 'lang_model.vocab']:
            try:
                os.remove(filename)
            except FileNotFoundError:
                pass

    lang_dataset: SplitSet = ns_dataset.get(lang)
    X_train = lang_dataset.train["tweet"]
    lang_dataset.train["tweet"].to_csv('tweets.txt', index=False, header=False)
    y_train = lang_dataset.train["label"]

    X_test = lang_dataset.test["tweet"]
    y_test = lang_dataset.test["label"]
    stop_words = lang_dataset.stopwords

    # Peform word tokenization
    vectorizer = CountVectorizer(tokenizer=word_tokenize,stop_words=stop_words)
    model = Pipeline([
        ('vectorizer', vectorizer),  # word-level tokenizer by default
        ('classifier', MultinomialNB())
    ])
    # Train model
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    word_result: dict = classification_report(y_test, y_pred, output_dict=True)


    # SentencePiece tokenizer
    spm.SentencePieceTrainer.Train(input='tweets.txt', model_prefix='lang_model', vocab_size=8000, model_type='bpe')
    sp = spm.SentencePieceProcessor()
    sp.load('lang_model.model')

    def encode_as_bow(tweets, vocab_size):
        rows, cols, data = [], [], []
        for i, tweet in enumerate(tweets):
            ids = sp.encode(tweet, out_type=int)
            for idx in ids:
                rows.append(i)
                cols.append(idx)
                data.append(1)
        return csr_matrix((data, (rows, cols)), shape=(len(tweets), vocab_size))

    X_yor_train = encode_as_bow(X_train, vocab_size=8000)
    X_yor_test = encode_as_bow(X_test, vocab_size=8000)

    clf = MultinomialNB()
    clf.fit(X_yor_train, y_train)

    y_pred = clf.predict(X_yor_test)

    spm_result: dict = classification_report(y_test, y_pred, output_dict=True)
    # loss = log_loss(y_test, y_pred)
    # print(f"Loss for {lang} Language: {loss:.3f}")
    
    compare_results(word_result, spm_result)
    delete_files()

    

In [158]:
analyze_text(Languages.YORUBA)
analyze_text(Languages.HAUSA)
analyze_text(Languages.IGBO)
analyze_text(Languages.NIGERIAN_PIDGIN)



Results for yor Language:
Normal Tokenization Results:
              precision  recall  f1-score   support
negative          0.649   0.542     0.591   981.000
neutral           0.719   0.701     0.710  1616.000
positive          0.742   0.820     0.779  1918.000
accuracy          0.717   0.717     0.717     0.717
macro avg         0.703   0.688     0.693  4515.000
weighted avg      0.714   0.717     0.713  4515.000
--------------------------------------------------
Subword Tokenization Results:
              precision  recall  f1-score   support
negative          0.487   0.715     0.579   981.000
neutral           0.748   0.533     0.623  1616.000
positive          0.745   0.746     0.745  1918.000
accuracy          0.663   0.663     0.663     0.663
macro avg         0.660   0.665     0.649  4515.000
weighted avg      0.690   0.663     0.665  4515.000
--------------------------------------------------




Results for hau Language:
Normal Tokenization Results:
              precision  recall  f1-score   support
negative          0.580   0.819     0.679  1759.000
neutral           0.702   0.434     0.537  1789.000
positive          0.789   0.770     0.779  1755.000
accuracy          0.673   0.673     0.673     0.673
macro avg         0.690   0.674     0.665  5303.000
weighted avg      0.690   0.673     0.664  5303.000
--------------------------------------------------
Subword Tokenization Results:
              precision  recall  f1-score   support
negative          0.549   0.873     0.674  1759.000
neutral           0.736   0.394     0.513  1789.000
positive          0.852   0.752     0.799  1755.000
accuracy          0.671   0.671     0.671     0.671
macro avg         0.712   0.673     0.662  5303.000
weighted avg      0.712   0.671     0.661  5303.000
--------------------------------------------------




Results for ibo Language:
Normal Tokenization Results:
              precision  recall  f1-score   support
negative          0.769   0.561     0.649   943.000
neutral           0.682   0.813     0.742  1621.000
positive          0.767   0.728     0.747  1118.000
accuracy          0.723   0.723     0.723     0.723
macro avg         0.739   0.701     0.713  3682.000
weighted avg      0.730   0.723     0.720  3682.000
--------------------------------------------------
Subword Tokenization Results:
              precision  recall  f1-score   support
negative          0.795   0.509     0.621   943.000
neutral           0.656   0.903     0.760  1621.000
positive          0.849   0.644     0.732  1118.000
accuracy          0.723   0.723     0.723     0.723
macro avg         0.767   0.685     0.704  3682.000
weighted avg      0.750   0.723     0.716  3682.000
--------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Results for pcm Language:
Normal Tokenization Results:
              precision  recall  f1-score   support
negative          0.632   0.929     0.753  2326.000
neutral           0.000   0.000     0.000   431.000
positive          0.703   0.369     0.484  1397.000
accuracy          0.645   0.645     0.645     0.645
macro avg         0.445   0.433     0.412  4154.000
weighted avg      0.590   0.645     0.584  4154.000
--------------------------------------------------
Subword Tokenization Results:
              precision  recall  f1-score   support
negative          0.639   0.899     0.747  2326.000
neutral           0.000   0.000     0.000   431.000
positive          0.648   0.408     0.501  1397.000
accuracy          0.641   0.641     0.641     0.641
macro avg         0.429   0.436     0.416  4154.000
weighted avg      0.576   0.641     0.587  4154.000
--------------------------------------------------
