In [145]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss
from nltk.tokenize import word_tokenize
import nltk
import sentencepiece as spm
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

In [146]:
from data_reader import MultiLangDataset, SplitSet
from data_reader import ns_dataset, as_dataset
from data_reader import Languages

In [147]:
# word_tokenize relies on NLTK's Punkt tokenizer data. Older NLTK releases load
# it from the 'punkt' package, while NLTK 3.9+ looks for 'punkt_tab', so fetch
# both to keep this notebook runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [148]:
# Sanity check: list the distinct label values found in the Yoruba training split.
yor_dataset: SplitSet = ns_dataset.get(Languages.YORUBA)
test = yor_dataset.train["label"]  # NOTE: despite the name, these are the *training* labels
unique_labels = test.unique()
print(unique_labels)

['negative' 'neutral' 'positive']


In [149]:
# 3. Custom Transformer using SentencePiece
class SentencePieceVectorizer(BaseEstimator, TransformerMixin):
    """Bag-of-subwords vectorizer backed by a trained SentencePiece model.

    Every document is encoded into SentencePiece piece ids and turned into one
    row of per-id occurrence counts.

    Parameters
    ----------
    model_file : str, default='yoruba.model'
        Path of the trained SentencePiece model. It is loaded when the
        transformer is constructed.
    vocab_size : int, default=200
        Number of columns of the produced matrix. It has to be larger than
        every piece id the model can emit.
    """

    def __init__(self, model_file='yoruba.model', vocab_size=200):
        self.model_file = model_file
        self.vocab_size = vocab_size
        # NOTE(review): the processor is loaded eagerly, so a later
        # set_params(model_file=...) does not reload it.
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(self.model_file)

    def fit(self, X, y=None):
        """Do nothing (the SentencePiece model is already trained) and return ``self``."""
        return self

    def transform(self, X):
        """Encode ``X`` into a sparse matrix of subword counts.

        Parameters
        ----------
        X : iterable of str
            Documents to encode. Any iterable is accepted, including one-shot
            generators.

        Returns
        -------
        scipy.sparse.csr_matrix
            Integer matrix of shape ``(n_documents, vocab_size)``; entry
            ``(i, j)`` is how often piece id ``j`` occurs in document ``i``.

        Raises
        ------
        ValueError
            If the model emits a piece id that does not fit in ``vocab_size``
            columns.
        """
        # Materialise once so the row count is known even for generators.
        texts = list(X)

        rows, cols = [], []
        for row, text in enumerate(texts):
            ids = self.sp.encode(text, out_type=int)
            rows.extend([row] * len(ids))
            cols.extend(ids)

        # Fail with an actionable message instead of scipy's generic
        # "column index exceeds matrix dimensions".
        if cols and max(cols) >= self.vocab_size:
            raise ValueError(
                f"SentencePiece id {max(cols)} does not fit in vocab_size="
                f"{self.vocab_size}; set vocab_size to the size of the model "
                f"loaded from {self.model_file!r}."
            )

        # One entry of 1 per token occurrence: repeated (row, col) pairs are
        # summed when the matrix is built, which yields the counts. The explicit
        # dtype keeps the result integer even when no token was produced.
        data = [1] * len(cols)
        return csr_matrix(
            (data, (rows, cols)),
            shape=(len(texts), self.vocab_size),
            dtype=int,
        )

In [None]:



def _print_comparison(lang, word_report: dict, subword_report: dict) -> None:
    """Print the word-level and subword-level classification reports as tables."""
    # classification_report(output_dict=True) is keyed by class / average name;
    # transposing puts one of those per row and the metrics in the columns.
    word_table = pd.DataFrame(word_report).transpose()
    subword_table = pd.DataFrame(subword_report).transpose()

    print(f'Results for {lang} Language:')
    print("Normal Tokenization Results:")
    print(word_table.round(3))
    print("--------------------------------------------------")
    print("Subword Tokenization Results:")
    print(subword_table.round(3))
    print("--------------------------------------------------")


def _word_level_report(X_train, y_train, X_test, y_test, stop_words) -> dict:
    """Fit Naive Bayes on NLTK word-token counts and return its test-set report."""
    model = Pipeline([
        ('vectorizer', CountVectorizer(tokenizer=word_tokenize, stop_words=stop_words)),
        ('classifier', MultinomialNB()),
    ])
    model.fit(X_train, y_train)
    return classification_report(y_test, model.predict(X_test), output_dict=True)


def _subword_report(X_train, y_train, X_test, y_test, vocab_size: int) -> dict:
    """Train a BPE SentencePiece model on the training tweets, fit Naive Bayes on
    the resulting subword counts and return its test-set report."""
    import os
    import tempfile

    # All scratch files live in a private temporary directory: nothing in the
    # working directory is overwritten, and cleanup happens even when training
    # or evaluation raises.
    with tempfile.TemporaryDirectory() as work_dir:
        corpus_path = os.path.join(work_dir, 'tweets.txt')
        # SentencePiece reads one raw sentence per line. Write the lines by hand
        # (not via to_csv, which CSV-quotes tweets containing commas, quotes or
        # newlines) and fold embedded line breaks into spaces.
        with open(corpus_path, 'w', encoding='utf-8') as corpus:
            for tweet in X_train:
                corpus.write(' '.join(str(tweet).splitlines()) + '\n')

        model_prefix = os.path.join(work_dir, 'lang_model')
        spm.SentencePieceTrainer.Train(
            input=corpus_path,
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            model_type='bpe',
        )

        # Same pipeline shape as the word-level model; the vectorizer loads the
        # freshly trained model, so everything that needs the file stays inside
        # this block.
        model = Pipeline([
            ('vectorizer', SentencePieceVectorizer(model_file=model_prefix + '.model',
                                                   vocab_size=vocab_size)),
            ('classifier', MultinomialNB()),
        ])
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # NOTE(review): to also report log loss, pass class probabilities, e.g.
        # log_loss(y_test, model.predict_proba(X_test), labels=model.classes_).
        # Passing the predicted string labels is presumably what produced the
        # "could not convert string to float" error seen in this notebook.
        return classification_report(y_test, y_pred, output_dict=True)


def analyze_text(lang: Languages, vocab_size: int = 8000) -> None:
    """Compare word-level and SentencePiece (BPE) tokenization for one language.

    Two Multinomial Naive Bayes classifiers are trained on the language's
    training tweets, one on NLTK word counts (with the dataset's stop words)
    and one on subword counts, and both test-set classification reports are
    printed.

    Parameters
    ----------
    lang : Languages
        Language whose split is fetched with ``ns_dataset.get``.
    vocab_size : int, default=8000
        Vocabulary size used to train the SentencePiece model and as the width
        of the subword count matrix.
    """
    lang_dataset: SplitSet = ns_dataset.get(lang)
    X_train = lang_dataset.train["tweet"]
    y_train = lang_dataset.train["label"]
    X_test = lang_dataset.test["tweet"]
    y_test = lang_dataset.test["label"]

    word_result = _word_level_report(X_train, y_train, X_test, y_test, lang_dataset.stopwords)
    spm_result = _subword_report(X_train, y_train, X_test, y_test, vocab_size)

    _print_comparison(lang, word_result, spm_result)

    

In [151]:
# Run the word-vs-subword comparison for every language, in a fixed order.
for language in (
    Languages.YORUBA,
    Languages.HAUSA,
    Languages.IGBO,
    Languages.NIGERIAN_PIDGIN,
):
    analyze_text(language)



ValueError: could not convert string to float: np.str_('positive')

In [None]:
# yor_dataset: SplitSet = ns_dataset.get(Languages.YORUBA)
# X_train = yor_dataset.train["tweet"]
# yor_dataset.train["tweet"].to_csv('tweets.txt', index=False, header=False)
# y_train = yor_dataset.train["label"]

# X_test = yor_dataset.test["tweet"]
# y_test = yor_dataset.test["label"]
# stop_words = yor_dataset.stopwords

In [None]:
# Train the SentencePiece model
# spm.SentencePieceTrainer.Train(input='tweets.txt', model_prefix='yoruba', vocab_size=8000, model_type='bpe')

In [None]:
# sp = spm.SentencePieceProcessor()
# sp.load('yoruba.model')

# def encode_as_bow(tweets, vocab_size):
#     rows, cols, data = [], [], []
#     for i, tweet in enumerate(tweets):
#         ids = sp.encode(tweet, out_type=int)
#         for idx in ids:
#             rows.append(i)
#             cols.append(idx)
#             data.append(1)
#     return csr_matrix((data, (rows, cols)), shape=(len(tweets), vocab_size))

# X_yor_train = encode_as_bow(X_train, vocab_size=8000)
# X_yor_test = encode_as_bow(X_test, vocab_size=8000)

In [None]:
# 6. Build pipeline
# pipeline = Pipeline([
#     ('sp_vectorizer', SentencePieceVectorizer(model_file='yoruba.model', vocab_size=2000)),
#     ('classifier', MultinomialNB())
# ])




In [None]:
# pipeline.fit(X_yor_train, y_train)
# clf = MultinomialNB()
# clf.fit(X_yor_train, y_train)

In [None]:
# y_pred = clf.predict(X_yor_test)

In [None]:
# print(classification_report(y_test, y_pred))

In [None]:
# vectorizer = CountVectorizer(tokenizer=word_tokenize,stop_words=stop_words)
# model = Pipeline([
#     ('vectorizer', vectorizer),  # word-level tokenizer by default
#     ('classifier', MultinomialNB())
# ])

# # Train model
# model.fit(X_train, y_train)



In [None]:

# # Predict and evaluate
# y_pred = model.predict(X_test)
# print(classification_report(y_test, y_pred))