<a href="https://colab.research.google.com/github/NadhemBenhadjali/-Swahili-News-Classification-LLM-Finetuning-Multiclassification/blob/main/applying%20machine%20learning%20models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import important modules
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # classifier

from sklearn.metrics import log_loss #evaluation metric
from sklearn.feature_extraction.text import CountVectorizer

# text preprocessing modules
import re
from string import punctuation

import warnings
# Silence every warning so cell outputs stay readable.
# Note: this also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")
# seeding: fix NumPy's legacy global random state (value 123) for repeatable runs
np.random.seed(123)


In [None]:
# Load the training and test CSV files.
# `path` is a prefix that is prepended to every file location in this notebook.
path = ""
train = pd.read_csv(f"{path}/content/Train (11).csv")
test = pd.read_csv(f"{path}/content/Test (7).csv")

In [None]:
# Swahili stop words that are filtered out of the article text during cleaning.
# Entries must not carry leading/trailing whitespace, otherwise they can never
# equal a token (the original list had " alisema", "kwenye " and "yule ").
stopword = [
    "akasema", "alikuwa", "alisema", "baada", "basi", "bila", "cha", "chini",
    "hadi", "hapo", "hata", "hivyo", "hiyo", "huku", "huo", "ili", "ilikuwa",
    "juu", "kama", "karibu", "katika", "kila", "kima", "kisha", "kubwa",
    "kutoka", "kuwa", "kwa", "kwamba", "kwenda", "kwenye", "la", "lakini",
    "mara", "mdogo", "mimi", "mkubwa", "mmoja", "moja", "muda", "mwenye", "na",
    "naye", "ndani", "ng", "ni", "nini", "nonkungu", "pamoja", "pia", "sana",
    "sasa", "sauti", "tafadhali", "tena", "tu", "vile", "wa", "wakati", "wake",
    "walikuwa", "wao", "watu", "wengine", "wote", "ya", "yake", "yangu", "yao",
    "yeye", "yule", "za", "zaidi", "zake",
]


In [None]:
# show the first five rows of the train data to sanity-check the load
train.head()

Unnamed: 0,id,content,category
0,SW0,SERIKALI imesema haitakuwa tayari kuona amani...,Kitaifa
1,SW1,"Mkuu wa Mkoa wa Tabora, Aggrey Mwanri amesiti...",Biashara
2,SW10,SERIKALI imetoa miezi sita kwa taasisi zote z...,Kitaifa
3,SW100,KAMPUNI ya mchezo wa kubahatisha ya M-bet ime...,michezo
4,SW1000,WATANZANIA wamekumbushwa kusherehekea sikukuu...,Kitaifa


In [None]:
# show the first five rows of the test data (no category column here)
test.head()

Unnamed: 0,swahili_id,content
0,ae3baa6c34aa523fd2aa4de3c89448efff922311,Rais John Magufuli amemuagiza Msajili wa Hazi...
1,c4ee26a3ade8064a2ec494996e836900fd32dd8e,TAHARUKI imezuka katika mkutano wa Naibu Wazi...
2,58aee3aa1d94554ff57e6a053dbd60658e4890ff,"KOCHA wa Azam FC ya Dar es Salaam, Idd Cheche..."
3,00579c2307b5c11003d21c40c3ecff5e922c3fd8,THAMANI ya mauzo ya bidhaa za Afrika Masharik...
4,c83e9738ae5d1790ee85b99863deb734e7614c52,"WAZIRI wa Nchi, Ofi si ya Makamu wa Rais, Muu..."


In [None]:
# (rows, columns) of the test data
test.shape

(1030, 2)

In [None]:
# check the shape of the train data as (rows, columns)
train.shape

(5151, 3)

In [None]:
# check the shape of the test data as (rows, columns)
test.shape

(1030, 2)

In [None]:
# count missing values per column in the train data
train.isnull().sum()

id          0
content     0
category    0
dtype: int64

In [None]:
# count missing values per column in the test data
test.isnull().sum()

swahili_id    0
content       0
dtype: int64

In [None]:
# evaluate the news category distribution (number of articles per category)
train.category.value_counts()

category
Kitaifa      2000
michezo      1720
Biashara     1360
Kimataifa      54
Burudani       17
Name: count, dtype: int64

### Data Preparation

In [None]:
# A mapping dictionary that turns each category label into an integer code
# from 0 to 4. The submission cells list their columns in this same order.
category_mapping = {
    "Kitaifa": 0,
    "michezo": 1,
    "Biashara": 2,
    "Kimataifa": 3,
    "Burudani": 4,
}

# Encode only while the column still holds the raw labels: mapping a column
# that is already encoded would turn every value into NaN when the cell is
# run a second time.
if not train["category"].isin(list(category_mapping.values())).all():
    encoded = train["category"].map(category_mapping)
    # Labels missing from the mapping come back as NaN; fail loudly instead of
    # passing NaN targets on to the models.
    unmapped = train.loc[encoded.isna(), "category"].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped categories in train data: {list(unmapped)}")
    train["category"] = encoded.astype("int64")

train.head()

Unnamed: 0,id,content,category
0,SW0,SERIKALI imesema haitakuwa tayari kuona amani...,0
1,SW1,"Mkuu wa Mkoa wa Tabora, Aggrey Mwanri amesiti...",2
2,SW10,SERIKALI imetoa miezi sita kwa taasisi zote z...,0
3,SW100,KAMPUNI ya mchezo wa kubahatisha ya M-bet ime...,1
4,SW1000,WATANZANIA wamekumbushwa kusherehekea sikukuu...,0


In [None]:
# a simple function to clean text data
#ps = nltk.PorterStemmer()
#from nltk.tokenize import word_tokenize
def text_cleaning(text, stopwords=None):
    """Normalise one raw article into a lowercase, space-separated token string.

    Steps:
      1. every character that is not an ASCII letter or digit becomes a space
         (this removes all punctuation);
      2. the text is lowercased and split on whitespace;
      3. tokens are dropped when they are pure numbers, a single character, or
         a stop word.

    The earlier version filtered ``text`` character by character, so no stop
    word was ever removed; filtering now happens on whole tokens.

    Args:
        text: The raw article text (str).
        stopwords: Optional iterable of stop words. When omitted, the
            notebook-level ``stopword`` list is used.

    Returns:
        The cleaned text as a single string with tokens joined by one space.
    """
    if stopwords is None:
        stopwords = stopword  # notebook-level Swahili stop-word list
    # Strip stray whitespace so an entry such as " alisema" still matches.
    stop_set = {word.strip().lower() for word in stopwords}

    # Replace punctuation and any other non-alphanumeric character with a
    # space, then lowercase so the comparison with the stop words is uniform.
    text = re.sub(r"[^A-Za-z0-9]", " ", text).lower()

    # A lone leading "b" (presumably left over from a bytes-literal prefix) is
    # removed together with all other single-character tokens.
    kept = [
        token
        for token in text.split()
        if len(token) > 1 and not token.isdigit() and token not in stop_set
    ]
    return " ".join(kept)

In [None]:
# Clean the article text of the train and test data with the same function.
for frame in (train, test):
    frame["content"] = frame["content"].apply(text_cleaning)

In [None]:
# Separate the model input (article text) from the target (category column).
X, y = train["content"], train["category"].values

In [None]:
# Bag-of-words features. lowercase=False because text_cleaning has already
# lowercased the text.
vectorizer = CountVectorizer(lowercase=False)

# Learn the vocabulary from the train text and encode it in one step.
X_transformed = vectorizer.fit_transform(X)

# Encode the test text with the vocabulary learned from the train text.
test_transformed = vectorizer.transform(test["content"])

In [None]:
# display the summary of the encoded train matrix (shape, dtype, stored elements)
X_transformed

<5151x73016 sparse matrix of type '<class 'numpy.int64'>'
	with 859943 stored elements in Compressed Sparse Row format>

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions in both parts.
split_options = dict(test_size=0.20, random_state=46, shuffle=True, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_transformed, y, **split_options)

In [None]:
# import numpy as np
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.utils import to_categorical

# # Constants
# MAX_SEQUENCE_LENGTH = 500  # Adjust as needed

# # Splitting the data
# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_transformed,
#     y,
#     test_size=0.20,
#     random_state=46,
#     shuffle=True,
#     stratify=y,
# )

# # Convert sparse matrix to dense matrix
# X_train_dense = X_train.toarray()
# X_valid_dense = X_valid.toarray()

# # Pad sequences to a fixed length
# X_train_padded = pad_sequences(X_train_dense, maxlen=MAX_SEQUENCE_LENGTH)
# X_valid_padded = pad_sequences(X_valid_dense, maxlen=MAX_SEQUENCE_LENGTH)

# # Convert labels to categorical if not already
# y_train_cat = to_categorical(y_train)
# y_valid_cat = to_categorical(y_valid)

# # Define the model
# model = Sequential()
# model.add(Embedding(input_dim=5000, output_dim=128, input_length=MAX_SEQUENCE_LENGTH))
# model.add(SpatialDropout1D(0.2))
# model.add(LSTM(100))  # CuDNN-compatible LSTM
# model.add(Dense(len(np.unique(y)), activation='softmax'))

# # Compile the model
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Print the model summary
# print(model.summary())

# # Train the model
# history = model.fit(
#     X_train_padded,
#     y_train_cat,
#     epochs=10,
#     batch_size=32,  # Reduce batch size
#     validation_data=(X_valid_padded, y_valid_cat),
#     verbose=1
# )

# # Evaluate the model
# loss, accuracy = model.evaluate(X_valid_padded, y_valid_cat, verbose=0)
# print(f'Validation Accuracy: {accuracy:.4f}')


### Create Classifier

In [None]:
# Create a classifier
import xgboost as xgb
from xgboost import XGBClassifier

# Initialize the XGBoost classifier; no arguments are passed, so the
# library's default hyper-parameters apply.
news_classifier = XGBClassifier()


In [None]:
# train the news_classifier on the training part of the split
news_classifier.fit(X_train,y_train)

In [None]:
# predict class probabilities for the validation data (used for the log loss below)
y_probas = news_classifier.predict_proba(X_valid)

In [None]:
# evaluate model performance with log_loss on the validation data
log_loss(y_valid, y_probas)

In [None]:
# create class-probability predictions for the test data
test_probas = news_classifier.predict_proba(test_transformed)

### Create Submission File

In [None]:
# Build the submission table: one probability column per category.
submission_cols = ["Kitaifa", "michezo", "Biashara", "Kimataifa", "Burudani"]
submission_df = pd.DataFrame(test_probas, columns=submission_cols)
submission_df["test_id"] = test["swahili_id"]  # add test_id

# Rearrange the columns so the id comes first.
submission_df = submission_df[["test_id", *submission_cols]]

# Save the submission file.
submission_df.to_csv(path + "first_submission.csv", index=False)

### Trying other models: XGBoost, CatBoost and LightGBM, plus an average of their predicted probabilities

In [None]:
pip install xgboost


In [None]:
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import numpy as np

# Cast the sparse feature matrices to float32 before fitting.
X_train, X_valid, test_transformed = (
    features.astype(np.float32)
    for features in (X_train, X_valid, test_transformed)
)

# One classifier per library; only CatBoost gets an argument (verbose=0
# silences its training log).
lgbm_classifier = LGBMClassifier()
xgb_classifier = XGBClassifier()
catboost_classifier = CatBoostClassifier(verbose=0)

# Train LightGBM, then XGBoost, then CatBoost. The last call stays a bare
# expression so the cell displays the fitted CatBoost model.
for booster in (lgbm_classifier, xgb_classifier):
    booster.fit(X_train, y_train)
catboost_classifier.fit(X_train, y_train)


In [None]:
fitted_models = (xgb_classifier, catboost_classifier, lgbm_classifier)

# Class probabilities on the validation set, one array per model
xgb_probas, catboost_probas, lgbm_probas = (
    model.predict_proba(X_valid) for model in fitted_models
)

# Class probabilities on the test set, one array per model
xgb_test_probas, catboost_test_probas, lgbm_test_probas = (
    model.predict_proba(test_transformed) for model in fitted_models
)



In [None]:
# Unweighted mean of the three models' probabilities, for validation and test.
valid_stack = (xgb_probas, catboost_probas, lgbm_probas)
test_stack = (xgb_test_probas, catboost_test_probas, lgbm_test_probas)
ensemble_probas = sum(valid_stack) / len(valid_stack)
ensemble_test_probas = sum(test_stack) / len(test_stack)


In [None]:
# Create the submission file for the second model (LightGBM).
# Assumes the predict_proba columns follow the 0..4 codes of category_mapping,
# which is the order listed here.
submission_cols = ["Kitaifa", "michezo", "Biashara", "Kimataifa", "Burudani"]

# NOTE(review): ensemble_test_probas is computed above but not used; this cell
# submits the LightGBM probabilities only. Confirm that this is intended.
submission_df = pd.DataFrame(lgbm_test_probas, columns=submission_cols)

# The row identifier is the `swahili_id` column. The previous `test[content']`
# was a syntax error and pointed at the article text instead of the id.
submission_df["test_id"] = test["swahili_id"]  # add test_id

# Rearrange the columns so the id comes first.
submission_df = submission_df[["test_id", *submission_cols]]

# Save the submission file.
submission_df.to_csv(path + "submission_model2.csv", index=False)