In [4]:
import csv
import pandas as pd
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
datafile = pd.read_csv('swahili.csv')


In [6]:
datafile.head()

Unnamed: 0,text,labels
0,chaguo nzuri,positive
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive
2,napenda bidhaa hii,positive
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive
4,ninavaa kila siku na inashikilia vizuri sana,positive


In [7]:
#url removal
datafile['text'] = datafile['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [8]:
#special character removal
datafile['text'] = datafile['text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))  

In [9]:
#convert to lowercase
datafile = datafile.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

In [10]:
#tokenize
datafile['tokens'] = datafile['text'].apply(nltk.word_tokenize)
print(datafile)


                                                   text    labels  \
0                                          chaguo nzuri  positive   
1              kifaa hiki ni nzuri katika hali kadhaa 1  positive   
2                                    napenda bidhaa hii  positive   
3     ninapenda simu hii ni nzuri sana na ina sifa n...  positive   
4          ninavaa kila siku na inashikilia vizuri sana  positive   
...                                                 ...       ...   
3092  nafikiri chakula chapasa kuwa na ladha na umbi...  negative   
3093                   hamu ya kula ilitoweka mara moja  negative   
3094            kwa ujumla sikuvutiwa na nisirudi nyuma  negative   
3095  mambo yote yaliyoonwa yalikuwa chini ya kiwang...  negative   
3096  basi ni kana kwamba nilipoteza maisha yangu ya...  negative   

                                                 tokens  
0                                       [chaguo, nzuri]  
1     [kifaa, hiki, ni, nzuri, katika, hali, kadhaa, 1]

In [11]:
#removestopwords
from nltk.corpus import stopwords

def stopword_removal(tokens):
    swa_stopwords =  stopwords.words('swahili')
    clean_tokens = [token for token in tokens if token.lower() not in swa_stopwords]
    return clean_tokens
datafile['tokens'] = datafile['tokens'].apply(stopword_removal)
datafile.head()
        

Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
datafile.head()

Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."


In [16]:
#datatraining
X_train, X_test, y_train, y_test = train_test_split(datafile['text'], datafile['labels'], test_size = 0.2, random_state = 42)
datafile.head()

Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."


In [19]:
#vectorization
vectorizer = CountVectorizer(stop_words = 'swahili')
swa_stopwords = ('swahili')
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
datafile.head()

Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."


In [20]:
#train a naves bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)
datafile.head()

Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."


In [21]:
#classifier evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


y_pred = clf.predict(X_test_vec)


accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

precision = precision_score(y_test, y_pred, average='macro')
print("Precision:", precision)


recall = recall_score(y_test, y_pred, average='macro')
print("Recall:", recall)


f1_score = f1_score(y_test, y_pred, average='macro')
print("F1-score:", f1_score)
datafile.head()

Accuracy: 0.7548387096774194
Precision: 0.7575805987794357
Recall: 0.7521030740844983
F1-score: 0.7525210084033613


Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."


In [27]:
#train the SVM model
from sklearn.svm import SVC

svm_clf = SVC(kernel='linear', C=1, max_iter=30000)


svm_clf.fit(X_train_vec, y_train)


svm_acc = svm_clf.score(X_test_vec, y_test)

print("Accuracy of SVM classifier: {:.2f}%".format(svm_acc * 100))
datafile.head(1)

ValueError: Found input variables with inconsistent numbers of samples: [2477, 105]

In [24]:
#Evaluation of SVM
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=44)

svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
datafile.head()

Confusion Matrix:
[[17  0  0]
 [ 0 13  0]
 [ 0  1 14]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       0.93      1.00      0.96        13
           2       1.00      0.93      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45



Unnamed: 0,text,labels,tokens
0,chaguo nzuri,positive,"[chaguo, nzuri]"
1,kifaa hiki ni nzuri katika hali kadhaa 1,positive,"[kifaa, hiki, nzuri, katika, hali, kadhaa, 1]"
2,napenda bidhaa hii,positive,"[napenda, bidhaa, hii]"
3,ninapenda simu hii ni nzuri sana na ina sifa n...,positive,"[ninapenda, simu, hii, nzuri, sana, na, ina, s..."
4,ninavaa kila siku na inashikilia vizuri sana,positive,"[ninavaa, kila, siku, na, inashikilia, vizuri,..."
