In [1]:
import pandas as pd

In [8]:
import re

In [16]:
# Lift pandas' column-width limit so full message text is shown in output.
pd.options.display.max_colwidth = None

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the message dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(io='dataset.xlsx')

#### Removing short trailing numbers (1–4 digits, below 2500) from the end of messages

In [10]:
# A 1-4 digit number at the very end of the text, optionally followed by
# '.' or ',' and trailing whitespace. Group 1 captures the digits.
_TRAILING_NUMBER_RE = re.compile(r'(\b\d{1,4})[.,]?\s*$')


def clean_text(ujumbe, limit=2500):
    """Remove a short trailing number from a message.

    If the message ends with a 1-4 digit number (optionally followed by '.'
    or ',' and whitespace) whose value is strictly below ``limit``, the number
    and the whitespace around it are removed and the remainder is returned
    stripped on both ends. Otherwise the message is returned unchanged.

    Parameters
    ----------
    ujumbe : str or any
        The message text. Non-string values (e.g. NaN from an empty cell or
        None) are returned as-is instead of raising ``TypeError``.
    limit : int, default 2500
        Exclusive upper bound; trailing numbers >= ``limit`` are kept.

    Returns
    -------
    The cleaned string, or the original value when nothing is removed.

    Notes
    -----
    Only one trailing number is removed per call, so calling the function
    again on its own output may remove a further number. A trailing decimal
    such as "3.5" loses only its final digit group ("3." remains).
    """
    # Missing cells arrive as float NaN / None; pass them through untouched.
    if not isinstance(ujumbe, str):
        return ujumbe

    match = _TRAILING_NUMBER_RE.search(ujumbe)
    if match is None or int(match.group(1)) >= limit:
        return ujumbe

    # The match always runs to the end of the string, so everything before
    # the digits is the text to keep; strip() drops the separating whitespace.
    return ujumbe[:match.start()].strip()

In [11]:
# Run clean_text over every message and write the result back to 'ujumbe'.
# NOTE(review): this overwrites the column, and clean_text removes one
# trailing number per call, so re-running this cell can strip more text.
df['ujumbe'] = df['ujumbe'].map(clean_text)

In [17]:
# Show the cleaned message column.
print(df.loc[:, 'ujumbe'])

0                           malipo yako yamezuiwa bonyeza link hii kuthibitisha verifynow0tz
1                                                nimepata ujumbe wako wa jana nashukuru sana
2                                     tuma tsh 5000 kwa 0641743818 ili ushinde bahati nasibu
3                                                habari ndugu natumai uko salama siku ya leo
4       malipo yamekataa kutokana na hitilafu ya mfumo tuma tsh 20000 kwa akaunti 0670798857
                                                ...                                         
4994                                              mama amesema tukutane nyumbani saa 7 jioni
4995                     jiunge na freemason upate utajiri na mafanikio piga 0750417795 sasa
4996                                       nimefanikiwa kufika salama asante kwa maombi yako
4997                     jiunge na freemason upate utajiri na mafanikio piga 0767152059 sasa
4998                                       nimefanikiwa kufika salama 

#### Split the data into training and test sets

In [18]:
# Hold out 20% of the rows for testing, keeping the 'aina' class proportions
# the same in both parts; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['aina'],
)

#### Initialize CountVectorizer

In [19]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
vectorizer = CountVectorizer()

#### Fit the vectorizer on the training data and transform both splits (using the 'ujumbe' column)

In [20]:
# The vocabulary is learned from the training messages only; the test
# messages are then encoded with that same vocabulary.
X_train = vectorizer.fit_transform(raw_documents=train_df['ujumbe'])
X_test = vectorizer.transform(raw_documents=test_df['ujumbe'])

#### Get the labels

In [21]:
# Target labels come from the 'aina' column of each split.
y_train, y_test = train_df['aina'], test_df['aina']

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Train the Multinomial Naive Bayes model

In [23]:
# Create the classifier with default settings and fit it on the training split.
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)

### Make predictions on the test set

In [24]:
# Predict a label for every vectorized test message.
y_pred = nb_model.predict(X_test)

### Evaluate the model

In [25]:
# Fraction of test messages whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 1.0


In [27]:
# Header and per-class precision/recall/F1 table, each on its own line.
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00       500
        spam       1.00      1.00      1.00       500

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



### Display confusion matrix

In [28]:
# Rows are true labels and columns are predicted labels, ordered ham, spam.
print(
    "\nConfusion Matrix:",
    confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['ham', 'spam']),
    sep="\n",
)


Confusion Matrix:
[[500   0]
 [  0 500]]


### Cross-Validation

In [29]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Cross-validate vectorizer + classifier together as one pipeline, so each
# fold learns its vocabulary from that fold's training messages only.
# Fitting the vectorizer on the full dataset beforehand would let vocabulary
# from the validation folds leak into training. Using a fresh pipeline also
# leaves the already fitted `vectorizer` and `nb_model` untouched.
cv_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())
scores = cross_val_score(cv_pipeline, df['ujumbe'], df['aina'], cv=5, scoring='accuracy')
print("Cross-Validation Accuracy:", scores.mean(), "+/-", scores.std())

Cross-Validation Accuracy: 0.9996 +/- 0.0008000000000000006


### Check for Data Leakage

#### The dataset is still not clean: the 100% accuracy is caused by the large number of duplicate messages, which can appear in both the training and test sets. We need to clean the dataset and retrain the model.

In [32]:
# Count rows whose message text already appeared in an earlier row.
print("Duplicate messages:", df.duplicated(subset='ujumbe').sum())

Duplicate messages: 2462


#### Remove duplicates

In [33]:
# Keep only the first occurrence of each distinct message text.
df = df[~df['ujumbe'].duplicated()]

In [34]:
# Report how many rows are left after de-duplication.
print("Number of rows after removing duplicates:", df.shape[0])

Number of rows after removing duplicates: 2537


#### Repeating the same procedure on the de-duplicated dataset

##### Split the data

In [35]:
# Same 80/20 stratified split as before, now on the de-duplicated frame.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['aina'],
)

#### Apply Bag of Words using CountVectorizer

In [36]:
# Fresh vectorizer: vocabulary learned from the new training split only,
# then reused to encode the new test split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(raw_documents=train_df['ujumbe'])
X_test = vectorizer.transform(raw_documents=test_df['ujumbe'])

# Target labels for each split.
y_train, y_test = train_df['aina'], test_df['aina']

#### Train the Multinomial Naive Bayes model

In [37]:
# Retrain a fresh classifier on the de-duplicated training split.
nb_model = MultinomialNB()
nb_model.fit(X=X_train, y=y_train)

#### Evaluate the model

In [38]:
# Score the retrained model on the de-duplicated test split.
y_pred = nb_model.predict(X_test)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)
# Rows are true labels and columns are predicted labels, ordered ham, spam.
print(
    "\nConfusion Matrix:",
    confusion_matrix(y_true=y_test, y_pred=y_pred, labels=['ham', 'spam']),
    sep="\n",
)

Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00         8
        spam       1.00      1.00      1.00       500

    accuracy                           1.00       508
   macro avg       1.00      1.00      1.00       508
weighted avg       1.00      1.00      1.00       508


Confusion Matrix:
[[  8   0]
 [  0 500]]


#### Predict on new messages

In [44]:
# Two hand-written messages to sanity-check the trained model.
# NOTE(review): unlike the training data, these are not passed through
# clean_text before being vectorized.
new_messages = [
    "tuma tsh 100 upate bonus 5000",
    "habari, nimefika salama",
]
# Encode with the vocabulary fitted on the training split, then classify.
new_messages_vector = vectorizer.transform(new_messages)
predictions = nb_model.predict(new_messages_vector)
print("\nPredictions for new messages:", predictions)


Predictions for new messages: ['spam' 'ham']


In [69]:
# Two more hand-written messages, one with mixed case and a phone-like token.
# NOTE(review): unlike the training data, these are not passed through
# clean_text before being vectorized.
new_messages = [
    "au iyo ela nitumie kwenye AirtelMoney hii O695189851 Jina litoke JACOBO FIKIRI",
    "Jiunge na freemason",
]
# Encode with the vocabulary fitted on the training split, then classify.
new_messages_vector = vectorizer.transform(new_messages)
predictions = nb_model.predict(new_messages_vector)
print("\nPredictions for new messages:", predictions)


Predictions for new messages: ['spam' 'spam']


In [55]:
# A payment-failure style message and a casual greeting.
# NOTE(review): unlike the training data, these are not passed through
# clean_text before being vectorized.
new_messages = [
    "malipo yamekataa kutokana na hitilafu ya mfumo tuma tsh 10000 kwa akaunti 0625115068",
    "mambo ndugu yangu nimekumiss sana",
]
# Encode with the vocabulary fitted on the training split, then classify.
new_messages_vector = vectorizer.transform(new_messages)
predictions = nb_model.predict(new_messages_vector)
print("\nPredictions for new messages:", predictions)


Predictions for new messages: ['spam' 'ham']


#### The dataset needs to be improved; the same procedure can then be used to retrain the model

#### Dropping the `tokens` column and saving the new dataset

#### Saving the dataset to CSV

In [59]:
# Write the current frame to CSV without the index column.
# NOTE(review): this runs before the 'tokens' column is dropped; the same
# file is written again after the drop further down.
df.to_csv(path_or_buf='dataset2.csv', index=False)

In [60]:
# Inspect the frame while it still has the 'tokens' column. The label says
# "before" because the drop has not run yet at this point.
print("Columns before dropping 'tokens':", df.columns.tolist())
print(df.head())

Columns after dropping 'tokens': ['ujumbe', 'aina', 'tokens']
                                                                                 ujumbe  \
0                      malipo yako yamezuiwa bonyeza link hii kuthibitisha verifynow0tz   
1                                           nimepata ujumbe wako wa jana nashukuru sana   
2                                tuma tsh 5000 kwa 0641743818 ili ushinde bahati nasibu   
3                                           habari ndugu natumai uko salama siku ya leo   
4  malipo yamekataa kutokana na hitilafu ya mfumo tuma tsh 20000 kwa akaunti 0670798857   

   aina  \
0  spam   
1   ham   
2  spam   
3   ham   
4  spam   

                                                                                                                         tokens  
0                                     ['malipo', 'yako', 'yamezuiwa', 'bonyeza', 'link', 'hii', 'kuthibitisha', 'verifynow0tz']  
1                                                        ['nimep

#### Dropping the `tokens` column

In [61]:
# Remove the 'tokens' column; only 'ujumbe' and 'aina' are kept for export.
df = df.drop(columns=['tokens'])

In [62]:
# Confirm the remaining columns and preview the first five rows.
print("Columns after dropping 'tokens':", list(df.columns))
print(df.head(5))

Columns after dropping 'tokens': ['ujumbe', 'aina']
                                                                                 ujumbe  \
0                      malipo yako yamezuiwa bonyeza link hii kuthibitisha verifynow0tz   
1                                           nimepata ujumbe wako wa jana nashukuru sana   
2                                tuma tsh 5000 kwa 0641743818 ili ushinde bahati nasibu   
3                                           habari ndugu natumai uko salama siku ya leo   
4  malipo yamekataa kutokana na hitilafu ya mfumo tuma tsh 20000 kwa akaunti 0670798857   

   aina  
0  spam  
1   ham  
2  spam  
3   ham  
4  spam  


In [63]:
# Save the cleaned, de-duplicated frame to CSV without the index column.
df.to_csv(path_or_buf='dataset2.csv', index=False)