In [1]:
import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [3]:
df = pd.read_csv("/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv")

In [4]:
df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
df.columns

Index(['Category', 'Message'], dtype='object')

In [9]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

# This dataset in imbalanced, as ham are 4825 and spam are 747. So, it Complement NB is suitable for this.  

In [10]:
X = df['Message']
y = df['Category']

In [13]:
len(X)

5572

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y , test_size= 0.2, random_state= 22)

In [15]:
len(X_train)

4457

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [23]:
pipeMNB = Pipeline([('tfidf', TfidfVectorizer()),('clf',MultinomialNB())])
pipeCNB = Pipeline([('tfidf', TfidfVectorizer()),('clf',ComplementNB())])
pipeSVC = Pipeline([('tfidf', TfidfVectorizer()),('clf',LinearSVC())])

# Predicted Accuracy Score MNB

In [25]:
pipeMNB.fit(X_train, y_train)
predictMNB = pipeMNB.predict(X_test)
predictMNB

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'ham'], dtype='<U4')

In [28]:
print(f'MNB: {accuracy_score(y_test,predictMNB):.2f}')

MNB: 0.95


# Predicted Accuracy Score CNB

In [29]:
pipeCNB.fit(X_train, y_train)
predictCNB = pipeCNB.predict(X_test)
predictCNB

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'ham'], dtype='<U4')

In [30]:
print(f'CNB: {accuracy_score(y_test,predictCNB):.2f}')

CNB: 0.98


# Predicted Accuracy Score SVC

In [31]:
pipeSVC.fit(X_train, y_train)
predictSVC = pipeSVC.predict(X_test)
predictSVC

array(['ham', 'ham', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [32]:
print(f'SVC: {accuracy_score(y_test,predictSVC):.2f}')

SVC: 0.99


# Classification Report 

In [36]:
print(classification_report(y_test, predictMNB))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.90      0.94       150

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [37]:
print(classification_report(y_test, predictCNB))

              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       965
        spam       0.94      0.89      0.91       150

    accuracy                           0.98      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [38]:
print(classification_report(y_test, predictSVC))

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.90      0.94       150

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115



# Test Model on random message

In [39]:
msg = 'You have won the $10000 price. Contact us to collect your amount'

In [41]:
MNB = pipeMNB.predict([msg])
print(MNB)

['ham']


In [42]:
CNB = pipeCNB.predict([msg])
print(CNB)

['spam']


In [43]:
SVC = pipeSVC.predict([msg])
print(SVC)

['spam']
