# Text Classification using ML

In [15]:
# Import LIBs
import joblib
import spacy
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the small English spaCy pipeline ("en_core_web_sm").
# NOTE(review): `nlp` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping this load.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the labelled text messages from the CSV export, then preview the
# first five rows to check the columns came through as expected.
df = pd.read_csv("SPAM text message 20170820 - Data.csv")
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# EDA & Data Preprocessing

In [None]:
# Summarise the frame: row count, column names, dtypes and non-null counts.
# The non-null counts show whether any value is missing before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Count the messages in each category to see which labels exist and how
# balanced they are (the train/test split further down stratifies on them).
df["Category"].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [None]:
# Separate the model input (message text) from the target (category label)
text, labels = df["Message"], df["Category"]

In [None]:
# Split Data into Train & Test
# 20% of the messages are held out for testing. `stratify = labels` keeps the
# ham/spam proportions the same in both splits, and the fixed `random_state`
# makes the split repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    text, labels, test_size = 0.2, stratify = labels, random_state = 42
)
print(f"Train text shape : {xtrain.shape}")
print(f"Test text shape : {xtest.shape}")
# Fixed: this line reports `ytrain`, so it is labelled "Train" (it used to
# say "Test", which made two output lines carry the same label).
print(f"Train labels shape : {ytrain.shape}")
print(f"Test labels shape : {ytest.shape}")

Train text shape : (4457,)
Test text shape : (1115,)
Test labels shape : (4457,)
Test labels shape : (1115,)


In [None]:
# Create one pipeline per candidate algorithm
def _make_pipeline(clf):
    """Return a TF-IDF -> classifier pipeline that accepts raw message strings.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn classifier used as the final "clf" step.

    Returns
    -------
    Pipeline
        Two-step pipeline ("tfidf", "clf"). Every call creates its own
        TfidfVectorizer (English stop words removed), so the pipelines do
        not share any fitted state.
    """
    return Pipeline([
        ("tfidf", TfidfVectorizer(stop_words = "english")),
        ("clf", clf),
    ])


pl_MNB = _make_pipeline(MultinomialNB())
pl_CNB = _make_pipeline(ComplementNB())
# LinearSVC draws random numbers when it solves the dual problem, so the seed
# is pinned (same value as the train/test split) to keep re-runs repeatable.
pl_SVC = _make_pipeline(LinearSVC(random_state = 42))


In [27]:
# Train the MultinomialNB pipeline on the training split and predict the
# held-out messages (fit() returns the pipeline itself, so the calls chain).
MNB_pred = pl_MNB.fit(xtrain, ytrain).predict(xtest)

# Compute both metrics first, then print the report in one place.
mnb_matrix = confusion_matrix(ytest, MNB_pred)
mnb_report = classification_report(ytest, MNB_pred)

print("MultinomialNB results")
print("Confusion matrix : \n", mnb_matrix)
print("Classification report : \n", mnb_report)

MultinomialNB results
Confusion matrix : 
 [[966   0]
 [ 34 115]]
Classification report : 
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [28]:
# Train the ComplementNB pipeline on the training split and predict the
# held-out messages (fit() returns the pipeline itself, so the calls chain).
CNB_pred = pl_CNB.fit(xtrain, ytrain).predict(xtest)

# Compute both metrics first, then print the report in one place.
cnb_matrix = confusion_matrix(ytest, CNB_pred)
cnb_report = classification_report(ytest, CNB_pred)

print("ComplementNB results")
print("Confusion matrix : \n", cnb_matrix)
print("Classification report : \n", cnb_report)

ComplementNB results
Confusion matrix : 
 [[940  26]
 [  8 141]]
Classification report : 
               precision    recall  f1-score   support

         ham       0.99      0.97      0.98       966
        spam       0.84      0.95      0.89       149

    accuracy                           0.97      1115
   macro avg       0.92      0.96      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [29]:
# Train the LinearSVC pipeline on the training split and predict the
# held-out messages (fit() returns the pipeline itself, so the calls chain).
SVC_pred = pl_SVC.fit(xtrain, ytrain).predict(xtest)

# Compute both metrics first, then print the report in one place.
svc_matrix = confusion_matrix(ytest, SVC_pred)
svc_report = classification_report(ytest, SVC_pred)

print("LinearSVC results")
print("Confusion matrix : \n", svc_matrix)
print("Classification report : \n", svc_report)

LinearSVC results
Confusion matrix : 
 [[964   2]
 [ 18 131]]
Classification report : 
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       0.98      0.88      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



##### In our case, misclassifying a ham message as spam (a false positive) is the more costly error, but missed spam (false negatives) still matters.

###### MultinomialNB never flags ham as spam (0 false positives) but misses a lot of spam (34 of 149, recall 0.77).

###### ComplementNB detects spam well (recall 0.95) but misclassifies many ham messages as spam (26 false positives).

###### LinearSVC offers a balanced trade-off, with only 2 false positives and good spam detection (recall 0.88).

##### Thus, LinearSVC is the most appropriate choice.

In [13]:
# Smoke-test the fitted LinearSVC pipeline on one hand-written message.
# NOTE(review): "reword" looks like a typo for "reward"; it is left untouched
# because the string is the model input and changing it could change the result.
new_msg = "you have won a $100000 prize, contact us for the reword"
# The message is wrapped in a list, so the prediction comes back as a
# one-element array.
new_msg_pred = pl_SVC.predict([new_msg])
new_msg_pred

array(['spam'], dtype=object)

In [30]:
# Save Best Model
# The whole fitted pipeline (TF-IDF vectorizer + LinearSVC) is persisted, so
# the saved object can score raw message strings directly after loading.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
model_path = "spam_classifier_LSVC.pkl"
joblib.dump(pl_SVC, model_path)
# Fixed: the message was missing its closing quote; the path now comes from a
# single variable so the file name and the message cannot drift apart.
print(f"Model Saved as '{model_path}'")

Model Saved as 'spam_classifier_LSVC.pkl
