In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from collections import  Counter
import nltk
import re
import string
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Read the raw CSV, keep only the label column (v1) and the message column (v2),
# then discard exact duplicate rows.
data = (
    pd.read_csv("/content/spam.csv", encoding="latin-1")[['v1', 'v2']]
    .drop_duplicates()
)

# Report the dimensions of the de-duplicated frame
print("Shape of the data:", data.shape)

Shape of the data: (5169, 2)


In [3]:
# Count how many rows carry each distinct value of v1 (run before the labels are integer-encoded)
data["v1"].value_counts()

ham     4516
spam     653
Name: v1, dtype: int64

In [4]:
# Turn the string labels in v1 into integer codes
le = LabelEncoder()
data["v1"] = le.fit_transform(data["v1"])

# Number of NLTK tokens in each raw message.
# Despite its name, `df` is a Series of per-message token counts, not a DataFrame.
df = data["v2"].map(nltk.word_tokenize).map(len)

# English stop-word list taken from the NLTK corpus
all_stopwords = stopwords.words('english')

# Stop-word lookup used by final_form. It is built once as a frozenset:
# evaluating stopwords.words('english') for every token repeats that work needlessly,
# and set membership is a constant-time check.
_STOPWORD_SET = frozenset(stopwords.words('english'))


# Define a function to preprocess text data
def final_form(text):
    """Normalise one message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text and split it with ``nltk.word_tokenize``;
      2. keep only tokens for which ``str.isalnum()`` is true;
      3. drop English stop words;
      4. reduce each remaining token with the Porter stemmer ``ps``.

    An explicit punctuation test is unnecessary: a token that passes
    ``isalnum()`` consists solely of alphanumeric characters and therefore can
    never match anything in ``string.punctuation``.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (empty string if
        no token survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    kept = [tok for tok in tokens if tok.isalnum() and tok not in _STOPWORD_SET]
    return " ".join(ps.stem(tok) for tok in kept)

# Replace every message in v2 with its cleaned, stemmed form.
# This overwrites the column in place, so the raw text is no longer held in `data` afterwards.
data['v2'] = data['v2'].apply(final_form)

In [5]:
# Vectorise the cleaned messages with TF-IDF.
# NOTE(review): the vocabulary cap is the largest per-message token count stored in `df`;
# that looks like an arbitrary choice for max_features - confirm it is intended.
vocab_cap = df.max()
tfidfv = TfidfVectorizer(max_features=vocab_cap)
tfidf_matrix = tfidfv.fit_transform(data["v2"])
X = tfidf_matrix.toarray()   # dense array of TF-IDF features
y = data['v1'].values        # integer-encoded labels

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the vectoriser is fitted on all rows before this split, so the held-out
# rows also contribute to the learned vocabulary/idf weights - confirm that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, test_size=0.2)

# Initialize, train and score every candidate classifier on the same TF-IDF split
def fit_and_report(label, estimator, X_fit, y_fit, X_eval, y_eval):
    """Fit one classifier and print its accuracy and precision on the evaluation split.

    Parameters
    ----------
    label : str
        Prefix used in the two printed score lines.
    estimator : object
        Unfitted classifier exposing ``fit`` and ``predict``.
    X_fit, y_fit
        Training features and labels.
    X_eval, y_eval
        Held-out features and labels used for scoring.

    Returns
    -------
    tuple
        ``(estimator, predictions)``: the fitted estimator and its predictions
        for ``X_eval``.
    """
    estimator.fit(X_fit, y_fit)
    predictions = estimator.predict(X_eval)
    print(f"{label} Accuracy Score:", accuracy_score(y_eval, predictions))
    print(f"{label} Precision Score:", precision_score(y_eval, predictions))
    return estimator, predictions


# (label, estimator) pairs, listed in the order they are trained and reported.
# NOTE(review): every estimator uses its default settings and none is given a random_state,
# so the scores of the randomised models may vary between runs.
candidates = [
    ("Logistic Regression", LogisticRegression()),
    ("SVM", SVC()),
    ("Decision Tree", DecisionTreeClassifier()),
    ("KNeighbors", KNeighborsClassifier()),
    ("Random Forest", RandomForestClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("GradientBoosting", GradientBoostingClassifier()),
    ("XGBClassifier", XGBClassifier()),
    ("XGBRFClassifier", XGBRFClassifier()),
]

fitted = [
    fit_and_report(label, estimator, X_train, y_train, X_test, y_test)
    for label, estimator in candidates
]

# Keep the numbered names so that later cells referring to them continue to work.
(model1, model2, model3, model4, model5,
 model6, model7, model8, model9) = [estimator for estimator, _ in fitted]
(y_pred1, y_pred2, y_pred3, y_pred4, y_pred5,
 y_pred6, y_pred7, y_pred8, y_pred9) = [predictions for _, predictions in fitted]

Logistic Regression Accuracy Score: 0.9613152804642167
Logistic Regression Precision Score: 0.9454545454545454
SVM Accuracy Score: 0.9671179883945842
SVM Precision Score: 0.9727272727272728
Decision Tree Accuracy Score: 0.9506769825918762
Decision Tree Precision Score: 0.8222222222222222
KNeighbors Accuracy Score: 0.9535783365570599
KNeighbors Precision Score: 0.9787234042553191
Random Forest Accuracy Score: 0.9680851063829787
Random Forest Precision Score: 0.9338842975206612
AdaBoost Accuracy Score: 0.960348162475822
AdaBoost Precision Score: 0.8943089430894309
GradientBoosting Accuracy Score: 0.9593810444874274
GradientBoosting Precision Score: 0.9444444444444444
XGBClassifier Accuracy Score: 0.9709864603481625
XGBClassifier Precision Score: 0.95
XGBRFClassifier Accuracy Score: 0.9429400386847195
XGBRFClassifier Precision Score: 0.8910891089108911


In [6]:
# Conclusion: tabulate accuracy and precision for all nine classifiers
print("\nModel evaluation completed.")
print("Here are the evaluation results for each model:")

# One left-aligned, 30-character-wide column per field; shared by the header and every row
row_template = "{:<30} {:<30} {:<30}"

# Column headings
print(row_template.format("Model", "Accuracy", "Precision"))

# Display names and the matching test-set predictions, kept in the same order
model_names = [
    "Logistic Regression", "SVM", "Decision Tree", "KNeighbors", "Random Forest",
    "AdaBoost", "Gradient Boosting", "XGBClassifier", "XGBRFClassifier",
]
y_preds = [y_pred1, y_pred2, y_pred3, y_pred4, y_pred5, y_pred6, y_pred7, y_pred8, y_pred9]

# Scores are recomputed here from the stored predictions
accuracy_scores = [accuracy_score(y_test, predicted) for predicted in y_preds]
precision_scores = [precision_score(y_test, predicted) for predicted in y_preds]

# One table row per model
for name, accuracy, precision in zip(model_names, accuracy_scores, precision_scores):
    print(row_template.format(name, accuracy, precision))


# Summary: We conducted a text classification experiment to predict spam or ham messages using various machine learning models.
# We preprocessed the text data by tokenizing, removing stopwords and punctuation, and applying stemming.
# TF-IDF vectors were created from the processed text. We trained models including Logistic Regression, SVM, Decision Tree, etc.,
# and evaluated their accuracy and precision on a test dataset. The achieved results provide insights into the effectiveness of
# different models for this task.


Model evaluation completed.
Here are the evaluation results for each model:
Model                          Accuracy                       Precision                     
Logistic Regression            0.9613152804642167             0.9454545454545454            
SVM                            0.9671179883945842             0.9727272727272728            
Decision Tree                  0.9506769825918762             0.8222222222222222            
KNeighbors                     0.9535783365570599             0.9787234042553191            
Random Forest                  0.9680851063829787             0.9338842975206612            
AdaBoost                       0.960348162475822              0.8943089430894309            
Gradient Boosting              0.9593810444874274             0.9444444444444444            
XGBClassifier                  0.9709864603481625             0.95                          
XGBRFClassifier                0.9429400386847195             0.8910891089108911      

In [7]:
# Load the tab-separated SMS file; column names are supplied explicitly via `names`.
import pandas as pd

messages = pd.read_csv(
    '/content/SMSSpamCollection.txt',
    sep='\t',
    names=["label", "message"],
)
# `df` and `messages` refer to the same DataFrame.
# Rebinding `df` here replaces the token-count Series created earlier in the notebook.
df = messages
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Extract features and labels
X = df['message'].tolist()

# Encode the labels as plain integers: 1 for 'spam', 0 otherwise.
# An explicit comparison is used instead of pd.get_dummies because get_dummies returns
# boolean columns by default on pandas >= 2.0, and boolean labels are not valid class
# indices for the sparse categorical cross-entropy loss used later in the notebook.
y = (df['label'] == 'spam').astype(int).tolist()

# Split the dataset into training and testing sets (80/20, fixed seed for a repeatable split).
# Note: this rebinds X, y and the four split variables used by the TF-IDF models earlier on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [9]:
# Install the 'transformers' library
! pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.2 MB/s[0m eta [36m0:00:0

In [10]:
# Load the fast DistilBERT tokenizer for the uncased base checkpoint
from transformers import DistilBertTokenizerFast

tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
# Tokenize and encode both splits with identical options:
# truncation and padding are both switched on.
# The two splits are encoded in separate calls.
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(X_train, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)

In [12]:
# Wrap each split as a tf.data.Dataset of (encoded-inputs dict, label) pairs
train_features = dict(train_encodings)
test_features = dict(test_encodings)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [13]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# Settings passed to TFTrainingArguments.
# NOTE(review): in the visible notebook only `training_args.strategy` is read afterwards and
# the model is trained through Keras `fit`, so the epoch, batch-size, warmup and weight-decay
# values below do not appear to influence training; TFTrainer is imported but never used - confirm.
training_config = {
    "output_dir": './results',            # output directory
    "num_train_epochs": 2,                # total number of training epochs
    "per_device_train_batch_size": 4,     # batch size per device during training
    "per_device_eval_batch_size": 8,      # batch size for evaluation
    "warmup_steps": 500,                  # number of warmup steps for learning rate scheduler
    "weight_decay": 0.01,                 # strength of weight decay
    "logging_dir": './logs',              # directory for storing logs
    "logging_steps": 10,
}
training_args = TFTrainingArguments(**training_config)

In [14]:
# Build, compile and fine-tune DistilBERT for sequence classification.
# The optimizer, the loss and the compile step are created inside the same distribution
# strategy scope as the model, so everything that may create variables belongs to one strategy.
with training_args.strategy.scope():
    model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

    # Initialize optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

    # The model outputs raw logits, hence from_logits=True; labels are integer class ids
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Compile the model
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

# Train for two epochs on shuffled batches of 16.
# `batch_size` is not passed to fit(): the dataset is already batched, and Keras documents
# that batch_size must not be specified for dataset inputs.
# The returned History is kept so the per-epoch loss/accuracy can be inspected afterwards.
history = model.fit(train_dataset.shuffle(1000).batch(16), epochs=2)



Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7efa537be0e0>

In [15]:
from sklearn.metrics import confusion_matrix, f1_score

# Run the fine-tuned model over the test split in batches of 16, then take the
# index of the largest logit in each row as the predicted class.
test_output = model.predict(test_dataset.batch(16))
y_pred = tf.argmax(test_output.logits, axis=1).numpy()

# Confusion matrix of true labels (y_test) against predicted labels (y_pred):
# it holds the counts of true negatives, false positives, false negatives and true positives.
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:",conf_matrix)

# Calculate the F1 score from the true and predicted labels
# formula : 2 * (precision * recall) / (precision + recall)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)
# Summary: This section performs text classification by fine-tuning the DistilBERT model from the Hugging Face Transformers library.
# We load the SMS spam dataset, split it into train and test sets, and use the DistilBERT tokenizer to encode the text.
# Then, we create TensorFlow Datasets and define TFTrainingArguments, which this notebook uses only to obtain a distribution
# strategy scope; no TFTrainer is instantiated. The model is compiled and trained with Keras `compile`/`fit` on the training data,
# and finally it is evaluated on the test dataset with a confusion matrix and an F1 score. The code showcases how to use
# pre-trained transformer models for sequence classification tasks.

Confusion Matrix: [[953   2]
 [  4 156]]
F1 Score: 0.9811320754716981


In [16]:
# Persist the fine-tuned model to Google Drive.
# NOTE(review): this assumes Drive is already mounted under /content/drive; no mount step
# is visible in this notebook - confirm before running on a fresh runtime.
model_save_path = '/content/drive/MyDrive/code/spam_classification'
model.save(model_save_path)



# New Section