# **Imports & Downloads**

In [68]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# **Importing The Dataset**

In [69]:
dataset = pd.read_csv('spam.csv',encoding='latin-1')[['v1', 'v2']]
dataset.columns = ['Y', 'X']
dataset

Unnamed: 0,Y,X
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


# **Preprocess By Tokenizing**

In [77]:
#Test
text = "This is a sample text data for preprocessing. Hello World 103"

In [72]:
tokens = nltk.word_tokenize(text)
stop_words = stopwords.words('english')
filtered_tokens = [token for token in tokens if token not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

print("Lemmatized tokens:", lemmatized_tokens)

Lemmatized tokens: ['This', 'sample', 'text', 'data', 'preprocessing', '.', 'Hello', 'World', '103']


In [73]:
def preprocess_text(textdata):
  stop_words = stopwords.words('english')
  processed_data = []

  for text in textdata:
    tokens = nltk.word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    processed_text = ' '.join(lemmatized_tokens)
    processed_data.append(processed_text)
  return processed_data

preprocessed_data = preprocess_text(dataset.X)
for i, text in enumerate(dataset.X):
  print(f"Preprocessed tokens {i+1} : {preprocessed_data[i]}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Preprocessed tokens 573 : Can open door ?
Preprocessed tokens 574 : Waiting call .
Preprocessed tokens 575 : Nope waiting sch 4 daddy ...
Preprocessed tokens 576 : You ? 1,000 cash ? 2,000 prize ! To claim , call09050000327
Preprocessed tokens 577 : I 'm tired arguing week week . Do want , 'll .
Preprocessed tokens 578 : ÌÏ wait 4 sch finish ard 5 ..
Preprocessed tokens 579 : mobile number å£5000 , claim call u back ring claim hot line 09050005321 .
Preprocessed tokens 580 : Arngd marriage u r walkin unfortuntly snake bite u. bt love marriage dancing frnt snake & amp ; sayin Bite , bite .
Preprocessed tokens 581 : Huh early .. Then Ì_ dinner outside izzit ?
Preprocessed tokens 582 : Ok anyway need change said
Preprocessed tokens 583 : We tried contact reply offer 750 min 150 textand new video phone call 08002988890 reply free delivery tomorrow
Preprocessed tokens 584 : ex-wife able kid . Do want kid one day ?
Preprocessed

# **Spilting Into Train & Test**

In [78]:
X_train, X_test, y_train, y_test = train_test_split(preprocessed_data, dataset.Y, test_size=0.2)

# **Text Classification Using Naive Bayes**

In [75]:
vectorizer = TfidfVectorizer(max_features=2000)
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)
clf = MultinomialNB()
clf.fit(X_train_features, y_train)

y_pred = clf.predict(X_test_features)

# **Evaluating The Performance Of The Model**

In [76]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred, average='weighted')
print(f"Recall: {recall:.4f}")

# F1-score
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9803
Precision: 0.9807
Recall: 0.9803
F1-score: 0.9796
