In [None]:
from ast import increment_lineno
import numpy as np
import pandas as pd
# Load the tab-separated SMS corpus (the file has no header row, so the two
# column names are supplied here) and encode the target as an integer:
# ham -> 0, spam -> 1.
dataset = pd.read_csv(
    "SMSSpamCollection",
    sep='\t',
    names=['label', 'message'],
).assign(label=lambda frame: frame['label'].map({'ham': 0, 'spam': 1}))
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Bar chart of the class frequencies before any re-balancing.
plt.figure(figsize=(8, 8))
g = sns.countplot(data=dataset, x="label")
g.set_title('Countplot for Spam vs Ham as imbalanced dataset')
g.set_xlabel('Is the SMS Spam?')
p = g.set_ylabel('Count')
# Handling the imbalanced dataset by oversampling the spam class: the spam
# rows are replicated until there are roughly as many spam as non-spam rows.
# NOTE(review): the duplication happens before the train/test split further
# down, so identical spam messages can land in both partitions and inflate
# the reported scores - consider oversampling the training split only.
only_spam = dataset[dataset['label'] == 1]
only_ham = dataset[dataset['label'] == 0]
print("Number of Spam SMS: ",len(only_spam))
print("Number of Ham SMS: ",len(only_ham))

# Replication factor: how many whole times the spam rows fit into the
# non-spam rows. With no spam rows at all there is nothing to replicate.
non_spam_rows = dataset.shape[0] - only_spam.shape[0]
count = non_spam_rows // only_spam.shape[0] if only_spam.shape[0] else 0

# One copy of the spam rows is already in the dataset, so count - 1 more
# copies are appended.
extra_copies = max(count - 1, 0)
print("Number of SMS to be added: ", extra_copies * len(only_spam))

if extra_copies:
  # A single concat avoids re-copying the growing frame on every iteration;
  # ignore_index gives every row a unique label instead of repeating the
  # index labels of the replicated spam rows.
  dataset = pd.concat([dataset] + [only_spam] * extra_copies, ignore_index=True)
dataset
# Bar chart of the class frequencies after oversampling.
plt.figure(figsize=(8, 8))
g = sns.countplot(data=dataset, x="label")
g.set_title('Countplot for Spam vs Ham as balanced dataset')
g.set_xlabel('Is the SMS Spam?')
p = g.set_ylabel('Count')
# Creating new feature word_count: the number of whitespace-separated tokens
# in each message.
dataset['word_count'] = dataset['message'].map(lambda text: len(text.split()))
dataset
# Side-by-side word-count distributions: ham on the left, spam on the right.
plt.figure(figsize=(12, 6))

# Panel (1,1): ham messages.
plt.subplot(1, 2, 1)
ham_word_counts = dataset.loc[dataset["label"] == 0, "word_count"]
g = sns.histplot(ham_word_counts, kde=True)
g.set_title('Distribution of Word Count for Ham SMS')

# Panel (1,2): spam messages, drawn in red.
plt.subplot(1, 2, 2)
spam_word_counts = dataset.loc[dataset["label"] == 1, "word_count"]
g = sns.histplot(spam_word_counts, color="red", kde=True)
p = g.set_title('Distribution of Word Count for Spam SMS')

plt.tight_layout()
plt.show()

# Creating new feature of containing currency symbols
def currency_present(dataset):
    """Return 1 if the given text contains a currency symbol, otherwise 0.

    The symbols checked are the euro, dollar, yen, pound and rupee signs.
    The ``dataset`` argument is the text to inspect; this file applies the
    function to each entry of the ``message`` column.
    """
    return int(any(symbol in dataset for symbol in ('€', '$', '¥', '£', '₹')))
# Flag every message that mentions a currency symbol.
dataset['contains_currency_symbols'] = dataset['message'].map(currency_present)
dataset

# Countplot for contains_currency_symbols, split by class.
plt.figure(figsize=(8, 8))
g = sns.countplot(data=dataset, x="contains_currency_symbols", hue="label")
g.set_title('Countplot for contains_currency_symbols')
g.set_xlabel('Does SMS contains_currency_symbols')
g.set_ylabel('Count')
# loc=9 is matplotlib's numeric code for the "upper center" position.
p = g.legend(['Ham', 'Spam'], loc=9)

# Creating new feature of containing numbers
def number(data):
    """Return 1 if ``data`` contains at least one ASCII digit, otherwise 0.

    Only the code points 48-57 ('0' to '9') count as digits; other Unicode
    digit characters do not.
    """
    return int(any(48 <= ord(char) <= 57 for char in data))
# Flag every message that contains at least one digit.
dataset['contains_numbers'] = dataset['message'].map(number)
dataset

# Countplot for contains_numbers, split by class.
plt.figure(figsize=(8, 8))
g = sns.countplot(data=dataset, x="contains_numbers", hue="label")
g.set_title('Countplot for containing numbers')
g.set_xlabel('Does SMS contains_numbers')
g.set_ylabel('Count')
# loc=9 is matplotlib's numeric code for the "upper center" position.
p = g.legend(['Ham', 'Spam'], loc=9)

# Data Cleaning
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# Matches every character that is not an ASCII letter. Both cases must be
# listed because the substitution runs before lower-casing: the earlier
# '[^a-zA]' class also blanked out the capitals B-Z, so "FREE" disappeared
# and "Free" became "ree".
_NON_LETTERS = re.compile('[^a-zA-Z]')

# Built once: evaluating stopwords.words('english') for every single token
# is needlessly slow, and a set gives constant-time membership tests.
_STOP_WORDS = frozenset(stopwords.words('english'))


def _clean_sms(sms):
    """Normalise one raw SMS into the text form fed to the TF-IDF vectorizer.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, English stop words are dropped and the remaining
    words are lemmatized with ``wnl``. Training and inference both go through
    this function so their preprocessing cannot drift apart.

    Args:
        sms: The raw message text.

    Returns:
        The cleaned words joined by single spaces.
    """
    message = _NON_LETTERS.sub(' ', sms).lower()
    kept_words = [word for word in message.split() if word not in _STOP_WORDS]
    return ' '.join(wnl.lemmatize(word) for word in kept_words)


corpus = [_clean_sms(sms) for sms in dataset['message']]

#corpus
# Creating the TF-IDF feature matrix (vocabulary capped at 500 terms)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=500)
vectors = tfidf.fit_transform(corpus).toarray()
feature_names = tfidf.get_feature_names_out()

X  = pd.DataFrame(vectors, columns=feature_names)
# X has a fresh 0..n-1 index; give y the same one so the two stay aligned by
# label as well as by position.
y = dataset['label'].reset_index(drop=True)

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test


def _plot_confusion_matrix(matrix, title):
    """Draw a confusion matrix as an annotated heatmap with Ham/Spam ticks.

    Args:
        matrix: The array returned by ``confusion_matrix``.
        title: The title shown above the heatmap.

    Returns:
        The Axes object returned by ``sns.heatmap``.
    """
    plt.figure(figsize=(8, 8))
    axis_labels = ['Ham', 'Spam']
    axes = sns.heatmap(data=matrix, xticklabels=axis_labels, yticklabels=axis_labels,
                       annot=True, fmt='g', cbar_kws={"shrink": 0.5}, cmap="Blues")
    plt.title(title)
    # confusion_matrix puts the true classes on the rows (heatmap y-axis) and
    # the predicted classes on the columns (heatmap x-axis).
    plt.xlabel('Predicted values')
    plt.ylabel('Actual Values')
    return axes


def _evaluate_classifier(model, title):
    """Cross-validate, fit and report one classifier.

    Prints the mean and standard deviation of the 10-fold F1 scores computed
    on the full ``X``/``y``, then fits ``model`` on the training split and
    prints the classification report and confusion matrix for the test
    split, and finally plots that matrix.

    Args:
        model: An unfitted scikit-learn classifier; it is fitted in place.
        title: The title for the confusion-matrix heatmap.

    Returns:
        A tuple ``(cv_scores, test_predictions, confusion)``.
    """
    scores = cross_val_score(model, X, y, scoring='f1', cv=10)
    print(round(scores.mean(), 3))
    print(round(scores.std(), 3))

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    # The report and the matrix are built from the same predictions, so a
    # model can no longer be reported with another model's output.
    print(classification_report(y_test, predictions))
    confusion = confusion_matrix(y_test, predictions)
    print(confusion)

    _plot_confusion_matrix(confusion, title)
    return scores, predictions, confusion


# Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
cv, y_pred, cm = _evaluate_classifier(mnb, 'Confusion Matrix of Multinomial Naive Bayes Model')


# Now using decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
cv1, y_pred1, cm = _evaluate_classifier(dt, 'Confusion Matrix of Decision Tree Model')


def predict_spam(sms):
    """Classify one raw SMS with the fitted decision tree.

    The message is cleaned exactly like the training corpus and vectorized
    with the already fitted ``tfidf``.

    Args:
        sms: The raw message text.

    Returns:
        The one-element array returned by ``dt.predict``: 1 for spam, 0 for
        ham.
    """
    message = _clean_sms(sms)
    # The tree was fitted on a DataFrame with named columns; passing the same
    # column names avoids scikit-learn's feature-name mismatch warning.
    features = pd.DataFrame(tfidf.transform([message]).toarray(), columns=feature_names)
    return dt.predict(features)

# Prediction 1 - lottery-style text message.
sample_message = "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
# predict_spam returns a one-element array, which is truthy exactly when the
# predicted label is 1 (spam).
is_spam = predict_spam(sample_message)
print("This is a Spam message." if is_spam else "This is a Ham(normal) message")