In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the labelled emails; later cells read the 'text' column and use the
# last column as the target.
dataset = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")

# Cleaning Data

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set once: neither depends on the
# message being processed. `ps` is kept at notebook scope because the
# single-email preprocessing cell further down reuses it.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Take these negations out of the stop-word list so they are not filtered.
# NOTE(review): apostrophes are replaced by spaces before tokenising, so the
# contraction entries can never equal a token; only 'not' appears to have an
# effect. Kept as-is to preserve the original stop-word set.
for negation in ('not', "isn't", "didn't", "wouldn't"):
  all_stopwords.remove(negation)
my_set = set(all_stopwords)

corpus = []
# Iterate over the column values instead of looking up dataset['text'][i],
# which is a label lookup and only works with a default 0..n-1 index.
for raw_text in dataset['text']:
  # Replace every non-letter with a space, lower-case, split on whitespace.
  tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
  # Join exactly once per message. The previous version joined inside the
  # token loop, so a message with no tokens was appended as an empty list
  # instead of a string, and the join was repeated for every token.
  review = " ".join(ps.stem(tok) for tok in tokens if tok not in my_set)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with the vocabulary capped at 25,000 terms.
# NOTE(review): the vocabulary is learned from the whole corpus, i.e. before
# the train/test split performed in the next cell.
cv = CountVectorizer(max_features=25000)
X = cv.fit_transform(corpus).toarray()

# Target: the last column of the dataset.
y = dataset.iloc[:, -1].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the dense count matrix.
nb = GaussianNB()
nb.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the Naive Bayes model on the held-out rows.
y_pred = nb.predict(X_test)
# scikit-learn convention: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the last expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[718  14]
 [ 35 268]]


0.9526570048309179

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees; the fixed seed makes the fit repeatable.
rm = RandomForestClassifier(n_estimators=50, random_state=0)
rm.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the random forest on the held-out rows.
y_pred = rm.predict(X_test)
# scikit-learn convention: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the last expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[719  13]
 [ 16 287]]


0.9719806763285024

# SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
svm = SVC(kernel="linear", random_state=0)
svm.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the linear SVM on the held-out rows.
y_pred = svm.predict(X_test)
# scikit-learn convention: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the last expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[711  21]
 [ 16 287]]


0.9642512077294686

# Kernel SVM

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel.
ksvm = SVC(kernel="rbf", random_state=0)
ksvm.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the RBF-kernel SVM on the held-out rows.
y_pred = ksvm.predict(X_test)
# scikit-learn convention: rows are true classes, columns are predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the last expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[705  27]
 [  9 294]]


0.9652173913043478

# Testing model with new data

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as spam.
email1 = """Dear respected Sir/Ma'am,

May this letter find you in peace and the best of health.

I am Yakiv Vadaturskyi, 21 years old male from Bendzary, Ukraine. I am the adopted son of Oleksiy Vadaturskyi, a Ukraine Agricultural and Grain logistics businessman, the founder of Nibulon, the largest grain logistics company in Ukraine who lost his life along with my mother (His wife) Raisa when a missile hit our home overnight on the 31st of July 2022. I was saved by the Lord from this attack which took place while I was in my hostel at school.

This is the reason I am contacting you and you must assure me to keep this very confidential as I am still hiding from my late father's detractors, enemies and step brother who knew that our late father left a huge assets in my care.

Before the death of our father at Mechnikov Hospital following the bombing, our father silently revealed to me that he had the sum of Thirty - Nine Million United States Dollars deposited in a bank in Mexico which he planned to use for setting up factories for agricultural produce in Mexico. And knowing that my brother (His biological son, Andrii) might not let me, being our father's adopted son, have a piece of his estates, funds and other properties, our father advised me to travel to Mexico, meet with the attache Attorney to the German Embassy in Mexico who happened to be his friend from the University and collect from him all the related documents.

After arriving in Mexico few days ago, as it has always been on my mind to live in your country, I did some search online through search engine and came across your contact and decided to contact you and seek your honest assistance in moving this funds from the bank here in Mexico to your account or any bank account you may provide for safe transfer of the funds.

After the safe transfer of the funds to your bank account, I will come over to your country, meet with you to discuss how I can invest the funds until I am done with my studies/education which of course I will continue in your country. With your sincere guidance, I am sure this funds can be invested wisely for quick growth.

Because this funds was legally acquired by my late father, meant for good purpose and my future, I am not gonna promise to compensate you with any specific amount, but as the investment grows, and following any mutual agreement went into between you and I by the time the funds gets to your account and I come over to your country, you shall benefit alongside me.

Before closing this letter, may I inform you that you are one of three people I contacted for this purpose, so if you answer this letter before any of the other two does, I will inform him or her that I have already found someone.

Thanks and may God bless you!
Yakiv Vadaturskyi"""

# spam email

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as ham.
email2 = """Hi Kunal,

Internshala would like to invite you to join us for #KaamKiBaat with Vartika Dixit from Microsoft.

In this session, she will be sharing insights about human resources, what skills and training you need to become a professional in this domain, career development opportunities in human resources and more.

This session is completely FREE and will commence at 6 PM on 21st July 2023. All attendees will receive a participation certificate. Click on the button below to register."""

# ham email

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as spam.
email3 = """Hi Kunal,

Data Science Engineer at Microsoft is here to help you become a Data Scientist yourself!

Average Salary: ₹25 LPA
High in demand: 2,00,000+ current openings
Join his live classes now: Click Here

Pro Tip: People who fill out our I'm Interested form get an additional 10% off on the course

Enrol now for his crash course to placement classes and get ahead in the game!

Regards,
Team Unstop"""

# spam email

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as spam.
email4 = """Hello DEATHKUNAL1,
✉️ E-mail : deathkunal1@gmail.com

DEATHKUNAL1 You have (1) package pending in our warehouse.

Unfortunately, we could not deliver your postal parcel on time because your address is not correct.

Please reply to us with the correct delivery address. ✍️ here ✍️

Best regards,

UPS Rewards"""

# spam email

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as spam.
email5 = """

Faacebook



Someone logged into your facebook account on Sat, 24 Jun 2023 14:01:33 +0000 using Google Pixel 4a. we just wanted to make sure it was you! If you don't think this was you. please report this so we can keep your account safe."""

# spam email

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as ham.
email6 = """Hello Random Guy

As you embark upon this journey of self-improvement and learning, join over 10,000+ Indian developers in taking FREE Mock Coding Interview powered by Scaler for Software Professionals like yourself.

Stay one step ahead of your peers with our Holistic Performance Report, Structured Learning Path along with Learning Resources & Practice Problems!

Why should you take the assessment?

Question-wise Performance Report
Identify exact Topics to Improve
Get a Structured Learning Path
Free learning resources for DSA & System Design
Real-life Interview questions (from FAANG companies and BigTech Startups!)
A complementary career counselling session after the test.
Begin Mock Coding Interview

What are you waiting for? Take this FREE Mock Coding Interview and beat your competition!

Kickstart your learning journey NOW!"""

# ham email

In [None]:
# Sample message kept as a raw string for manually testing the trained models
# on new data. The note following this cell labels it as ham; this is the
# sample the preprocessing cell below reads.
email7 = """Hi Random_Guy_random,

This is a reminder to participate in the $70,000 Kaggle AI Report Essay Competition. Public notebooks are due on July 5, 2023.

This competition is very different from our usual ML competitions. As artificial intelligence rapidly develops, there's a growing demand for summaries of the current state of the art. With over a decade of experience verifying ML research claims & openly sharing their results, we believe the Kaggle community is the perfect group to take up this challenge.

In this essay competition, participants must write an in-depth, meta-analysis on one of 7 ML topics by July 5th. The best essays will win cash prizes and be published together in Kaggle’s official AI report.

Total Prizes:
$70,000


Entry Deadline:
July 5, 2023

Learn More
Good luck,

Paul Mooney
Kaggle Data Scientist"""

# ham email

In [None]:
# Rebuild the stop-word set the same way the corpus-cleaning cell does, so a
# new email is filtered with the same words as the training text.
all_stopwords = stopwords.words('english')
for negation in ('not', "isn't", "didn't", "wouldn't"):
  all_stopwords.remove(negation)
my_set = set(all_stopwords)


def clean_email(text):
  """Return `text` normalised with the same steps as the training corpus.

  Every non-letter is replaced by a space, the text is lower-cased and split
  on whitespace, tokens present in `my_set` are dropped, and the remaining
  tokens are stemmed with the notebook-level stemmer `ps` and joined with
  single spaces.

  Args:
    text: raw email body as a string.

  Returns:
    The cleaned email as one space-separated string (empty if nothing
    survives the filtering).
  """
  tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
  return " ".join(ps.stem(tok) for tok in tokens if tok not in my_set)


# Replace email7 with any other sample (email1 ... email6) to test it; only
# this line needs to change.
review = clean_email(email7)
# The vectorizer expects an iterable of documents, hence the one-item list.
s = [review]

In [None]:
# Encode the cleaned email with the already-fitted vectorizer, then densify
# it to the array form the models are given below.
email_counts = cv.transform(s)
test = email_counts.toarray()

In [None]:
# Predict with every fitted model, in the order:
# random forest, naive Bayes, linear SVM, RBF SVM.
res1, res2, res3, res4 = (model.predict(test) for model in (rm, nb, svm, ksvm))
print(res1, res2, res3, res4)

# Only the random forest result (res1) decides the printed label;
# a prediction of 1 is reported as spam, anything else as ham.
print('spam' if res1[0] == 1 else 'ham')

[0] [0] [1] [1]
ham
