In [3]:
# importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load the dataset
df = pd.read_csv('spam_ham_dataset.csv')
df

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,ham,Subject: industrial worksheets for august 2000...,0


In [7]:
# create a function for cleaning the text

import re # Regularization function
import string
from nltk.stem import PorterStemmer # nltk means Natural Language tool kit
# there are multiple stemmers like LancasterStemmer, SnowballStemmer
# like stemmer there is one more technique called Lemmatization (it convert word into lemma words same like stem)
st = PorterStemmer()

def clean_text(text):
    text = text.lower() 
    text = re.sub(r'\d+','',text) # to remove digits from the text (r means raw string)
    text = text.translate(str.maketrans('','',string.punctuation))
    # to remove punctuations (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~) 
    # str.maketrans(x--> the thing want to replace,y--> the thing that replace, z--> the thing that need to be deleted)
    # # .translate applies the translation table created by maketrans() to string
    text = text.strip() # it remove extra white space in the text
    text = text.replace('subject','').strip()
    # used to convert each word into its stem word (the word orgined from that word | eg:caring --> care ,carefull --> care)
    text =' '.join(st.stem(word) for word in text.split())
    # ' '.join is used to convert list of words into string and separated by space
    return text

In [8]:
df['clean_text']=df['text'].apply(clean_text) # apply is used to apply function to each row
df

Unnamed: 0,label,text,label_num,clean_text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0,enron methanol meter thi is a follow up to the...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0,hpl nom for januari see attach file hplnol xl ...
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0,neon retreat ho ho ho we re around to that mos...
3,spam,"Subject: photoshop , windows , office . cheap ...",1,photoshop window offic cheap main trend abas d...
4,ham,Subject: re : indian springs\r\nthis deal is t...,0,re indian spring thi deal is to book the teco ...
...,...,...,...,...
5166,ham,Subject: put the 10 on the ft\r\nthe transport...,0,put the on the ft the transport volum decreas ...
5167,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0,and follow nom hpl can t take the extra mmcf d...
5168,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0,calpin daili ga nomin juli as i mention earlie...
5169,ham,Subject: industrial worksheets for august 2000...,0,industri worksheet for august activ attach are...


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer # Term frequency -inverse document frequency 

In [10]:
vec = TfidfVectorizer(stop_words='english') # stop words means an any are been our etc in english
tfidf = vec.fit_transform(df['clean_text'])

In [11]:
# pd.DataFrame(tfidf.toarray(),columns=vec.get_feature_names_out()) 
# this was used to show the dataframe of converted values (matrix)

In [13]:
# split the data
x = tfidf
y = df['label_num']

In [14]:
x


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 292336 stored elements and shape (5171, 37803)>

In [15]:
y

0       0
1       0
2       0
3       1
4       0
       ..
5166    0
5167    0
5168    0
5169    0
5170    1
Name: label_num, Length: 5171, dtype: int64

In [17]:
y.value_counts()

label_num
0    3672
1    1499
Name: count, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
# to make data balanced

from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
x_bal,y_bal = sm.fit_resample(x_train,y_train)



In [20]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(x_bal,y_bal)

In [21]:
y_pred_train = nb.predict(x_bal)
y_pred_test = nb.predict(x_test)

In [22]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98       729
           1       0.93      0.98      0.96       306

    accuracy                           0.97      1035
   macro avg       0.96      0.98      0.97      1035
weighted avg       0.97      0.97      0.97      1035



In [23]:
def predict_email_nav(texts):
    cleaned = clean_text(texts)
    vectorized = vec.transform([cleaned])
    pred = nb.predict(vectorized)[0]

    if pred == 1:
        print('This seems like a Spam Email')
    elif pred == 0:
        print('This seems like a Genuine Email')

In [29]:
predict_email_nav('''Dear Candidate,

****This is applicable only for the exam registered candidates****

Type of exam will be available in the list: Click Here

You will have to appear at the allotted exam center and produce your Hall ticket and Government Photo Identification Card (Example: Driving License, Passport, PAN card, Voter ID, Aadhaar-ID with your Name, date of birth, photograph and signature) for verification and take the exam in person. You can find the final allotted exam center details in the hall ticket.

The hall ticket is yet to be released. We will notify the same through email and SMS.

Type of exam: Computer based exam (Please check in the above list corresponding to your course name)

The questions will be on the computer and the answers will have to be entered on the computer; type of questions may include multiple choice questions, fill in the blanks, essay-type answers, etc.

Type of exam: Paper and pen Exam (Please check in the above list corresponding to your course name)

The questions will be on the computer. You will have to write your answers on sheets of paper and submit the answer sheets. Papers will be sent to the faculty for evaluation.

On-Screen Calculator Demo Link:

Kindly use the below link to get an idea of how the On-screen calculator will work during the exam.

https://tcsion.com/OnlineAssessment/ScientificCalculator/Calculator.html

NOTE: Physical calculators are not allowed inside the exam hall.

Thank you!

-NPTEL Team

--
You received this message because you are subscribed to the Google Groups "Announcement list for Python for Data Science" group.
To unsubscribe from this group and stop receiving emails from it, send an email to noc25-cs104-announce+unsubscribe@nptel.iitm.ac.in.
To view this discussion visit https://groups.google.com/a/nptel.iitm.ac.in/d/msgid/noc25-cs104-announce/fL0VWIIUQleTh7OF06aNyA%40geopod-ismtpd-canary-0.
For more options, visit https://groups.google.com/a/nptel.iitm.ac.in/d/optout.''')

This seems like a Genuine Email


In [30]:
predict_email_nav('''Hi Muhammed thouyib!

The market is shifting, and emerging sectors are opening up faster than ever.

Upskilling gives you early access to these fields and often a head start on the most impactful roles. 👔

Explore700+ Short Courses at 80% Offand lead with confidence in fast-changing spaces. 🌟
SAVE 80%
Free Courses	Refer & Earn	Mail Us

 

Best Wishes from UniAthena
 



 	
Athena Global Education is an Ed-Tech company operating from Oxford Science Park, Oxford, UK and a subsidiary of Westford Education Group - A Forbes Award-winning Institution. Additionally, we operate through our Support Centres located in the USA, UAE, and India; in order to provide academic assistance to learners around the globe.
 
Athena Global Education © 2022

Copyright © 2025 Athena Global Education FZE, All rights reserved.
 
Disclaimer: This e-mail message is confidential and for use by the addressee only. If the message is received by anyone other than the addressee, please return the message to the sender and then delete the message from your computer. Thank you for your cooperation. Any views expressed in this message are those of the individual sender, except where the sender specifies and with authority, states them to be the views of Athena Global Education (AGE). AGE does not accept responsibility for changes made to this message after it was sent. Whilst all reasonable care has been taken to avoid the transmission of viruses, it is the responsibility of the recipient to ensure that the onward transmission, opening or use of this message and any attachments will not adversely affect its systems or data. No responsibility is accepted by Athena Global Education in this regard and the recipient should carry out such virus and other checks, as it considers appropriate.
If you want to unsubscribe, Click Here...
''')

This seems like a Spam Email
