In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [3]:
stemmer = PorterStemmer()

In [4]:
stop_words = set(stopwords.words("english"))

In [5]:
df = pd.read_csv("01. Project Spam Detection - DATASET.csv",encoding='latin-1')[['v1','v2']]

In [6]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.tail()

Unnamed: 0,v1,v2
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [8]:
df.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [9]:
df.isna().sum()

v1    0
v2    0
dtype: int64

In [10]:
df.columns = ['lable',"message"]

In [11]:
df.columns

Index(['lable', 'message'], dtype='object')

In [12]:
df.head()

Unnamed: 0,lable,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
df.tail()

Unnamed: 0,lable,message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [14]:
# 0 = ham
# 1 = spam
df['lable'] = df['lable'].map({"ham":0,"spam":1})

In [15]:
df.head()

Unnamed: 0,lable,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [16]:
def preprocess_text(text):
    text = re.sub(r"\W", " ", text) # remove special symbol
    text = text.lower() # all text to lower case
    words = text.split() # collect words in list
    words = [stemmer.stem(word) for word in words if word not in stop_words] # remove stopewords and stem words
    return " ".join(words)

In [17]:
df["cleaned_message"] = df["message"].apply(preprocess_text)

In [18]:
df.head()

Unnamed: 0,lable,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
from sklearn.model_selection import  train_test_split

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
from sklearn.metrics import accuracy_score, classification_report

In [23]:
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df["cleaned_message"]) # input data
y = df["lable"] # output data

In [24]:
X_train,X_test,y_train,y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

In [25]:
model = LogisticRegression()

In [26]:
model.fit(X_train,y_train) # trained Model we can predict test data

In [27]:
y_pred = model.predict(X_test)

In [28]:
print(f"Accuracy : {accuracy_score(y_test, y_pred) * 100:.2f}%")

Accuracy : 95.43%


In [29]:
print(f"Classification report :\n\n{classification_report(y_test,y_pred)}")

Classification report :

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.69      0.80       150

    accuracy                           0.95      1115
   macro avg       0.96      0.84      0.89      1115
weighted avg       0.95      0.95      0.95      1115



In [30]:
def predict_email(email_text):
    processed_data = preprocess_text(email_text)
    vectorized_text = vectorizer.transform([processed_data])
    prediction = model.predict(vectorized_text)
    return "Spam" if prediction[0]==1 else "Ham - Not Spam"
    

In [31]:
email = "Will ﾌ_ b going to esplanade fr home?"


In [32]:
print(f"Email : {email}\n\nPrediction : {predict_email(email)}")

Email : Will ﾌ_ b going to esplanade fr home?

Prediction : Ham - Not Spam
