In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts raw text documents into a matrix of TF-IDF features
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
# Load the mail dataset from a CSV file given by a relative path.
data = pd.read_csv('mail_data.csv')

In [10]:
# Bare expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
# Print the frame's summary: index range, columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [12]:
# Emits a single blank line and nothing else.
# NOTE(review): looks like a leftover scratch cell — confirm it can be removed.
print()




In [13]:
# Encode the string labels as integers, in place: spam -> 0, ham -> 1.
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label, 'Category'] = code

In [14]:
# Features are the message text; targets are the encoded category labels.
x, y = data['Message'], data['Category']

In [15]:
# Hold out 20% of the messages for testing; the fixed random_state keeps the
# shuffle, and therefore the split, the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=100,
)

In [16]:
# Turn the raw message text into TF-IDF features.
#   stop_words='english' - drop common English stop words (e.g. "the", "and", "is").
#   lowercase=True       - lowercase all text so that case differences do not matter.
#   min_df=1             - ignore terms found in fewer than 1 document, i.e. keep every term.
feature_extraction = TfidfVectorizer(stop_words='english', lowercase=True, min_df=1)

# Cast the encoded labels to an integer dtype before they reach the classifier.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

# fit_transform learns the vocabulary from the training messages and converts
# them into a TF-IDF matrix; transform then applies that same fitted vocabulary
# to the test messages.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

In [17]:
# Logistic-regression classifier constructed with no arguments (library defaults).
logRegModel = LogisticRegression()

In [18]:
# Train the classifier on the TF-IDF features of the training messages.
logRegModel.fit(x_train_features,y_train)

In [20]:
# Predict labels for the training split first, then for the held-out test split.
y_train_pred, y_test_pred = (
    logRegModel.predict(features)
    for features in (x_train_features, x_test_features)
)

In [21]:
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first. Accuracy is symmetric, so the printed values
# are unchanged, but the correct order matters for asymmetric metrics.
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)
print(train_acc)
print(test_acc)


0.9676912721561588
0.9641255605381166


In [29]:
# User input scenario: classify every message in `mails` with the trained model.
mails = [" From , 6days, 16+ TsandCs apply Reply HL 4 info"]
# Interactive entry (disabled):
# n = int(input("Enter the number of mails"))
# for i in range(n):
#     mails.append(input(f"Enter mail number {i+1}"))

# Use the already-fitted vectorizer (transform, not fit_transform) so the
# features line up with the vocabulary the model was trained on.
mails_features = feature_extraction.transform(mails)
res = logRegModel.predict(mails_features)
print(res)
# Pair each message with its own prediction. Indexing `mails` with the
# predicted label (0 or 1) would print the wrong message whenever more than
# one mail is supplied.
for mail, label in zip(mails, res):
    print(mail)
    if label == 1:
        print("Ham")
    else:
        print("Spam")
    

[1]
 From , 6days, 16+ TsandCs apply Reply HL 4 info
Ham
