Importing Dependencies

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


Data Collection & Pre-Processing

In [39]:
# Load the mail dataset from disk; later cells derive their frames from this raw copy
raw_df = pd.read_csv(filepath_or_buffer='Dataset/mail_data.csv')

In [40]:
# Preview the first five rows of the raw data
raw_df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [41]:
# Keep every non-missing cell as-is and put an empty string where a value is missing.
# `where` returns a new frame, so raw_df is not modified.
df = raw_df.where(raw_df.notna(), other='')

In [42]:
# Preview the first five rows after the missing-value replacement
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [43]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

(5572, 2)

Label Encoding

In [44]:
# Encode the string labels as integers: ham → 1, spam → 0.
# The lookup passes through any value that is not a key (including an
# already-encoded 1/0), so re-running this cell leaves the column intact.
# A plain .map(dict) would turn every value into NaN on a second run.
label_codes = {'ham': 1, 'spam': 0}
df['Category'] = df['Category'].map(lambda label: label_codes.get(label, label))

# Fail early, with a clear message, if anything other than ham/spam was present
assert df['Category'].isin([0, 1]).all(), "unexpected label in 'Category' (expected only ham/spam)"

# Check the result (bare last expression → rich notebook display)
df.head()


   Category                                            Message
0         1  Go until jurong point, crazy.. Available only ...
1         1                      Ok lar... Joking wif u oni...
2         0  Free entry in 2 a wkly comp to win FA Cup fina...
3         1  U dun say so early hor... U c already then say...
4         1  Nah I don't think he goes to usf, he lives aro...


Label encoding used from here on: ham → 1, spam → 0 (so class 1 is ham, class 0 is spam).

In [45]:
# Separate the data into message text (features) and encoded category (labels)
X, y = df['Message'], df['Category']

Train Test Split

In [46]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=3, test_size=0.2)

Feature Extraction

In [47]:
# Turn the message text into TF-IDF feature vectors.
# English stop words are dropped, text is lower-cased, and min_df=1 keeps
# every term that occurs in at least one training message.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Learn vocabulary and IDF weights from the training split only, then apply
# the same fitted transform to the test split.
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

# Cast both label Series to integers
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [48]:
# Inspect the integer-encoded training labels (long Series are abbreviated when printed)
print(Y_train)

3075    1
1787    1
1614    1
4304    1
3266    0
       ..
789     0
968     1
1667    1
3321    1
1688    0
Name: Category, Length: 4457, dtype: int64


In [49]:
# Print the sparse TF-IDF training matrix: a summary header followed by
# (row, column) coordinates and their stored weights
print(X_train_tf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 34775 stored elements and shape (4457, 7431)>
  Coords	Values
  (0, 2329)	0.38783870336935383
  (0, 3811)	0.34780165336891333
  (0, 2224)	0.413103377943378
  (0, 4456)	0.4168658090846482
  (0, 5413)	0.6198254967574347
  (1, 3811)	0.17419952275504033
  (1, 3046)	0.2503712792613518
  (1, 1991)	0.33036995955537024
  (1, 2956)	0.33036995955537024
  (1, 2758)	0.3226407885943799
  (1, 1839)	0.2784903590561455
  (1, 918)	0.22871581159877646
  (1, 2746)	0.3398297002864083
  (1, 2957)	0.3398297002864083
  (1, 3325)	0.31610586766078863
  (1, 3185)	0.29694482957694585
  (1, 4080)	0.18880584110891163
  (2, 6601)	0.6056811524587518
  (2, 2404)	0.45287711070606745
  (2, 3156)	0.4107239318312698
  (2, 407)	0.509272536051008
  (3, 7414)	0.8100020912469564
  (3, 2870)	0.5864269879324768
  (4, 2870)	0.41872147309323743
  (4, 487)	0.2899118421746198
  :	:
  (4454, 2855)	0.47210665083641806
  (4454, 2246)	0.47210665083641806
  (4455, 4456)	0.24

Training the model

Logistic Regression

In [50]:
# Logistic regression classifier; no arguments are passed, so the library defaults apply
model = LogisticRegression()

In [51]:
# Fit the classifier on the TF-IDF training features and their labels;
# fit() hands back the fitted estimator, which is rebound to `model`
model = model.fit(X=X_train_tf, y=Y_train)

Evaluating the model

In [54]:
# Predict on the same rows the model was fitted on (the training split).
# Named *_train_* because these are training-set predictions, not test-set ones.
Y_train_pred = model.predict(X_train_tf)

# Training accuracy: measured on data the model has already seen, so it is an
# optimistic figure; the held-out test split gives the honest estimate.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
print("Accuracy on training data:", train_accuracy)


Accuracy on training data: 0.9676912721561588


In [55]:
# Score the fitted model on the held-out test split
Y_pred = model.predict(X_test_tf)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9668161434977578


In [None]:
def predict_message(message):
    """Classify a single message as 'ham' or 'spam'.

    The text is vectorised with the notebook-level ``vectorizer`` and scored
    by the notebook-level ``model``; both must already be fitted.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        "ham" when the model predicts 1, otherwise "spam".
    """
    # Wrap in a list: the vectorizer expects an iterable of documents
    features = vectorizer.transform([message])

    # predict() returns one label per row; only one row was passed in
    predicted_label = model.predict(features)[0]

    if predicted_label == 1:
        return "ham"
    return "spam"

# Example usage: classify two sample messages.
# NOTE(review): the saved output shows "ham" for both, although the second text
# reads like a prize scam. Worth investigating (class balance / decision
# threshold?). TODO confirm.
new_msg = "Ok lar... Joking wif u oni..."
new_msg2 = "WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."

for sample in (new_msg, new_msg2):
    print("Prediction:", predict_message(sample))

Prediction: ham
Prediction: ham
