In [13]:

#Import libraries and load the data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Path of the labelled e-mail CSV that the rest of the notebook works on.
file_path = '/content/spam_dataset.csv'

# Parse the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Bare trailing expression: the notebook renders the frame as a table.
df

Unnamed: 0,message_content,is_spam
0,"Hello Lonnie,\n\nJust wanted to touch base reg...",0
1,"Congratulations, you've won a prize! Call us n...",1
2,You have been pre-approved for a credit card w...,1
3,"Limited time offer, act now! Only a few spots ...",1
4,Your loan has been approved! Transfer funds to...,1
...,...,...
995,"Hello Virginia,\n\nIt was great to catch up wi...",0
996,Final notice: Claim your inheritance from a di...,1
997,Hot singles in your area want to chat! Contact...,1
998,Your loan has been approved! Transfer funds to...,1


In [14]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,message_content,is_spam
0,"Hello Lonnie,\n\nJust wanted to touch base reg...",0
1,"Congratulations, you've won a prize! Call us n...",1
2,You have been pre-approved for a credit card w...,1
3,"Limited time offer, act now! Only a few spots ...",1
4,Your loan has been approved! Transfer funds to...,1



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [15]:
# Data cleaning and preprocessing
#
# Missing message text is replaced with an empty string so the vectorizer
# receives a str for every row. The fill is restricted to the text column:
# blanket-filling the whole frame would also write '' into `is_spam`, and the
# later `.astype('int')` on that column cannot convert an empty string.
# Rows that have no label at all cannot be used for supervised training, so
# they are dropped instead.
data = (
    df
    .fillna({'message_content': ''})
    .dropna(subset=['is_spam'])
)

# Show info and shape of data (`info()` prints; `shape` is the cell's output).
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   message_content  1000 non-null   object
 1   is_spam          1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


(1000, 2)

In [17]:
# Define features and labels
#
# `is_spam` is already stored numerically (0 = ham, 1 = spam), so it needs no
# re-encoding step; it is only cast to int to guarantee an integer dtype.
# `data` is deliberately not modified here, which keeps this cell idempotent.
X = data['message_content']
Y = data['is_spam'].astype('int')

# The prediction cell treats 0 as ham and everything else as spam, so any
# other label value would be misreported silently. Fail early instead.
unexpected_labels = set(Y.unique()) - {0, 1}
assert not unexpected_labels, f"is_spam holds labels other than 0/1: {unexpected_labels}"

print(X.head())
print(Y.head())

0    Hello Lonnie,\n\nJust wanted to touch base reg...
1    Congratulations, you've won a prize! Call us n...
2    You have been pre-approved for a credit card w...
3    Limited time offer, act now! Only a few spots ...
4    Your loan has been approved! Transfer funds to...
Name: message_content, dtype: object
0    0
1    1
2    1
3    1
4    1
Name: is_spam, dtype: int64


In [19]:
# Split data into train and test sets: 20% of the rows are held out for
# testing, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=3
)

# Only the final expression of a cell is rendered, so a bare tuple of the
# feature-side shapes placed here would be evaluated and thrown away.
# Check the feature/label alignment explicitly instead.
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"
assert X_train.shape == Y_train.shape, "train features/labels are misaligned"
assert X_test.shape == Y_test.shape, "test features/labels are misaligned"

# Displayed as the cell output: (all rows, train rows, test rows).
Y.shape, Y_train.shape, Y_test.shape

((1000,), (800,), (200,))

In [21]:
# Feature extraction using TF-IDF
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Print the train matrix first, then the test matrix.
for feature_matrix in (X_train_features, X_test_features):
    print(feature_matrix)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 22932 stored elements and shape (800, 1505)>
  Coords	Values
  (0, 297)	0.3341874031041129
  (0, 1428)	0.2565367463396501
  (0, 1483)	0.3314937874710904
  (0, 1064)	0.3369789896104547
  (0, 266)	0.21574701427971876
  (0, 396)	0.20535077276842836
  (0, 890)	0.2023401842053263
  (0, 1409)	0.2744166120319806
  (0, 239)	0.26607786005644424
  (0, 1476)	0.24628685409828857
  (0, 155)	0.26276395685550763
  (0, 272)	0.1948454387977373
  (0, 989)	0.2744166120319806
  (0, 369)	0.11096335647327646
  (0, 1436)	0.1291085989859388
  (0, 1457)	0.13083361919513375
  (0, 304)	0.13083361919513375
  (0, 382)	0.13083361919513375
  (1, 266)	0.09175251231987124
  (1, 601)	0.11129122240224677
  (1, 69)	0.2282731560485053
  (1, 296)	0.13675905393326826
  (1, 119)	0.1531065778359893
  (1, 864)	0.10293642324941185
  (1, 1458)	0.12452739979859606
  :	:
  (799, 58)	0.16836929331446007
  (799, 488)	0.12094963509791047
  (799, 1264)	0.11926718263294021
 

In [22]:
# Model training with Logistic Regression (constructor defaults).
model = LogisticRegression()

# Kept as the cell's final expression so the fitted estimator is still echoed.
model.fit(X=X_train_features, y=Y_train)


In [23]:
# Evaluate model on train and test data

# Predict labels for both splits.
prediction_on_training_data = model.predict(X_train_features)
prediction_on_test_data = model.predict(X_test_features)

# Score each set of predictions against the true labels of its split.
accuracy_on_training_data = accuracy_score(
    y_true=Y_train, y_pred=prediction_on_training_data
)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test, y_pred=prediction_on_test_data
)

print("Accuracy on training data:", accuracy_on_training_data)
print("Accuracy on test data:", accuracy_on_test_data)


Accuracy on training data: 1.0
Accuracy on test data: 1.0


In [28]:
# Test model on your own email text: edit the string below to try another message.
input_email = ["Hello Lonnie, Just wanted to touch base regarding our project’s next steps. Please find the details below. Information material statement power suddenly summer usually. Serious responsibility before new success each. Kind regards, Terry Griffin Please find attached the updated project plan. Let me know if you have any questions. If you have any questions, please feel free to reach out."]

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no
# refit), then classify with the trained model.
input_data_features = feature_extraction.transform(input_email)
prediction = model.predict(input_data_features)

# A predicted label of 0 is reported as ham; anything else as spam.
print("Ham mail" if prediction[0] == 0 else "Spam mail")


Ham mail
