In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [8]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [9]:
!kaggle datasets download -d kamyarathod/spamham

Downloading spamham.zip to /content
100% 208k/208k [00:00<00:00, 370kB/s]
100% 208k/208k [00:00<00:00, 370kB/s]


In [10]:
import zipfile

In [11]:
zip = zipfile.ZipFile("/content/spamham.zip")
zip.extractall("/content/")

In [12]:
data = pd.read_csv("/content/spamham.csv")

In [13]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [14]:
data = data.where((pd.notnull(data)), "")

In [15]:
x = data["Message"]
y = data["Category"]

In [16]:
print(x.head())
print(y.head())

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object
0     ham
1     ham
2    spam
3     ham
4     ham
Name: Category, dtype: object


In [17]:
y = pd.Series(y)
y[y == "spam"] = 0
y[y == "ham"] = 1

In [18]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [19]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=3)

In [20]:
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(4457,)
(1115,)
(4457,)
(1115,)


In [21]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words = "english", lowercase = True)
xtrain = feature_extraction.fit_transform(xtrain)
xtest = feature_extraction.transform(xtest)

ytrain.astype('int')
ytest.astype("int")

2632    0
454     1
983     0
1282    1
4610    1
       ..
4827    1
5291    1
3325    1
3561    1
1136    1
Name: Category, Length: 1115, dtype: int64

In [22]:
print(xtrain)

  (0, 5413)	0.6198254967574346
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.5092725360510079
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587516
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626947
  (4, 5497)	0.15743785051118359
  :	:
  (4454, 4602)	0.26697657324453916
  (4454, 3142)	0.3201445167776316
  (4455, 2247)	0.3705285186317046
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.3354567846463129
  (4455, 6810)	0.29731757715898277
  (44

In [23]:
model = LogisticRegression()

In [24]:
print(ytrain.shape)
print(xtrain.shape)
ytrain = np.array(ytrain)
ytrain = ytrain.astype('int')
ytrain.dtype

(4457,)
(4457, 7431)


dtype('int64')

In [25]:
model.fit(xtrain, ytrain)

## Prediction on Training Data

In [26]:
train_prediction = model.predict(xtrain)
train_prediction_accuracy = accuracy_score(ytrain, train_prediction)
print("Training Accuracy : ", train_prediction_accuracy)

Training Accuracy :  0.9670181736594121


## Prediction on Test data

In [27]:
ytest = np.array(ytest)
ytest = ytest.astype("int")
test_prediction = model.predict(xtest)
test_prediction_accuracy = accuracy_score(ytest, test_prediction)
print("Training Accuracy : ", test_prediction_accuracy)

Training Accuracy :  0.9659192825112107


In [40]:
data = ['''Subject: Exclusive Offer: Win a Luxury Vacation for FREE!

Dear Valued Customer,

Congratulations! You have been selected as one of our lucky winners to receive an all-expenses-paid luxury vacation package to an exotic destination of your choice! This once-in-a-lifetime opportunity is exclusively available to a limited number of recipients, and you are among the chosen few.

Imagine yourself sipping cocktails on pristine white sandy beaches, enjoying the crystal-clear turquoise waters, and indulging in world-class cuisine at a 5-star resort. This dream vacation could be yours without spending a single penny!

To claim your prize, simply click on the link below and fill out a short survey. Hurry, as this offer is time-sensitive, and seats are filling up fast. Don't miss out on your chance to experience the ultimate luxury getaway!

Click here to claim your FREE vacation: [maliciouslink.com]''']
data = feature_extraction.transform(data)
data.shape
pred = model.predict(data)

In [41]:
diction = {
    0:'Spam',
    1:'Not Spam'
}

In [42]:
print(pred)
for i in diction.keys():
  if i == pred[0]:
    print(diction[i])

[0]
Spam
