In [27]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

DATASET

In [28]:
df = pd.read_csv("/content/sample_data/spam_ham_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


PROCESSING DATASET

In [29]:
df = df[['text', 'label']]

df = df.rename(columns={'text': 'messages', 'label': 'label'})
df.head()


Unnamed: 0,messages,label
0,Subject: enron methanol ; meter # : 988291\r\n...,ham
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham
3,"Subject: photoshop , windows , office . cheap ...",spam
4,Subject: re : indian springs\r\nthis deal is t...,ham


In [30]:

df.shape

(5171, 2)

In [31]:
df.columns

Index(['messages', 'label'], dtype='object')

In [6]:
df.drop_duplicates(inplace=True)

In [32]:
df.shape

(5171, 2)

In [33]:
df.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of       messages  label
0        False  False
1        False  False
2        False  False
3        False  False
4        False  False
...        ...    ...
5166     False  False
5167     False  False
5168     False  False
5169     False  False
5170     False  False

[5171 rows x 2 columns]>

In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    # convert to lowercase
    text = text.lower()
    # remove special characters
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    # remove stopwords
    text = " ".join(word for word in text.split() if word not in STOPWORDS)
    return text

In [36]:
df['clean_text'] = df['messages'].apply(clean_text)
df.head()

Unnamed: 0,messages,label,clean_text
0,Subject: enron methanol ; meter # : 988291\r\n...,ham,subject enron methanol meter 988291 follow not...
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham,subject hpl nom january 9 2001 see attached fi...
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham,subject neon retreat ho ho ho around wonderful...
3,"Subject: photoshop , windows , office . cheap ...",spam,subject photoshop windows office cheap main tr...
4,Subject: re : indian springs\r\nthis deal is t...,ham,subject indian springs deal book teco pvr reve...


TRAINING MODEL


In [37]:

X = df['clean_text']
y = df['label']

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=3)


In [39]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)


(5171,)
(4136,)
(1035,)


In [40]:
feature_extraction = TfidfVectorizer(min_df = 1, stop_words='english')

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# convert Y_train and Y_test values as integers

#Y_train = Y_train.astype('int')
#Y_test = Y_test.astype('int')

In [41]:
print(X_train)

2209    subject hplc wellhead daren list deals need mo...
2000    subject mobil chemical hpl meter 1256 expense ...
5030    subject revised nom 5 5 eastrans revised nom 5...
1376    subject exxon company usa global 96035668 sita...
1564    subject pharmacy nx want cheap pain killers ht...
                              ...                        
789     subject incr ease yo ur man hood 4 5 inch es g...
968     subject subscribers receive first notice run a...
1667    subject neon march 28 neon lesson march 28 th ...
3321    subject first delivery pure resources l p vanc...
1688    subject enhance chest size email loading image...
Name: clean_text, Length: 4136, dtype: object


In [42]:
print(X_train_features)

  (0, 3871)	0.13387711316973605
  (0, 531)	0.14556222812251965
  (0, 30432)	0.08468916670398006
  (0, 43249)	0.14556222812251965
  (0, 3890)	0.14556222812251965
  (0, 548)	0.14556222812251965
  (0, 37242)	0.11275796314501375
  (0, 2908)	0.11535664415295803
  (0, 456)	0.14556222812251965
  (0, 26281)	0.09506000151609588
  (0, 36171)	0.11400727959297849
  (0, 2478)	0.13872687405852518
  (0, 521)	0.14556222812251965
  (0, 16799)	0.11843023142166303
  (0, 22028)	0.13387711316973605
  (0, 2706)	0.14556222812251965
  (0, 522)	0.14556222812251965
  (0, 32041)	0.07311834410351342
  (0, 19402)	0.04211028825505044
  (0, 2537)	0.13872687405852518
  (0, 517)	0.14556222812251965
  (0, 19420)	0.14556222812251965
  (0, 16628)	0.24438399643390496
  (0, 3875)	0.14556222812251965
  (0, 836)	0.14556222812251965
  :	:
  (4135, 8871)	0.07154271542163933
  (4135, 16152)	0.07327358549803296
  (4135, 26919)	0.07154271542163933
  (4135, 15682)	0.07812310961344454
  (4135, 14164)	0.07812310961344454
  (4135, 11

In [43]:
model = LogisticRegression()

In [44]:
model.fit(X_train_features, Y_train)

In [45]:
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(Y_train, prediction_on_training_data)

In [46]:
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9970986460348162


In [47]:
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(Y_test, prediction_on_test_data)

In [25]:
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.987987987987988


PREDICTIVE SYSTEM

In [26]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# convert text to feature vectors
input_data_features = feature_extraction.transform(input_mail)

# making prediction

prediction = model.predict(input_data_features)
print(prediction)


if (prediction[0]==1):
  print('Ham mail')

else:
  print('Spam mail')


['spam']
Spam mail
