In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Location of the labelled email dataset, relative to the notebook's working directory.
DATA_PATH = "emails.csv"

# Read the CSV into a DataFrame; this untouched copy is kept as `raw_data`.
raw_data = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the raw frame to check the columns loaded as expected.
raw_data.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [4]:
# Clean the raw frame without corrupting the label column.
# A blanket ``where(notnull, '')`` would also write '' into `spam`, turning the labels
# into strings and making the later ``astype('int')`` cast fail. Instead:
#   * rows with a missing label are dropped (they cannot be used for training/evaluation),
#   * a missing email body becomes an empty string so the vectorizer can still process it.
# When the data has no missing values the result is identical to the raw frame.
mail_data = raw_data.dropna(subset=['spam']).fillna({'text': ''})

In [5]:
# Preview the first five rows of the cleaned frame.
mail_data.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [6]:
# Dimensions of the cleaned frame as a (rows, columns) tuple.
mail_data.shape

(5728, 2)

In [7]:
# Separate the model input (email text) from the target (the `spam` flag).
x, y = mail_data['text'], mail_data['spam']

In [8]:
# Hold out 20% of the emails for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y` if the classes
# turn out to be imbalanced (this would change the reported scores).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [9]:
# Report the size of the training partition, then the test partition.
for partition in (x_train, x_test):
    print(partition.shape)

(4582,)
(1146,)


In [10]:
# TF-IDF vectorizer: lowercases the text and drops English stop words; min_df=1 keeps
# every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)

# Learn the vocabulary and IDF weights from the training emails only, then reuse that
# fitted state for the test emails so no test-set information leaks into the features.
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

# Make sure both label series are integer typed before fitting/scoring.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [11]:
# Print the sparse TF-IDF training matrix: each stored entry is listed as
# "(row, column)  weight", and long output is elided in the middle.
print(x_train_features)

  (0, 1631)	0.11948158504424804
  (0, 2175)	0.1139366811829867
  (0, 2270)	0.09287045505205638
  (0, 7353)	0.0655788157361805
  (0, 15304)	0.0663995869621661
  (0, 28666)	0.06431166772110039
  (0, 28707)	0.06016733162188114
  (0, 12928)	0.06467963832229102
  (0, 21474)	0.05636789397124751
  (0, 21089)	0.03528233862778253
  (0, 33086)	0.043228163741262066
  (0, 14600)	0.044761414388067036
  (0, 31687)	0.05286353166660429
  (0, 8779)	0.15915971536413323
  (0, 8929)	0.2046990616246818
  (0, 31574)	0.08608147431672733
  (0, 29719)	0.09538081914466627
  (0, 15873)	0.06034727389686767
  (0, 24009)	0.059727569756054304
  (0, 25478)	0.100523433523679
  (0, 3685)	0.1139366811829867
  (0, 30510)	0.07784647543051056
  (0, 15183)	0.07546212040150725
  (0, 32265)	0.044560090228177425
  (0, 6690)	0.06742389347978599
  :	:
  (4581, 429)	0.07532196662652381
  (4581, 18624)	0.062223221403916826
  (4581, 32353)	0.10192506274469543
  (4581, 4987)	0.11815520710746134
  (4581, 16993)	0.10310140246650003
  

In [12]:
# Create the classifier; no arguments are passed, so scikit-learn's defaults apply.
model = LogisticRegression()

In [13]:
# Train the classifier on the TF-IDF features of the training emails and their labels.
model.fit(X=x_train_features, y=y_train)

In [14]:
# Accuracy on the data the model was fitted on (an optimistic figure; compare it with
# the held-out score to gauge overfitting).
prediction_training = model.predict(x_train_features)
training_accuracy = accuracy_score(y_true=y_train, y_pred=prediction_training)
training_accuracy

0.9965080750763858

In [15]:
# Accuracy on the held-out test emails, which the model did not see during fitting.
prediction_testing = model.predict(x_test_features)
testing_accuracy = accuracy_score(y_true=y_test, y_pred=prediction_testing)
testing_accuracy

0.9834205933682374

In [19]:
# Classify one hard-coded sample message with the trained model.
# The pound sign in the sample was mojibake ("Â£": UTF-8 bytes decoded as Latin-1) and
# has been restored to "£".
input_mail = ["Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged"]

# Convert the text to a feature vector using the already-fitted vectorizer
# (transform only — the vocabulary and IDF weights are not refitted here).
input_data_features = feature_extraction.transform(input_mail)

# Predict the label for the sample; the labels come from the `spam` column (1 = spam).
prediction = model.predict(input_data_features)
print(prediction)

[0]


In [18]:
# Interactive check: read one email from the user and classify it with the trained model.
input_emails = input("Enter the email: ")

if not input_emails.strip():
    # A blank entry contains no terms for the vectorizer, so any label printed for it
    # would be meaningless; say so instead of reporting a classification.
    print('No email text entered; nothing to classify.')
else:
    # The vectorizer expects an iterable of documents, hence the single-element list.
    input_data_features = feature_extraction.transform([input_emails])
    prediction = model.predict(input_data_features)
    print(prediction)

    # Label 0 is reported as ham (legitimate mail); any other label as spam.
    if prediction[0] == 0:
        print('Ham mail')
    else:
        print('Spam mail')

[1]
Spam mail
