In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Load the raw e-mail dataset from the CSV file next to the notebook.
df = pd.read_csv(filepath_or_buffer='emails.csv')

In [6]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [7]:
# Count missing values per column.
df.isna().sum()

text    0
spam    0
dtype: int64

In [8]:
# Count rows that exactly repeat an earlier row (first occurrence is not counted).
df.duplicated(keep='first').sum()

33

In [9]:
# (rows, columns) of the frame before duplicates are removed.
df.shape

(5728, 2)

In [10]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Rebind the returned frame instead of using `inplace=True`: the in-place form
# gives no benefit and mutates an object that earlier cells already displayed.
df = df.drop_duplicates()

# Safety net: the frame must be duplicate-free from here on.
assert not df.duplicated().any()

In [11]:
# (rows, columns) after de-duplication; compare with the shape shown earlier.
df.shape

(5695, 2)

In [12]:
# Class balance of the target column: label 1 marks spam, label 0 marks ham.
df['spam'].value_counts()

spam
0    4327
1    1368
Name: count, dtype: int64

In [13]:
# Features are the raw message text; the target is the 0/1 spam label.
x, y = df['text'], df['spam']

In [14]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [15]:
# Sanity-check the split sizes: train/test features, then train/test labels.
print(*(split.shape for split in (x_train, x_test, y_train, y_test)))

(4556,) (1139,) (4556,) (1139,)


In [16]:
# TF-IDF text vectorizer: keep every term that appears in at least one
# document, drop English stop words, and lowercase text before tokenizing.
# `lowercase` must be the boolean True; the string 'True' only worked by
# truthiness and is rejected by parameter validation in newer scikit-learn.
ft = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [17]:
# Fit the vectorizer on the training text only, then reuse that fitted
# vectorizer on the test text so nothing is learned from the held-out split.
x_train_ft = ft.fit_transform(x_train)
x_test_ft = ft.transform(x_test)

In [18]:
# Logistic-regression classifier created with the library's default settings.
model = LogisticRegression()

In [19]:
# Train the classifier on the vectorized training messages and their labels.
model.fit(x_train_ft,y_train)

In [20]:
# Predict on the same data the model was trained on (for the training accuracy).
pred = model.predict(x_train_ft)

In [21]:
# Training accuracy: fraction of training messages labelled correctly.
accuracy_score(y_true=y_train, y_pred=pred)

0.9964881474978051

In [22]:
# Predict labels for the held-out test messages.
test_pred = model.predict(x_test_ft)

In [23]:
# Test accuracy: fraction of held-out messages labelled correctly.
accuracy_score(y_true=y_test, y_pred=test_pred)

0.9824407374890255

In [24]:
# A single sample message, wrapped in a list because the vectorizer expects
# an iterable of documents.
input_mail = ["Subject: interview schedule for greg mikkelson  attached please find the interview packet for the above - referenced person .  the interview will happen tuesday , july 11 , 2000 . print all three documents  for your hard copies . if you have any questions , or conflicts of schedule ,  please do not hesitate to contact me .  liz alvarado  58983"]
# Use transform (not fit_transform) so the already-fitted vectorizer is reused.
input_mail_ft = ft.transform(input_mail)

In [25]:
# Classify the sample message and show the predicted label for its only entry.
input_mail_pred = model.predict(input_mail_ft)
input_mail_pred[0]

0

In [26]:
# Turn the numeric prediction into a readable verdict (1 means spam).
print("Spam Mail..." if input_mail_pred[0] == 1 else "Ham Mail...")

Ham Mail...


In [28]:
import pickle

# Persist both fitted objects needed for inference: the trained classifier
# and the TF-IDF vectorizer that produced its input features.
with open('SpamMail_LogisticRegression_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('Feature_TfidfVectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(ft, vectorizer_file)

In [29]:
# Reload the saved classifier (`ml`) and vectorizer (`ftr`) from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('SpamMail_LogisticRegression_model.pkl', 'rb') as model_file:
    ml = pickle.load(model_file)
with open('Feature_TfidfVectorizer.pkl', 'rb') as vectorizer_file:
    ftr = pickle.load(vectorizer_file)

In [36]:
# Another sample message, vectorized with the vectorizer reloaded from disk.
input_mail = ["you have won lucky draw please click on this link..."]
input_mail_ft = ftr.transform(input_mail)

In [37]:
# Classify with the reloaded model and show the predicted label for the message.
input_mail_pred = ml.predict(input_mail_ft)
input_mail_pred[0]

1

In [38]:
# Turn the numeric prediction into a readable verdict (1 means spam).
print("Spam Mail..." if input_mail_pred[0] == 1 else "Ham Mail...")

Spam Mail...


In [49]:
# Display the format version string exposed by the pickle module.
pickle.format_version

'4.0'

In [53]:
# Inspect the row at integer position 4 (positional lookup, not by index label).
df.iloc[4]

text    Subject: do not have money , get software cds ...
spam                                                    1
Name: 4, dtype: object

In [54]:
# Full, untruncated text of the message at integer position 4.
df['text'].iloc[4]

"Subject: do not have money , get software cds from here !  software compatibility . . . . ain ' t it great ?  grow old along with me the best is yet to be .  all tradgedies are finish ' d by death . all comedies are ended by marriage ."