In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
df = pd.read_csv('spam.csv',encoding="ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
df = df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'])

In [4]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [6]:
df.dropna(inplace = True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 130.6+ KB


In [8]:
# df=df.drop(columns  = ['Unnamed: 0'])

In [9]:
df.rename(columns={'v2':'email','v1':'spam'},inplace = True)

In [10]:
df.head()

Unnamed: 0,spam,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
df['spam'] = df['spam'].apply(lambda x:1 if x=='spam' else 0)

In [12]:
df['email'] = df['email'].apply(lambda x: x.replace('\n',' '))

In [13]:
df.head()

Unnamed: 0,spam,email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
df['spam'].value_counts()

0    4825
1     747
Name: spam, dtype: int64

In [15]:
f = open('english_stopwords.txt')

In [16]:
stopwords = f.read()

In [17]:
import nltk
from nltk.stem.porter import PorterStemmer
import string

In [18]:
def preprocessing(text):
    text = text.lower()
    text = text.split()
    y = []
    for word in text:
        if word.isalnum():
            y.append(word)
    text = y[:]
    y.clear()
    for word in text:
        if word not in stopwords and word not in string.punctuation:
            y.append(word)
    
    return y

In [19]:
ps = PorterStemmer()

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [20]:
df['email'] = df['email'].apply(stem)

In [21]:
df['email']=df['email'].apply(preprocessing)

In [22]:
df['email'] = df['email'].apply(lambda x: " ".join(x))

In [23]:
df.head()

Unnamed: 0,spam,email
0,0,go until jurong avail onli in bugi n great wor...
1,0,ok joke wif u
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli u c alreadi then
4,0,nah i think he goe to he live around here though


In [24]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [25]:
cv = CountVectorizer(max_features=10000)
tfidf = TfidfVectorizer(max_features=3000)

In [26]:
# x = cv.fit_transform(df['email']).toarray()
x = tfidf.fit_transform(df['email']).toarray()

In [27]:
x.shape

(5572, 3000)

In [28]:
y = df['spam']

In [29]:
y.shape

(5572,)

In [30]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.svm import SVC

In [31]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=2)

In [32]:
mnb = MultinomialNB()
mnb.fit(xtrain,ytrain)
ypred = mnb.predict(xtest)
print(accuracy_score(ytest,ypred))
print(confusion_matrix(ytest,ypred))
print(precision_score(ytest,ypred))

0.9479820627802691
[[956   1]
 [ 57 101]]
0.9901960784313726


In [33]:
rf = RandomForestClassifier()
rf.fit(xtrain,ytrain)
ypred = rf.predict(xtest)
print(accuracy_score(ytest,ypred))
print(confusion_matrix(ytest,ypred))
print(precision_score(ytest,ypred))

0.9614349775784753
[[957   0]
 [ 43 115]]
1.0


In [126]:
bnb = BernoulliNB()
bnb.fit(xtrain,ytrain)
ypred = bnb.predict(xtest)
print(accuracy_score(ytest,ypred))
print(confusion_matrix(ytest,ypred))
print(precision_score(ytest,ypred))

0.9668161434977578
[[949   8]
 [ 29 129]]
0.9416058394160584


In [35]:
et = ExtraTreesClassifier()
et.fit(xtrain,ytrain)
ypred = et.predict(xtest)
print(accuracy_score(ytest,ypred))
print(confusion_matrix(ytest,ypred))
print(precision_score(ytest,ypred))

In [112]:
svc = SVC(kernel = 'sigmoid',gamma =1.0)
svc.fit(xtrain,ytrain)
ypred = svc.predict(xtest)
print(accuracy_score(ytest,ypred))
print(confusion_matrix(ytest,ypred))
print(precision_score(ytest,ypred))

0.9632286995515695
[[952   5]
 [ 36 122]]
0.9606299212598425


In [36]:
import pickle
pickle.dump(rf,open('model.pkl','wb'))
pickle.dump(tfidf,open('vectorizer.pkl','wb'))

In [37]:
import sklearn


In [39]:
sklearn_version

NameError: name 'sklearn_version' is not defined