In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Loading and Observing the Dataset

In [None]:
# Read emails.csv from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/spam-filter/emails.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

No Missing Values


In [None]:
# Number of missing values in each column.
df.isna().sum()

**Exploratory Data Analysis**

In [None]:
# Frequency of each distinct value in the 'spam' column.
df['spam'].value_counts()

In [None]:
# Bar chart of how often each value of 'spam' occurs.
# The column is passed by keyword: from seaborn 0.12 the only positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x='spam', data=df);

Checking the length of each email and its relation to the spam label

In [None]:
from nltk import word_tokenize

Function that tokenizes an email into words and returns the number of tokens

In [None]:
def count_words(text):
    """Return how many tokens ``word_tokenize`` produces for ``text``."""
    return len(word_tokenize(text))

Applying the function to df['text'] and storing the count in another column

In [None]:
# Store the token count of every email in a new 'count' column.
df['count'] = df['text'].map(count_words)

In [None]:
# Display the token-count column.
df['count']

In [None]:
# Mean token count for each value of 'spam'.
df.groupby('spam')['count'].mean()

# Text Preprocessing

**Function to process the text data: 1. Remove punctuation 2. Remove stop words (3. Stemming is applied in a separate step further below)**

In [None]:
import string
from nltk.corpus import stopwords


In [None]:
def process_text(text, stop_words=None):
    """Strip punctuation characters and stop words from ``text``.

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to drop. When omitted, the list returned by
        ``stopwords.words('english')`` is used.

    Returns
    -------
    str
        The remaining words, in their original casing and order, joined by
        single spaces. An input with no remaining words yields ``''``.
    """
    if stop_words is None:
        # Fetch the list once per call rather than once per word.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of a list scan per word.
    stop_words = frozenset(stop_words)

    # Delete every character listed in string.punctuation in a single pass.
    no_punc = text.translate(str.maketrans('', '', string.punctuation))

    # Words are compared lower-cased but kept in their original casing.
    return ' '.join(word for word in no_punc.split() if word.lower() not in stop_words)

In [None]:
# Overwrite the 'text' column with the output of process_text for each row.
df['text']=df['text'].apply(process_text)

In [None]:
# Display the 'text' column after cleaning.
df['text']

**After cleaning the text, we will now carry out stemming to reduce inflected words to their root form**

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [None]:
def stemming(text, word_stemmer=None):
    """Stem every whitespace-separated word in ``text``.

    Parameters
    ----------
    text : str
        Text whose words should be stemmed.
    word_stemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. Defaults to the notebook-level ``stemmer``.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    if word_stemmer is None:
        word_stemmer = stemmer
    # Split into words first: iterating over the string itself yields single
    # characters, so the stemmer would never see a whole word.
    return ' '.join(word_stemmer.stem(word) for word in text.split())

In [None]:
# Overwrite the 'text' column with the output of stemming for each row.
df['text']=df['text'].apply(stemming)

In [None]:
# Preview the first rows after preprocessing.
df.head()

**Now we will use CountVectorizer to convert the string data into a Bag of Words, i.e. word counts over a known vocabulary**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# fit_transform learns the vocabulary from df['text'] and returns the
# document-term count matrix.
# NOTE(review): the vocabulary is learned from every row, before the
# train/test split below — confirm this is acceptable for the evaluation.
vectorizer= CountVectorizer()
message_bow = vectorizer.fit_transform(df['text'])

In [None]:
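# Uncomment to inspect the learned vocabulary and the dense count matrix.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
#print(vectorizer.get_feature_names_out())
#print(message_bow.toarray())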

**Splitting the Data**

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing.
# random_state fixes the shuffle so the split, and every metric computed from
# it, is reproducible across runs. stratify keeps the proportions of the
# df['spam'] values the same in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    message_bow,
    df['spam'],
    test_size=0.20,
    random_state=42,
    stratify=df['spam'],
)

# Creating the Model and Its Evaluation

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training split
# (fit returns the estimator itself), then predict the held-out rows.
nb = MultinomialNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the test labels with the predictions.
print(classification_report(y_test,y_pred))

In [None]:
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(nb, X_test, y_test);

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(nb, X_test, y_test);

In [None]:
from sklearn.model_selection import KFold, cross_val_score
# 5-fold cross-validated accuracy over the full count matrix.
# shuffle=True draws a random permutation, so random_state is fixed to make
# the folds, and therefore the reported score, reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(nb, message_bow, df['spam'], cv=kfold, scoring="accuracy")
print("Accuracy using Cross Validation is :", np.mean(cv_scores)*100, " %")