# Detecting and Classifying Spam Emails

- The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.

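- The files contain one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw text. Note that the code below loads `emails.csv` and reads the columns `text` (raw message) and `spam` (1 = spam, 0 = ham) instead.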

Data Source: https://www.kaggle.com/uciml/sms-spam-collection-dataset

# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [None]:
# Load the labelled messages into a DataFrame; later cells read its
# `text` and `spam` columns.
ds = pd.read_csv('emails.csv')

In [None]:
# Bare expression: in a notebook this renders the DataFrame as the cell output.
ds

# Visualising the data

In [None]:
# Bar chart of class frequencies: how many rows carry each `spam` label.
# The column is passed by keyword because seaborn 0.12 made every argument
# after `data` keyword-only, so a positional Series is no longer read as `x`.
sns.countplot(x='spam', data=ds)

In [None]:
# Partition the rows by label value: 1 goes to `spam`, 0 goes to `ham`.
spam = ds.loc[ds['spam'] == 1]
ham = ds.loc[ds['spam'] == 0]

In [None]:
# Report each class's share of the whole dataset, rounded to one decimal place.
for label, subset in (('Spam %:', spam), ('ham %:', ham)):
    print(label, round(len(subset) / len(ds) * 100, 1))

# Taking care of missing data

In [None]:
# Visual check for missing values: the heatmap is drawn from the boolean
# null mask of the DataFrame, so any missing cell would stand out.
# Observation recorded for this dataset: no missing data.
sns.heatmap(
    ds.isna(),
    cmap='Blues',
    cbar=False,
    yticklabels=False,
)

# Encoding categorical variables - count vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the `text` column and encode every row as
# token counts in a single pass.
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split performed later, so the learned vocabulary also includes
# words that occur only in the held-out rows.
vec = CountVectorizer()
cv = vec.fit_transform(ds['text'])

In [None]:
# Show the vocabulary learned by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method
# only when running on a release that predates the new one.
try:
    feature_names = vec.get_feature_names_out()
except AttributeError:  # scikit-learn < 1.0
    feature_names = vec.get_feature_names()
print(feature_names)

In [None]:
# Print the count matrix in dense form.
# NOTE(review): toarray() materialises the whole matrix as a dense array,
# which can be memory-hungry for a large vocabulary — confirm it fits.
print(cv.toarray())

In [None]:
# Dimensions of the count matrix (shown as the cell output).
cv.shape

# Splitting the dataset into the training set and test set

In [None]:
# Features are the token-count matrix; targets are the raw values of the
# `spam` column.
X, y = cv, ds['spam'].values

In [None]:
# Dimensions of the feature matrix (shown as the cell output).
X.shape

In [None]:
# Dimensions of the label array (shown as the cell output).
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fitting the Naive Bayes classifier to the training set

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the training
# split only. The fit call stays as the last expression so the fitted
# estimator is still shown as the cell output.
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)

In [None]:
# Predicting the test set results

# One predicted label per row of the held-out test matrix, produced by the
# classifier fitted on the training split.
y_pred = nb.predict(X_test)

# Evaluating the model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Cross-tabulate the true test labels against the predictions
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Text summary of per-class precision, recall and F1 on the test split.
# `classification_report` is imported together with `confusion_matrix` in
# the preceding evaluation cell.
print(classification_report(y_test, y_pred))