In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Tab-separated file; column names are passed explicitly, so pandas treats
# every line (including the first) as data rather than as a header.
data = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

# Exploratory Data Analysis

In [None]:
# Preview the first rows to check that the 'label' and 'message' columns loaded as expected.
data.head()

In [None]:
# Summary statistics for the whole frame.
data.describe()

In [None]:
# The same summary, computed separately for each distinct value of 'label'.
data.groupby('label').describe()

In [None]:
# Add a 'length' column holding len() of each message, then preview the result.
data['length'] = data['message'].map(len)
data.head()

In [None]:
# Distribution of message lengths across the whole data set, in 40 bins.
data['length'].plot(kind = 'hist', bins = 40)

In [None]:
# One length histogram per label value, so the distributions can be compared.
data.hist(
    column='length',
    by='label',
    bins=40,
    figsize=(10, 4),
)

## Pre Processing

So, message length can be a good feature for telling the two labels apart.

In [None]:
# Translation table that deletes every character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK stop-word corpus is read only once,
# not once per word of every message.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them at most once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def text_PreProcess(message, stop_words=None):
    """Turn a raw message into a list of lowercase tokens.

    Steps:
      1. delete every character found in ``string.punctuation``;
      2. split the remaining text on whitespace;
      3. lowercase each word;
      4. drop words that are stop words.

    Parameters
    ----------
    message : str
        The raw text to tokenize.
    stop_words : iterable of str, optional
        Words to discard (matched case-insensitively). When omitted, NLTK's
        English stop-word list is used.

    Returns
    -------
    list of str
        The surviving lowercase tokens, in their original order.
    """
    # get rid of punctuations
    no_punctuation = message.translate(_PUNCTUATION_TABLE)

    if stop_words is None:
        excluded = _english_stop_words()
    else:
        excluded = frozenset(word.lower() for word in stop_words)

    # Lowercase *before* the stop-word check. Comparing the original casing let
    # capitalised stop words such as "The" or "You" slip through the filter,
    # because they never matched the lowercase entries they were compared to.
    lowered = (word.lower() for word in no_punctuation.split())
    return [word for word in lowered if word not in excluded]

In [None]:
# Sanity check: run the tokenizer on the first few messages only.
data['message'].head().apply(text_PreProcess)

## Vectorization

In [None]:
# Learn the vocabulary from all messages, using text_PreProcess as the tokenizer.
# fit() returns the fitted vectorizer itself, so the chained result is the estimator.
bag_of_words = (
    CountVectorizer(analyzer=text_PreProcess)
    .fit(data['message'])
)

In [None]:
# Number of distinct tokens in the vocabulary learned by the vectorizer.
len(bag_of_words.vocabulary_)

In [None]:
# Pick the message stored under index label 0 as a worked example and display it.
sample = data['message'][0]
sample

In [None]:
# transform() takes a collection of documents, so the single message is wrapped
# in a list. A list (rather than a set) keeps document order and duplicates,
# which matters as soon as more than one message is passed.
sample_vector = bag_of_words.transform([sample])
# Sparse representation: one entry per vocabulary token present in the message.
print(sample_vector)
# (number of documents, vocabulary size)
print(sample_vector.shape)

## Pipeline for Naive Bayes Classifier

In [None]:
# Each step is a (name, estimator) pair: the vectorizer turns raw messages into
# token counts, which are then fed to the multinomial Naive Bayes model.
pipeline = Pipeline(steps=[
    ('Bag of Words Transform', CountVectorizer(analyzer=text_PreProcess)),
    ('Naive Baye\'s Classifier', MultinomialNB()),
])

## Train / Test split

In [None]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split only.
pipeline.fit(X_train, y_train)

In [None]:
# Predict a label for every message in the held-out test split.
predictions = pipeline.predict(X_test)

In [None]:
# scikit-learn's metric functions take (y_true, y_pred) in that order.
# Passing the predictions first transposes the confusion matrix and swaps
# precision with recall in the report, so the true labels go first here.
# Confusion matrix: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, predictions))
print('\n')
print(classification_report(y_test, predictions))