## Spam detection

We use the famous spam/ham data set of labeled text messages: each message is tagged as either `spam` or `ham` (not spam).

In [None]:
!mkdir data
!wget -O data/sms.tsv https://raw.githubusercontent.com/MJafarMashhadi/MachineLearningWorkshop/master/data/sms.tsv

In [None]:
import re, string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer
import matplotlib.pyplot as plt 
%matplotlib inline

# Read the tab-separated file (it has no header row, so column names are supplied here)
# and store the label column as a pandas categorical.
sms = (
    pd.read_csv('data/sms.tsv', sep='\t', header=None, names=['label', 'text'])
    .assign(label=lambda frame: frame['label'].astype('category'))
)
sms

In [None]:
# Number of messages per label. `label` is categorical, so `observed` is passed
# explicitly: recent pandas versions warn when grouping on a categorical without it.
# Every category here comes from the data itself, so the result is unchanged.
sms.groupby('label', observed=True).count()

In [None]:
# Print a few hand-picked spam messages (selected by index label), separated by blank lines.
spam_rows = sms.loc[sms['label'] == 'spam']
picked_texts = spam_rows.loc[[11, 5, 123, 147, 5566], 'text']
print(('\n'*3).join(picked_texts.tolist()))

Let's clean the texts up first:
* Make everything lowercase
* Remove punctuation
* Clean up whitespace
* Remove stop words **(\*)**
* Stem the words **(\*)**

In [None]:
# English Snowball stemmer; `clean_text` below uses this same object.
stemmer = SnowballStemmer(language='english')

# Quick demonstration: print each sample word next to its stem.
demo_words = ['cats', 'cat', 'graduation', 'behavioural']
for w in demo_words:
    print(f'{w:15} --stemmer--> {stemmer.stem(w)}')

In [None]:
# Display scikit-learn's built-in English stop-word collection
# (imported from sklearn.feature_extraction.text); clean_text drops any token found in it.
ENGLISH_STOP_WORDS

In [None]:
def clean_text(text):
    """
    Normalize a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text
      2. collapse every run of whitespace into a single space
      3. delete ASCII punctuation (the characters in ``string.punctuation``)
      4. drop tokens that appear in ``ENGLISH_STOP_WORDS``
      5. stem the remaining tokens with the notebook-level ``stemmer``

    text: str, returns: str
    """
    text = text.lower()
    # Raw string on purpose: '\s' inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text = re.sub(r'\s+', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Stop words are matched after punctuation removal, i.e. against the stripped tokens.
    text = ' '.join([stemmer.stem(w) for w in text.split() if w not in ENGLISH_STOP_WORDS])
    return text

# Replace each message with its cleaned form. This overwrites the raw text in place,
# so re-running the cell would clean the already-cleaned text again.
sms['text'] = sms['text'].map(clean_text)

In [None]:
# Same hand-picked spam messages as before, now showing the cleaned text.
spam_only = sms[sms['label'] == 'spam']
sample_texts = spam_only.loc[[11, 5, 123, 147, 5566], 'text'].tolist()
print(('\n'*3).join(sample_texts))

### Vectorization


In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


def build_tfidf(documents, min_df=2):
    """
    Fit a TF-IDF vectorizer on ``documents`` and return the fitted object.

    documents: iterable of str the vectorizer is fitted on.
    min_df: minimum document frequency, forwarded to ``TfidfVectorizer(min_df=...)``.
        Defaults to 2, the value that used to be hard-coded, so existing calls
        behave exactly as before.
    returns: the fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(min_df=min_df)
    # `fit` returns the vectorizer itself, so it can be returned directly.
    return tfidf.fit(documents)

# Fit on the full cleaned corpus and report how many distinct terms were kept.
vectorizer = build_tfidf(documents=sms['text'])
print('Vocabulary size =', len(vectorizer.vocabulary_))

In [None]:
# Push one raw message through the same cleaning + TF-IDF steps used for the corpus.
sample_message = """FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"""
sample_matrix = vectorizer.transform([clean_text(sample_message)])
transformed = sample_matrix[0]

In [None]:
# Show only the non-zero entries of the vectorized sample message.
nz_rows, nz_cols = transformed.nonzero()
transformed[nz_rows, nz_cols]

In [None]:
# Test-set score of each model, one entry per random split.
scores = {'Logistic Regression': [], 'Naïve Bayes': []}

for split_seed in range(10):
    # Stratified 80/20 split; the seed (and therefore the split) changes each iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        sms['text'], sms['label'],
        test_size=0.2, random_state=split_seed, stratify=sms['label'],
    )

    # Fit TF-IDF on the training texts only, then reuse it for the held-out texts.
    # Note: this rebinds the notebook-level `vectorizer`, `X_train` and `X_test`.
    vectorizer = build_tfidf(X_train)
    X_train, X_test = vectorizer.transform(X_train), vectorizer.transform(X_test)

    nb = BernoulliNB().fit(X_train, y_train)
    lr = LogisticRegression(solver='liblinear', random_state=0).fit(X_train, y_train)

    for model_name, model in (('Logistic Regression', lr), ('Naïve Bayes', nb)):
        scores[model_name].append(model.score(X_test, y_test))

# Box plot of the per-split test scores collected in `scores`.
# Working on the returned Axes keeps the limits and labels tied to this figure, and the
# trailing `;` on the last line stops the cell from echoing a return value as output.
ax = pd.DataFrame(scores).plot(kind='box')
ax.set_ylim(0.9, 1)
ax.set_title('Test accuracy over 10 random train/test splits')
ax.set_ylabel('Accuracy');