In [1]:
from collections import Counter
import pandas as pd
import re
import nltk
import numpy as np
from dfply import bind_rows
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
# Load the labelled messages from 'spam.csv', decoded as latin-1.
# header=0 marks the file's first row as a header row; `names` supplies the
# column names 'class' and 'text', and `usecols` keeps only those two columns.
# NOTE(review): how `names`/`usecols` line up with the file's physical columns
# depends on the CSV layout, which is not visible here — confirm against the file.
df = pd.read_csv('spam.csv', 
                 header=0, 
                 encoding="latin-1", 
                 names=['class', 'text'], 
                 usecols=['class', 'text'])

## Funções para pré-processamento do texto

In [71]:
# Module-level switch read by preprocess_text() at call time:
# True  -> only tokenize the text;
# False -> run the full cleaning pipeline before tokenizing.
INITIAL_PRE_PROCESS = True

def text_lowercase(text):
    """Return a lowercase copy of `text`."""
    lowered = text.lower()
    return lowered

def remove_numbers(text):
    """Return `text` with every run of digits deleted (nothing is put in its place)."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_punctuation(text):
    """Return `text` with each listed punctuation/symbol character replaced by a space.

    Every matched character is substituted individually, so consecutive
    symbols produce consecutive spaces.
    """
    # NOTE(review): ';', '<' and '>' are not part of this character class and
    # are therefore left in the text — confirm whether that is intended.
    symbols = re.compile(r'[!"#\$%&\'\(\)\*\+,\-\–\/:=\?@\[\\\]\^_`{\|}~º(...)]')
    return symbols.sub(' ', text)

def tokenize(text):
    """Split `text` into a list of tokens using NLTK's `word_tokenize`."""
    tokens = nltk.tokenize.word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stop-word list.

    The comparison is an exact string match against the list, and the
    original token order is preserved.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def stemmer(tokens):
    """Return a list with the Porter stem of every token, in order."""
    stem = PorterStemmer().stem
    stems = []
    for token in tokens:
        stems.append(stem(token))
    return stems

def lemmatizer(tokens):
    """Return a list with the WordNet lemma of every token, in order."""
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatize(token))
    return lemmas

def preprocess_text(text):
    """Turn a raw message into a list of tokens.

    The module-level flag INITIAL_PRE_PROCESS is read on every call:
    when it is truthy the text is only tokenized; otherwise the text is
    lowercased, stripped of digits and punctuation, tokenized and lemmatized.
    """
    # Simple flow: tokenization only.
    if INITIAL_PRE_PROCESS:
        return tokenize(text)

    # Full flow. remove_stopwords() is not applied here (the step is disabled).
    cleaned = remove_punctuation(remove_numbers(text_lowercase(text)))
    return lemmatizer(tokenize(cleaned))

# Treinando o modelo Naive Bayes

In [72]:
# Split the corpus by label and replace the textual 'class' column with a
# numeric 'out' column (0 for ham, 1 for spam). Resulting columns: text, out.
ham_documents = df[df['class'] == 'ham'].assign(out=0).drop(columns=['class'])
spam_documents = df[df['class'] == 'spam'].assign(out=1).drop(columns=['class'])

In [73]:
def fit(ham_documents, spam_documents):
    """Estimate Naive Bayes parameters with add-one smoothing from two corpora.

    Args:
        ham_documents: DataFrame whose 'text' column holds the ham messages.
        spam_documents: DataFrame whose 'text' column holds the spam messages.

    Returns:
        A tuple `(log_prior_ham, log_prior_spam, log_likelihood_ham,
        log_likelihood_spam, vocabulary)` where the priors are the log of each
        class's share of the documents, the likelihoods map every vocabulary
        word to `log((count_in_class + 1) / (tokens_in_class + len(vocabulary)))`,
        and `vocabulary` is the set of all tokens seen in either class.
    """
    n_ham = len(ham_documents)
    n_spam = len(spam_documents)
    total_documents = n_ham + n_spam
    log_prior_ham = np.log(n_ham / total_documents)
    log_prior_spam = np.log(n_spam / total_documents)

    # Tokenize every document exactly once and count tokens per class.
    ham_counter = Counter(
        word for document in ham_documents['text'] for word in preprocess_text(document)
    )
    spam_counter = Counter(
        word for document in spam_documents['text'] for word in preprocess_text(document)
    )

    # The vocabulary is the union of the tokens already counted, so there is
    # no need to run the preprocessing over the whole corpus a second time.
    vocabulary = set(ham_counter) | set(spam_counter)

    # Add-one smoothing: each vocabulary word gets one extra pseudo-count.
    ham_denominator = sum(ham_counter.values()) + len(vocabulary)
    spam_denominator = sum(spam_counter.values()) + len(vocabulary)

    # Counter returns 0 for words absent from a class, so smoothing covers them.
    log_likelihood_ham = {
        word: np.log((ham_counter[word] + 1) / ham_denominator) for word in vocabulary
    }
    log_likelihood_spam = {
        word: np.log((spam_counter[word] + 1) / spam_denominator) for word in vocabulary
    }

    return (log_prior_ham, log_prior_spam, log_likelihood_ham, log_likelihood_spam, vocabulary)

# Testando o modelo Naive Bayes

Será considerado o encoding: 0 = ham, 1 = spam

In [74]:
def predict(document, model):
    """Classify `document` with a model tuple produced by `fit`.

    Starting from the class log-priors, the log-likelihood of every token
    found in the vocabulary is added to each class score; tokens outside the
    vocabulary are skipped. Returns the index of the larger score
    (0 = ham, 1 = spam); on a tie `np.argmax` picks index 0.
    """
    (log_prior_ham, log_prior_spam,
     log_likelihood_ham, log_likelihood_spam, vocabulary) = model

    # scores[0] accumulates the ham score, scores[1] the spam score.
    scores = [log_prior_ham, log_prior_spam]
    for token in preprocess_text(document):
        if token not in vocabulary:
            continue
        scores[0] += log_likelihood_ham[token]
        scores[1] += log_likelihood_spam[token]

    return np.argmax(scores)

In [75]:
def cross_validation(ham_documents, spam_documents, k=10, random_state=None):
    """Yield `k` shuffled (train, test) splits for k-fold cross-validation.

    The two frames are concatenated, shuffled and partitioned into `k`
    contiguous folds whose sizes differ by at most one row. Every row appears
    in exactly one test fold, and each fold is never empty.

    Args:
        ham_documents: DataFrame with the ham rows.
        spam_documents: DataFrame with the spam rows (same columns).
        k: number of folds.
        random_state: optional seed forwarded to `DataFrame.sample` so the
            shuffle (and therefore the folds) can be reproduced. The default
            `None` keeps the shuffle non-deterministic.

    Yields:
        `(train_documents, test_documents)` DataFrame pairs, one per fold.

    Raises:
        ValueError: if `k` is smaller than 2 or larger than the number of
            documents. Because this is a generator, the error surfaces when
            iteration starts.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    documents = pd.concat([ham_documents, spam_documents])
    n_documents = len(documents)
    if not 2 <= k <= n_documents:
        raise ValueError(
            'k must be between 2 and the number of documents (%d), got %r'
            % (n_documents, k)
        )

    documents = documents.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # After reset_index the labels equal the row positions, so the same
    # integer arrays select the test rows and drop them from the train set.
    fold_positions = np.array_split(np.arange(n_documents), k)

    for test_positions in fold_positions:
        test_documents = documents.iloc[test_positions]
        train_documents = documents.drop(index=test_positions)
        yield (train_documents, test_documents)

In [76]:
def experiment(setup):
    """Train on one cross-validation split and classify its test documents.

    Args:
        setup: a `(train_dataset, test_dataset)` pair of DataFrames, each with
            a 'text' column and an 'out' column (0 = ham, 1 = spam).

    Returns:
        A list of `(predicted_outcome, expected_outcome)` tuples, one per row
        of the test dataset, in row order.
    """
    train_dataset, test_dataset = setup

    ham_train = train_dataset[train_dataset['out'] == 0]
    spam_train = train_dataset[train_dataset['out'] == 1]

    naive_bayes_classifier = fit(ham_train, spam_train)

    # Read the columns by name so the result does not depend on column order.
    return [
        (predict(document, naive_bayes_classifier), expected_outcome)
        for document, expected_outcome in zip(test_dataset['text'], test_dataset['out'])
    ]

In [77]:
def score(results):
    """Compute precision, recall and F1 for the spam class (label 1).

    Args:
        results: iterable of `(outcome, expected_outcome)` pairs where both
            values are 0 (ham) or 1 (spam).

    Returns:
        A `(precision, recall, f1)` tuple of floats. A metric whose
        denominator is zero (no predicted spam, no actual spam, or both
        precision and recall equal to zero) is reported as 0.0 instead of
        raising ZeroDivisionError or producing NaN.
    """
    true_positives = 0
    false_positives = 0
    false_negatives = 0

    # Plain integer counters keep the arithmetic in Python ints even when the
    # outcomes are numpy integers.
    for outcome, expected_outcome in results:
        if outcome == 1 and expected_outcome == 1:
            true_positives += 1
        elif outcome == 1 and expected_outcome == 0:
            false_positives += 1
        elif outcome == 0 and expected_outcome == 1:
            false_negatives += 1

    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives

    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0

    return (precision, recall, f1)

In [78]:
def metrics(k=10):
    """Run k-fold cross-validation and print the average precision, recall and F1.

    Args:
        k: number of cross-validation folds (defaults to 10).

    The folds are built from the module-level `ham_documents` and
    `spam_documents` frames; each fold is trained and evaluated with
    `experiment` and scored with `score`. The per-fold scores are summed and
    divided by `k`.
    """
    sum_precision = 0
    sum_recall = 0
    sum_f1 = 0

    for setup in cross_validation(ham_documents, spam_documents, k):
        precision, recall, f1 = score(experiment(setup))
        sum_precision += precision
        sum_recall += recall
        sum_f1 += f1

    print('average precision: %0.3f' % (sum_precision / k))
    print('average recall: %0.3f' % (sum_recall / k))
    print('average f1: %0.3f' % (sum_f1 / k))

# Testes do modelo (inicial e melhorado)

Para um primeiro teste do modelo, foi realizado o treinamento com um fluxo simples de pré-processamento, no qual era realizada somente a tokenização das palavras. Para executar o modelo dessa forma, foi criada a flag `INITIAL_PRE_PROCESS`, que, quando recebe o valor `True`, ativa o fluxo simples citado anteriormente.

Caso essa flag receba o valor `False` será realizado todo o pipeline de pré-processamento sobre o texto com as seguintes etapas:

1. Conversão das palavras em minúsculo
2. Remoção de números
3. Remoção de pontuação
4. Tokenização
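5. Remoção de stop-words (etapa implementada em `remove_stopwords`, mas atualmente desativada: a chamada está comentada em `preprocess_text`)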
6. Lematização

Abaixo são realizadas demonstrações das duas execuções junto das métricas obtidas em sua execução:

## Modelo inicial - com pré-processamento simples

In [63]:
# Baseline run: with the flag set to True, preprocess_text() only tokenizes
# each message before training and evaluation.
INITIAL_PRE_PROCESS = True
metrics()

average precision: 0.972
average recall: 0.910
average f1: 0.940


## Modelo melhorado - com pré-processamento completo

In [82]:
# Full-pipeline run: with the flag set to False, preprocess_text() lowercases
# the text, removes digits and punctuation, tokenizes and lemmatizes.
INITIAL_PRE_PROCESS = False
metrics()

average precision: 0.957
average recall: 0.942
average f1: 0.949


# Conclusão

Como pode-se observar, o modelo inicial já apresentou um bom desempenho na classificação dos documentos (F1 de aproximadamente 94,0%). Esse resultado melhorou com o pipeline completo de pré-processamento: o F1 subiu cerca de 0,9 ponto percentual, atingindo 94,9%, e o recall passou de 91,0% para 94,2%, ao custo de uma pequena queda na precisão (de 97,2% para 95,7%).