In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the CSV, loading only the label column (v1) and the text column (v2).
# Selecting columns up front avoids dropping hard-coded "Unnamed: N" columns
# afterwards (which raises KeyError if the file does not contain them) and
# avoids mutating the frame in place.
df = (
    pd.read_csv("data/spam.csv", encoding="ISO-8859-1", usecols=["v1", "v2"])
    .rename(columns={"v1": "target", "v2": "message"})
)
df

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Dimensions of the frame as (rows, columns)
df.shape

(5572, 2)

In [4]:
# Text pre-processing helper
def preprocess_text(text):
    """Normalize a raw message into lowercase, space-separated tokens.

    The text is lowercased, every character that is not an ASCII letter or
    digit is replaced by a space, and runs of whitespace are collapsed so the
    result has single spaces between tokens and no leading/trailing space.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and NaN are treated as an empty message; any
        other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned message (possibly the empty string).
    """
    # Missing cells show up as None or float NaN (NaN != NaN); map them to an
    # empty message instead of failing on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # split() with no argument already discards empty/whitespace-only pieces,
    # so no extra strip() or filtering is needed before re-joining.
    return " ".join(text.split())


In [5]:
# Run the pre-processing function over every message in the frame
df["message"] = df["message"].apply(preprocess_text)
df

Unnamed: 0,target,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that so any other suggest...
5570,ham,the guy did some bitching but i acted like i d...


In [6]:
# Split the data into training and test sets, stratified on the target column
X_train, X_test, y_train, y_test = train_test_split(
    df["message"].values,
    df["target"].values,
    test_size=0.33,
    stratify=df["target"],
    random_state=14,
)

In [7]:
# Create a bag-of-words vectorizer configured with the 'english' stop-word setting
bow = CountVectorizer(stop_words='english')

In [8]:
# Fit the vectorizer on the training messages only (the test split is not used here)
bow.fit(X_train)

CountVectorizer(stop_words='english')

In [9]:
# Vectorize both splits with the fitted vectorizer.
# NOTE: this rebinds X_train / X_test from raw texts to the transformed data,
# so re-running this cell (or the fit cell above) without re-running the split
# will feed already-transformed data back into the vectorizer.
X_train, X_test = bow.transform(X_train), bow.transform(X_test)

In [10]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that do not provide the new one.
if hasattr(bow, "get_feature_names_out"):
    # .tolist() converts the returned NumPy array into a plain list of str
    terms = bow.get_feature_names_out().tolist()
else:
    terms = bow.get_feature_names()
terms

['00',
 '000',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07008009200',
 '07046744435',
 '07090201529',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '0776xxxxxxx',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '07xxxxxxxxx',
 '08',
 '0800',
 '08000407165',
 '08000776320',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448714184',
 '0845',
 '08452810071',
 '08452810073',
 '0870',
 '08700435505150p',
 '08700469649',
 '08700621170150p',
 '08701417012',
 '08701417012150p',
 '0870141701216',
 '08701752560',
 '087018728737',
 '08702490080',
 '08702

In [11]:
# Instantiate the multinomial Naive Bayes classifier
naive_bayes = MultinomialNB()
# Train it on the vectorized training messages and their labels
naive_bayes.fit(X=X_train, y=y_train)

MultinomialNB()

In [12]:
# Evaluate the model on the held-out test split
test_accuracy = accuracy_score(y_test, naive_bayes.predict(X_test))
print(f'Accuracy : {test_accuracy:.3f}')

Accuracy : 0.987


In [13]:
# Feed a hand-written example message through the pipeline to check how the
# trained model classifies it (no training happens in this cell)
text = "You've Won! Winning an unexpected prize sounds great in theory. ..."
p_text = preprocess_text(text)
print(p_text)
# The vectorizer is called with a list of documents, so wrap the single string
p_text = bow.transform([p_text])
naive_bayes.predict(p_text)

you ve won winning an unexpected prize sounds great in theory


array(['spam'], dtype='<U4')