# Classificador de de possíveis golpes

In [19]:
import pandas as pd
import string
import re
import spacy
from unidecode import unidecode

### Função para pré-processamento:
- Transformam todos os caracteres em minusculas;
- Remove as acentuações
- Remove as pontuações
- Remove números

In [20]:
def cleaner(texto):
        texto = unidecode(texto)
        texto = texto.lower()
        texto = texto.translate(str.maketrans('','', string.punctuation))
        texto = re.sub(r'[0-9]+', '', texto)
        return texto

In [21]:
df = pd.read_csv('./spam.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Preparando a tabela para o treinamento:
- Com a função cleaner limpamos a tabela

In [22]:
df['v2'] = df['v2'].apply(cleaner)
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,go until jurong point crazy available only in ...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry in a wkly comp to win fa cup final...,,,
3,ham,u dun say so early hor u c already then say,,,
4,ham,nah i dont think he goes to usf he lives aroun...,,,


### Remoção das Stop Words
Stop Words são palavras que não agregam nenhum sentido na frase, como artigos e preposições

In [23]:
nlp = spacy.load("en_core_web_sm")
words = nlp.Defaults.stop_words

def stopWords(text):
    text = [word for word in text.split() if word not in list(words)]
    return ' '.join(text)
print(words)
df['v2'] = df['v2'].apply(stopWords)
df.head() 

{'wherein', 'six', 'side', 'hundred', 'how', 'herein', 'just', 'whatever', 'whither', 'seeming', 'though', 'via', 'were', 'off', 'same', 'on', 'but', 'that', 'two', 'nine', 'nothing', 'due', 'throughout', 'hence', 'at', 'than', "'s", 'hers', 'if', 'have', 'along', 'very', 'with', 'should', 'until', 'been', 'forty', 'part', '’d', 'about', 'any', 'again', 'you', 'becomes', 'has', 'and', 'whose', 'before', 'whereas', 'mine', 'me', 'quite', 'not', 'am', 'themselves', 'something', 'she', 'whoever', 'name', 'or', 'nobody', 'still', 'everywhere', 'full', '‘d', 'done', 'across', 'they', 'sixty', "n't", 'keep', 'for', '‘ll', 'put', "'ve", 'we', 'them', 'fifteen', 'whereby', 'first', 'namely', 'myself', 'further', 'yourselves', 'anyhow', 'an', 'what', 'except', 'thereupon', 'always', 'everything', 'amount', 'hereafter', '‘re', 'hereupon', 'whereafter', 'his', 'meanwhile', 'himself', 'however', 'all', 'less', 'regarding', 'when', 'sometimes', 'who', "'re", 'behind', 'besides', 'used', 'cannot', '

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,jurong point crazy available bugis n great wor...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry wkly comp win fa cup final tkts st ...,,,
3,ham,u dun early hor u c,,,
4,ham,nah dont think goes usf lives,,,


### Algoritmo de treinamento
Primeiro temos que fazer uma vetorização das palavras, por que o computador não entende a linguagem natural

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection, svm, naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report



In [25]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['v2'], df['v1'], test_size=0.3)

Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.fit_transform(test_y)

### Contagem das palavras

In [26]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['v2'])

train_x_Tdfidf = Tfidf_vect.transform(train_x)
test_x_Tdfidf = Tfidf_vect.transform(test_x)
print(Tfidf_vect.vocabulary_)



In [27]:
target_names = {
    'ham', 'spam'
}

### Naive Bayes

In [28]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_x_Tdfidf, train_y)

predictions_NB = Naive.predict(test_x_Tdfidf)
print("Naive Bayes Accuracy Score -> ", accuracy_score(predictions_NB, test_y)*100)

Naive Bayes Accuracy Score ->  97.66746411483254


### Support Vector Machine (SVM)

In [29]:
SVM = svm.SVC(kernel='linear')
SVM.fit(train_x_Tdfidf, train_y)

predictions_SVM = SVM.predict(test_x_Tdfidf)
print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, test_y)*100)

SVM Accuracy Score ->  97.48803827751196
