In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']].rename({'v1': 'label', 'v2': 'mensagem'}, axis=1)
df.head()

Unnamed: 0,label,mensagem
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
x = TfidfVectorizer().fit_transform(df.mensagem).todense()
y = df.label.values

In [4]:
x.shape

(5572, 8672)

In [5]:
cross_val_score(X=x, y=y, cv=10, estimator=GaussianNB()).mean()

0.899319963731337

In [6]:
xt = PCA(n_components=50).fit_transform(x)

In [7]:
cross_val_score(X=xt, y=y, cv=10, estimator=GaussianNB()).mean()

0.8442234247581955

In [8]:
cross_val_score(X=xt, y=y, cv=10, estimator=MLPClassifier(hidden_layer_sizes=(200, 100, 30))).mean()

0.9815157661326778

In [9]:
xtrain, xtest, ytrain, ytest = train_test_split(xt, y)
m = confusion_matrix(y_pred=MLPClassifier(hidden_layer_sizes=(200, 100, 30)).fit(xtrain, ytrain).predict(xtest),
                     y_true=ytest)
m_df = pd.DataFrame(m, index=['Mensagem real', 'Spam real'], columns=['Predito mensagem', 'Predito spam'])
m_df

Unnamed: 0,Predito mensagem,Predito spam
Mensagem real,1207,6
Spam real,17,163


In [10]:
m_df.values.diagonal().sum() / m_df.sum().sum()

0.9834888729361091

In [11]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y)
m = confusion_matrix(y_pred=MLPClassifier(hidden_layer_sizes=(200, 100, 30)).fit(xtrain, ytrain).predict(xtest),
                     y_true=ytest)
m_df = pd.DataFrame(m, index=['Mensagem real', 'Spam real'], columns=['Predito mensagem', 'Predito spam'])
m_df

Unnamed: 0,Predito mensagem,Predito spam
Mensagem real,1183,5
Spam real,16,189


In [12]:
m_df.values.diagonal().sum() / m_df.sum().sum()

0.9849246231155779