# Aprendizagem de Máquina I

## Hugo Tremonte de Carvalho

#### hugo@dme.ufrj.br

"*The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam.*"

https://www.kaggle.com/uciml/sms-spam-collection-dataset

https://en.wikipedia.org/wiki/Spam_(food)

Inspirado no seguinte _notebook_: https://www.kaggle.com/code/andreshg/nlp-glove-bert-tf-idf-lstm-explained

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics
from sklearn.decomposition import PCA
import numpy as np
import re
import string
import seaborn as sns
from scipy.spatial.distance import pdist

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk import SnowballStemmer

In [None]:
sms = pd.read_csv('spam.csv', encoding='ISO-8859-1')
# sms = pd.read_csv('spam.csv', encoding='latin-1')

## Análise exploratória e limpeza da base de dados

In [None]:
sms.head()

In [None]:
sms = sms[['v1', 'v2']].rename({'v1': 'class', 'v2': 'text'}, axis = 'columns')

In [None]:
sms.head()

In [None]:
print(sms['text'][2])

In [None]:
def clean_text(text):
    """Normalize a message to lowercase ASCII letters separated by single spaces.

    The input is converted with ``str()`` and lowercased. The following spans
    are then replaced by a space, in this order: text enclosed in ``[]``,
    ``()``, ``{}`` or ``<>``; links starting with ``http://``, ``https://`` or
    ``www.``; any word containing a digit; ASCII punctuation; and finally every
    remaining character that is neither ``a``-``z`` nor whitespace. Runs of
    whitespace are collapsed into one space and the result is stripped.
    """
    substitutions = (
        # Bracketed content, delimiters included (non-greedy match).
        r'\[.*?\]|\(.*?\)|\{.*?\}|<.*?>',
        # Links.
        r'https?://\S+|www\.\S+',
        # Whole words that contain at least one digit.
        r'\w*\d\w*',
        # Standard punctuation.
        r'[%s]' % re.escape(string.punctuation),
        # Anything left that is not a-z or whitespace (accents, odd symbols).
        r'[^a-z\s]',
    )

    cleaned = str(text).lower()
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)

    # Collapse whitespace runs into a single space and trim both ends.
    return re.sub(r'\s+', ' ', cleaned).strip()

In [None]:
sms['text_clean_aux_1'] = sms['text'].apply(clean_text)

sms.head()

In [None]:
print(sms['text'][2])
print(sms['text_clean_aux_1'][2])

In [None]:
stop_words = stopwords.words('english')
stop_words

In [None]:
def remove_stopwords(text):
    """Drop every space-separated token of ``text`` found in ``stop_words``.

    ``stop_words`` is the module-level collection defined earlier in the
    notebook. The surviving tokens are joined back with single spaces.
    """
    kept_tokens = [token for token in text.split(' ') if token not in stop_words]
    return ' '.join(kept_tokens)
    
sms['text_clean_aux_2'] = sms['text_clean_aux_1'].apply(remove_stopwords)
sms.head()

In [None]:
print(sms['text'][2])
print(sms['text_clean_aux_1'][2])
print(sms['text_clean_aux_2'][2])

In [None]:
stemmer = nltk.SnowballStemmer('english')

def stemm_text(text):
    """Stem each space-separated token of ``text`` with the module-level ``stemmer``.

    The stemmed tokens are joined back with single spaces and returned.
    """
    stemmed_tokens = [stemmer.stem(token) for token in text.split(' ')]
    return ' '.join(stemmed_tokens)

sms['text_clean'] = sms['text_clean_aux_2'].apply(stemm_text)
sms.head()

In [None]:
print(sms['text'][2])
print(sms['text_clean_aux_1'][2])
print(sms['text_clean_aux_2'][2])
print(sms['text_clean'][2])

In [None]:
sms['class'].value_counts()

In [None]:
sms['class'].value_counts().plot(kind = 'bar', figsize = (5, 5))
plt.ylabel('Contagem')
plt.show()

### Quais as palavras mais comuns de cada categoria?

#### Ham:


In [None]:
# somente o texto de tais mensagens

sms[sms['class'] == 'ham']['text_clean']

In [None]:
# considere ' ' como separador para juntar as strings desejadas

' '.join(sms[sms['class'] == 'ham']['text_clean'])

In [None]:
' '.join(sms[sms['class'] == 'ham']['text_clean']).split()

In [None]:
Counter(' '.join(sms[sms['class'] == 'ham']['text_clean']).split()).most_common(50)

In [None]:
# Table with the 50 most frequent tokens across the cleaned ham messages
# (one row per token: the token itself and its number of occurrences).
count_ham = pd.DataFrame(
    Counter(
        token
        for message in sms.loc[sms['class'] == 'ham', 'text_clean']
        for token in message.split()
    ).most_common(50)
)
count_ham.head()

In [None]:
count_ham.columns = ['words_in_ham', 'count']
count_ham.head()

#### Spam:

In [None]:
# Table with the 50 most frequent tokens across the cleaned spam messages
# (one row per token: the token itself and its number of occurrences).
count_spam = pd.DataFrame(
    Counter(
        token
        for message in sms.loc[sms['class'] == 'spam', 'text_clean']
        for token in message.split()
    ).most_common(50)
)
count_spam.head()

In [None]:
count_spam.columns = ['words_in_spam', 'count']
count_spam.head()

#### Visualizando de outra forma:

In [None]:
# Bar chart of the most frequent words in ham messages.
# Tick positions are derived from the table length instead of a hard-coded 50,
# so the word labels stay aligned with the bars even if a table has fewer rows.
count_ham.plot(kind = 'bar', legend = False, figsize = (15, 5))
plt.xticks(np.arange(len(count_ham)), count_ham['words_in_ham'])
plt.title('Palavras mais frequentes em mensagens genuínas')
plt.xlabel('Palavras')
plt.ylabel('Contagem')
plt.show()

# Same chart for spam messages, drawn in orange to tell the two apart.
count_spam.plot(kind = 'bar', legend = False, color = 'orange', figsize = (15, 5))
plt.xticks(np.arange(len(count_spam)), count_spam['words_in_spam'])
plt.title('Palavras mais frequentes em mensagens de spam')
plt.xlabel('Palavras')
plt.ylabel('Contagem')
plt.show()

In [None]:
sms['text_clean']

In [None]:
X = feature_extraction.text.CountVectorizer(binary = True).fit_transform(sms['text_clean'])
np.shape(X)

In [None]:
sms['class'] = sms['class'].map({'spam': 1,'ham': 0})
sms.head()

5995 atributos criados, nesse caso, palavras presentes na SMS. Armazenadas em uma matriz esparsa para economizar memória!

Atributo $j$ (coluna) na linha $i$ é 1 se a palavra associada ao índice $j$ aparece na SMS de índice $i$ e 0 caso contrário.

In [None]:
# Conferindo os tamanhos...

print(np.shape(X))
print(np.shape(sms['class']))
print(np.shape(sms.index))

## Classificação!

### Dividir em conjunto de treinamento e teste

In [None]:
# Split the features, the labels and the row index labels of `sms` together,
# so that misclassified test messages can later be looked up and read.
# A fixed random_state makes the split — and every metric and error listing
# derived from it — reproducible from one run to the next.

X_train, X_test, y_train, y_test, idx_train, idx_test = model_selection.train_test_split(
    X, sms['class'], sms.index, test_size = 0.33, stratify = sms['class'],
    random_state = 42)

print([np.shape(X_train), np.shape(X_test)])

## Classificando

Vamos relembrar o classificador de Bayes ingênuo...

**Teorema** (Classificador de Bayes): A função $g: \mathbb{R}^p \to \mathcal{C}$ que minimiza o risco

$$R(g) = \mathbb{E}[\mathbb{I}(Y \neq g(\mathbf{X}))] = \mathbb{P}(Y \neq g(\mathbf{X}))$$

é o *classificador de Bayes*, dado por

$$g(\mathbf{x}) = \mathop{\mathrm{argmax}}_{d \in \mathcal{C}} \mathbb{P}(Y = d | \mathbf{X} = \mathbf{x})$$


- Estimar $\mathbb{P}(Y = d | \mathbf{X} = \mathbf{x})$ para cada classe $d \in \mathcal{C}$


- Considerar o classificador $$g(\mathbf{x}) = \mathop{\mathrm{argmax}}_{d \in \mathcal{C}} \widehat{\mathbb{P}}(Y = d | \mathbf{X} = \mathbf{x})$$

- $\mathbf{X} \in \mathbb{R}^{5995}$: vetor (aleatório) de atributos


- Matriz $X$, contendo 5572 realizações do vetor aleatório $\mathbf{X}$


- Na matriz $X$, temos que:

$$
X_{ij} = 
\begin{cases}
1, & \text{se a palavra de índice $j$ aparece na SMS de índice $i$} \\
0, & \text{caso contrário}
\end{cases}
$$


- $\mathbf{X}$ é um vetor aleatório **discreto**

- Variável resposta $Y \in \{0, 1\}$


- $0 \iff$ mensagem genuína
- $1 \iff$ spam

- Como $\mathbf{X}$ é discreto, temos que:


$$\mathbb{P}(Y = 1 | \mathbf{X} = \mathbf{x}) = \frac{\mathbb{P}(\mathbf{X} = \mathbf{x} | Y = 1)\mathbb{P}(Y = 1)}{\sum_{c = 0}^{1} \mathbb{P}(\mathbf{X} = \mathbf{x} | Y = c)\mathbb{P}(Y = c)} \propto \mathbb{P}(\mathbf{X} = \mathbf{x} | Y = 1)\mathbb{P}(Y = 1)$$


$$\mathbb{P}(Y = 0 | \mathbf{X} = \mathbf{x}) = \frac{\mathbb{P}(\mathbf{X} = \mathbf{x} | Y = 0)\mathbb{P}(Y = 0)}{\sum_{c = 0}^{1} \mathbb{P}(\mathbf{X} = \mathbf{x} | Y = c)\mathbb{P}(Y = c)} \propto \mathbb{P}(\mathbf{X} = \mathbf{x} | Y = 0)\mathbb{P}(Y = 0)$$

- $\mathbb{P}(Y = d)$ estimado através da proporção amostral entre as classes


- Assumir algum modelo probabilístico em $\mathbf{X}$ para estimar $\mathbb{P}(\mathbf{X} = \mathbf{x} | Y = d)$, para $d \in \{0, 1\}$

- Hipótese ingênua: *condicionado à classe, as componentes de $\mathbf{X}$ são independentes*


- Na nossa linguagem: *sabendo qual classe de SMS estamos analisando (spam ou ham), a ocorrência ou não das palavras não se influenciam mutuamente*

- Matematicamente: *para toda classe $d \in \{0, 1\}$, fatoramos* $$\mathbb{P}(\mathbf{X} = \mathbf{x} | Y = d) = \mathbb{P}(X_1 = x_1, \dots, X_p = x_p | Y = d) = \prod_{j = 1}^{p} \mathbb{P}(X_j = x_j | Y = d)$$


- $X_j | Y = d \sim \text{Bern}(p_{dj})$

- $p_{dj}$ denota a probabilidade de a classe $d$ gerar a palavra $j$, isto é, $p_{dj} = \mathbb{P}(X_j = 1 | Y = d)$

"*This event model is especially popular for classifying short texts. It has the benefit of explicitly modelling the absence of terms.*"

"*Despite the fact that the far-reaching independence assumptions are often inaccurate, the naive Bayes classifier has several properties that make it surprisingly useful in practice. In particular, the decoupling of the class conditional feature distributions means that each distribution can be independently estimated as a one-dimensional distribution. This helps alleviate problems stemming from the curse of dimensionality, such as the need for data sets that scale exponentially with the number of features.*"

Wikipedia (https://en.wikipedia.org/wiki/Naive_Bayes_classifier)

In [None]:
NB = naive_bayes.BernoulliNB()

In [None]:
NB.fit(X_train, y_train)

In [None]:
y_test_pred = NB.predict(X_test)
y_test_pred_proba = NB.predict_proba(X_test)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html
# https://developers.google.com/machine-learning/crash-course/classification/accuracy

# print(1 - metrics.accuracy_score(y_test, y_test_pred))
print(NB.score(X_test, y_test))

## Analisando as métricas

In [None]:
# https://scikit-learn.org/stable/modules/classes.html#classification-metrics
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix

CM_NB = metrics.confusion_matrix(y_test, y_test_pred)

In [None]:
CM_NB

In [None]:
metrics.ConfusionMatrixDisplay.from_estimator(NB, X_test, y_test, display_labels = NB.classes_)
plt.show()

In [None]:
metrics.RocCurveDisplay.from_estimator(NB, X_test, y_test)
plt.show()

## Analisando os erros

No caso do spam, o que é pior? Falso negativo ou falso positivo?

*   Falso negativo: Spam na caixa de entrada, deleto.
*   Falso positivo: Não-spam na caixa de spam, provavelmente jamais será lido e pode ser algo importante!

Portanto, evitar falsos positivos pode ser mais interessante!

Vendo os falsos positivos

In [None]:
FP = np.where((y_test == 0) & (y_test_pred == 1))
FP

In [None]:
# Print the predicted spam probability and the cleaned text of each false positive.
# `idx` is a position inside the test split; idx_test[idx] is the matching index
# label of `sms`, so the row is fetched by label with .loc (positional .iloc
# only gives the right row while `sms` keeps its default 0..n-1 index).
for idx in FP[0]:
    print('(Prob. de ser spam %f) %s' % (y_test_pred_proba[idx, 1], sms.loc[idx_test[idx], 'text_clean']))

Vendo os falsos negativos

In [None]:
FN = np.where((y_test == 1) & (y_test_pred == 0))
FN

In [None]:
# Print the predicted spam probability and the cleaned text of each false negative.
# `idx` is a position inside the test split; idx_test[idx] is the matching index
# label of `sms`, so the row is fetched by label with .loc (positional .iloc
# only gives the right row while `sms` keeps its default 0..n-1 index).
for idx in FN[0]:
    print('(Prob. de ser spam %f) %s' % (y_test_pred_proba[idx, 1], sms.loc[idx_test[idx], 'text_clean']))

## Distâncias dois-a-dois

In [None]:
# NOTE(review): X comes from CountVectorizer.fit_transform and is stored as a
# sparse matrix, while scipy's pdist is documented for dense array input —
# this call presumably fails as written (or would need X.toarray(), at a large
# memory cost). Confirm whether it is kept on purpose to motivate the sampled
# version, pairwise_l2_sparse_sample.
pdist(X, metric='euclidean')

In [None]:
def pairwise_l2_sparse_sample(X, sample_size, seed=None):
    """Sample rows of a sparse matrix and compute their pairwise L2 distances.

    Parameters
    ----------
    X : scipy sparse matrix
        Matrix whose rows are the observations.
    sample_size : int
        Number of distinct rows to draw (without replacement).
    seed : int, optional
        Seed forwarded to ``np.random.default_rng``. With the default ``None``
        a different sample is drawn on every call.

    Returns
    -------
    dists : numpy.ndarray
        Condensed distance vector in the same order as ``pdist``:
        (0, 1), (0, 2), ..., (1, 2), ... over the sampled rows. It has
        ``sample_size * (sample_size - 1) / 2`` entries.
    idx : numpy.ndarray
        Row indices of ``X`` that were sampled, in the order used for ``dists``.

    Raises
    ------
    ValueError
        Raised by ``Generator.choice`` when ``sample_size`` exceeds the number
        of rows of ``X``.
    """
    rng = np.random.default_rng(seed)

    # Draw distinct row indices.
    n = X.shape[0]
    idx = rng.choice(n, size=sample_size, replace=False)

    # Work in floating point so the products below cannot overflow small
    # integer dtypes.
    Xs = X[idx].tocsr().astype(float)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once
    # instead of one sparse subtraction per pair in a Python double loop.
    sq_norms = np.asarray(Xs.multiply(Xs).sum(axis=1)).ravel()
    gram = (Xs @ Xs.T).toarray()

    # Strict upper triangle, row-major: the same pair order as the condensed
    # vector returned by pdist.
    rows, cols = np.triu_indices(sample_size, k=1)
    sq_dists = sq_norms[rows] + sq_norms[cols] - 2.0 * gram[rows, cols]

    # Rounding can push a zero distance slightly below zero; clip before sqrt.
    np.maximum(sq_dists, 0.0, out=sq_dists)

    return np.sqrt(sq_dists), idx

In [None]:
d, sampled_idx = pairwise_l2_sparse_sample(X, sample_size = 500)

plt.hist(d)
plt.show()

## PCA

In [None]:
pca = PCA(n_components = 500)
X_pca = pca.fit_transform(X)

explained_variance = pca.explained_variance_ratio_

plt.figure(figsize=(8, 4))
plt.plot(np.cumsum(explained_variance)*100, marker='.')
plt.xlabel("Número de componentes")
plt.ylabel("Variância acumulada (%)")
plt.title("Variância explicada acumulada")
plt.grid(True)
plt.show()

In [None]:
df_2d = pd.DataFrame(X_pca[:, :2], columns=['PC1', 'PC2'])
df_2d['class'] = sms['class']

plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_2d.sample(2000), x='PC1', y='PC2', hue='class', palette='tab10', legend='full', alpha=0.7)
plt.title("Scatterplot 2D dos dois primeiros componentes principais (amostra 2000)")
plt.show()