In [None]:
# Importar bibliotecas necessárias
import csv
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split

# Load the file using the instructor's upload method for messy files
from pathlib import Path

from google.colab import files

# Remove only stale copies of the data set left by earlier uploads. A previous
# copy would make Colab store the new upload under a different name (such as
# "SMSSpamCollection (1)"), so the lookup by exact file name in the next cell
# would fail. The former `rm -rf /content/*` achieved this too, but it also
# erased everything else in the working directory, which can include a mounted
# Google Drive folder.
for stale_copy in Path('/content').glob('SMSSpamCollection*'):
    if stale_copy.is_file():
        stale_copy.unlink()

# Upload the file; `uploaded` maps each file name to its raw bytes
uploaded = files.upload()

In [None]:
# Read the tab-separated SMSSpamCollection upload into a (target, text) frame
DATASET_NAME = 'SMSSpamCollection'
if DATASET_NAME not in uploaded:
    # Fail with an actionable message instead of a bare KeyError
    raise KeyError(
        f"Expected an uploaded file named {DATASET_NAME!r}, got {sorted(uploaded)}"
    )

# QUOTE_NONE: the file is plain "label<TAB>message" text with no CSV quoting.
# With pandas' default quoting, a double quote inside a message is treated as a
# field delimiter and can silently merge several messages into a single row.
df = pd.read_csv(
    io.BytesIO(uploaded[DATASET_NAME]),
    sep='\t',
    header=None,
    names=['target', 'text'],
    quoting=csv.QUOTE_NONE,
)

# Check that the file was parsed correctly
print(df.head())

In [None]:
# Encode the target as integers (ham -> 0, spam -> 1).
# Values that are already encoded pass through unchanged, so re-running this
# cell does not turn the whole column into NaN the way a plain dict .map() does.
label_codes = {'ham': 0, 'spam': 1}
target_encoded = df['target'].map(lambda label: label_codes.get(label, label))

# Fail loudly on any label outside the mapping (including missing values)
unexpected = target_encoded[~target_encoded.isin(list(label_codes.values()))]
if not unexpected.empty:
    raise ValueError(
        f"Unexpected labels in 'target': {sorted(map(str, unexpected.unique()))}"
    )
df['target'] = target_encoded.astype(int)

# Split into X (message texts) and y (encoded target)
X = df['text']
y = df['target']

# Report the class balance of the data set as percentages
class_counts = y.value_counts(normalize=True) * 100
print("Distribuição das classes (em %):")
print(class_counts)

# Hold out 30% for testing. The split is stratified on y so that train and test
# keep the same ham/spam proportions reported above.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the TF-IDF vocabulary and weights on the training texts only
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the test texts with the vocabulary learned from the training set
X_test_tfidf = tfidf.transform(X_test)

In [None]:
# Train the Logistic Regression model on the TF-IDF training matrix.
# The metric helpers used below come from sklearn.metrics and must be imported
# in the imports cell at the top of the notebook.
model_lr = LogisticRegression()
model_lr.fit(X_train_tfidf, y_train)

# Hard predictions, plus the predicted probability of class 1 (spam)
y_pred_lr = model_lr.predict(X_test_tfidf)
y_pred_prob_lr = model_lr.predict_proba(X_test_tfidf)[:, 1]

# Threshold-based metrics use the hard predictions; the AUCs use the probabilities
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_pred_prob_lr)
pr_auc_lr = average_precision_score(y_test, y_pred_prob_lr)

print("Logistic Regression Metrics:")
print(f"Acurácia: {accuracy_lr:.4f}")
print(f"Precisão: {precision_lr:.4f}")
print(f"Recall: {recall_lr:.4f}")
print(f"F1-Score: {f1_lr:.4f}")
print(f"AUC-ROC: {roc_auc_lr:.4f}")
print(f"AUC-PR: {pr_auc_lr:.4f}")

# Histogram of predicted probabilities in ten equal-width bins over [0, 1]
thresholds = [0.05, 0.75]
bins = np.linspace(0, 1, 11)
# Positional boolean mask: y_pred_prob_lr is a plain array ordered like y_test
spam_mask = np.asarray(y_test) == 1
counts_general, _ = np.histogram(y_pred_prob_lr, bins=bins)
counts_spam, _ = np.histogram(y_pred_prob_lr[spam_mask], bins=bins)
# max(..., 1) keeps the shares at 0 instead of NaN when a group is empty
percentages_general = counts_general / max(counts_general.sum(), 1) * 100
percentages_spam = counts_spam / max(counts_spam.sum(), 1) * 100
bar_width = 0.4
bar_positions = np.arange(len(bins) - 1)

# Side-by-side bars: share of all test messages vs. share of true spam per bin
fig_hist, ax_hist = plt.subplots(figsize=(14, 6))
ax_hist.bar(bar_positions - bar_width/2, percentages_general, width=bar_width, color='blue', label='População Geral', alpha=0.7, edgecolor='black')
ax_hist.bar(bar_positions + bar_width/2, percentages_spam, width=bar_width, color='red', label='Spam', alpha=0.7, edgecolor='black')
# NOTE(review): the bars of bin k are centred on tick k (labelled with the bin's
# lower edge) while these lines sit at threshold * 10, so the 0.75 line is drawn
# between the 70% and 80% bar groups rather than inside the 70% group — confirm
# this placement is intended.
ax_hist.axvline(thresholds[0] * (len(bins) - 1), color='green', linestyle='--', label=f'< {int(thresholds[0] * 100)}% Spam')
ax_hist.axvline(thresholds[1] * (len(bins) - 1), color='green', linestyle='--', label=f'> {int(thresholds[1] * 100)}% Spam')
ax_hist.set_title('Logistic Regression - Distribuição das Probabilidades Preditas')
ax_hist.set_xlabel('Probabilidade de ser spam')
ax_hist.set_ylabel('Percentual da População')
ax_hist.set_xticks(bar_positions)
ax_hist.set_xticklabels([f'{int(b * 100)}%' for b in bins[:-1]])
# Legend is anchored outside the axes so it does not cover the bars
ax_hist.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

# ROC and Precision-Recall curves, side by side
fig_curves, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(14, 6))

# ROC curve with the chance diagonal as a reference
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_lr)
ax_roc.plot(fpr, tpr, color='blue', label=f'Logistic Regression (AUC = {roc_auc_lr:.4f})')
ax_roc.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Curva ROC')
ax_roc.legend()

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob_lr)
ax_pr.plot(recall, precision, color='blue', label=f'Logistic Regression (AUC = {pr_auc_lr:.4f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precisão')
ax_pr.set_title('Curva Precision-Recall')
ax_pr.legend()

fig_curves.tight_layout()
plt.show()

In [None]:
# Train the Random Forest model on the TF-IDF training matrix (seeded for
# reproducibility). The metric helpers used below come from sklearn.metrics and
# must be imported in the imports cell at the top of the notebook.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train_tfidf, y_train)

# Hard predictions, plus the predicted probability of class 1 (spam)
y_pred_rf = model_rf.predict(X_test_tfidf)
y_pred_prob_rf = model_rf.predict_proba(X_test_tfidf)[:, 1]

# Threshold-based metrics use the hard predictions; the AUCs use the probabilities
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)
pr_auc_rf = average_precision_score(y_test, y_pred_prob_rf)

print("Random Forest Metrics:")
print(f"Acurácia: {accuracy_rf:.4f}")
print(f"Precisão: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"AUC-ROC: {roc_auc_rf:.4f}")
print(f"AUC-PR: {pr_auc_rf:.4f}")

# Histogram of predicted probabilities in ten equal-width bins over [0, 1]
thresholds = [0.05, 0.65]
bins = np.linspace(0, 1, 11)
# Positional boolean mask: y_pred_prob_rf is a plain array ordered like y_test
spam_mask = np.asarray(y_test) == 1
counts_general, _ = np.histogram(y_pred_prob_rf, bins=bins)
counts_spam, _ = np.histogram(y_pred_prob_rf[spam_mask], bins=bins)
# max(..., 1) keeps the shares at 0 instead of NaN when a group is empty
percentages_general = counts_general / max(counts_general.sum(), 1) * 100
percentages_spam = counts_spam / max(counts_spam.sum(), 1) * 100
bar_width = 0.4
bar_positions = np.arange(len(bins) - 1)

# Side-by-side bars: share of all test messages vs. share of true spam per bin
fig_hist, ax_hist = plt.subplots(figsize=(14, 6))
ax_hist.bar(bar_positions - bar_width/2, percentages_general, width=bar_width, color='blue', label='População Geral', alpha=0.7, edgecolor='black')
ax_hist.bar(bar_positions + bar_width/2, percentages_spam, width=bar_width, color='red', label='Spam', alpha=0.7, edgecolor='black')
# NOTE(review): the bars of bin k are centred on tick k (labelled with the bin's
# lower edge) while these lines sit at threshold * 10, so the 0.65 line is drawn
# between the 60% and 70% bar groups rather than inside the 60% group — confirm
# this placement is intended.
ax_hist.axvline(thresholds[0] * (len(bins) - 1), color='green', linestyle='--', label=f'< {int(thresholds[0] * 100)}% Spam')
ax_hist.axvline(thresholds[1] * (len(bins) - 1), color='green', linestyle='--', label=f'> {int(thresholds[1] * 100)}% Spam')
ax_hist.set_title('Random Forest - Distribuição das Probabilidades Preditas')
ax_hist.set_xlabel('Probabilidade de ser spam')
ax_hist.set_ylabel('Percentual da População')
ax_hist.set_xticks(bar_positions)
ax_hist.set_xticklabels([f'{int(b * 100)}%' for b in bins[:-1]])
# Legend is anchored outside the axes so it does not cover the bars
ax_hist.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

# ROC and Precision-Recall curves, side by side
fig_curves, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(14, 6))

# ROC curve with the chance diagonal as a reference
fpr, tpr, _ = roc_curve(y_test, y_pred_prob_rf)
ax_roc.plot(fpr, tpr, color='blue', label=f'Random Forest (AUC = {roc_auc_rf:.4f})')
ax_roc.plot([0, 1], [0, 1], color='gray', linestyle='--')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('Curva ROC')
ax_roc.legend()

# Precision-Recall curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_prob_rf)
ax_pr.plot(recall, precision, color='blue', label=f'Random Forest (AUC = {pr_auc_rf:.4f})')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precisão')
ax_pr.set_title('Curva Precision-Recall')
ax_pr.legend()

fig_curves.tight_layout()
plt.show()