<a href="https://colab.research.google.com/github/LukeBarboza/AI-Codes/blob/HandsOn_Classificacao_Email_or_Spam/Classificacao_de_email_ou_Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Importação das bibliotecas necessárias
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Returning values from sklearn
!wget https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
!wget https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2

--2025-03-03 20:02:01--  https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to spamassassin.apache.org (spamassassin.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1612216 (1.5M) [application/x-bzip2]
Saving to: ‘20030228_easy_ham.tar.bz2.1’


2025-03-03 20:02:01 (113 MB/s) - ‘20030228_easy_ham.tar.bz2.1’ saved [1612216/1612216]

--2025-03-03 20:02:01--  https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to spamassassin.apache.org (spamassassin.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1183768 (1.1M) [application/x-bzip2]
Saving to: ‘20030228_spam.tar.bz2.1’


2025-03-03 20:02:01 (100 MB/s) - ‘20030228_spam.tar.bz2.1’ saved [1183

In [None]:
!tar -xjf 20030228_easy_ham.tar.bz2
!tar -xjf 20030228_spam.tar.bz2

In [None]:
import os
import pandas as pd

# Caminhos dos diretórios extraídos
ham_dir = "/content/easy_ham"
spam_dir = "/content/spam"

# Função para ler os emails e rotulá-los
def load_emails_from_directory(directory, label):
    emails = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r', errors='ignore') as f:
            emails.append((f.read(), label))
    return emails

# Carregar os emails de "ham" e "spam"
ham_emails = load_emails_from_directory(ham_dir, 'ham')
spam_emails = load_emails_from_directory(spam_dir, 'spam')

# Combinar os dois conjuntos de dados
all_emails = ham_emails + spam_emails

# Criar um DataFrame
df = pd.DataFrame(all_emails, columns=["email", "label"])
df.head()


Unnamed: 0,email,label
0,From rpm-list-admin@freshrpms.net Mon Sep 30 ...,ham
1,From fork-admin@xent.com Tue Oct 1 16:28:55 ...,ham
2,From fork-admin@xent.com Mon Sep 9 10:46:27 ...,ham
3,From rssfeeds@jmason.org Thu Oct 3 12:25:06 ...,ham
4,From rpm-list-admin@freshrpms.net Wed Oct 9 ...,ham


In [None]:
#CUSTOM TRANSFORMER
class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # Não há nada para aprender, então apenas retornamos o próprio transformador
        return self

    def transform(self, X):
        # Aqui, aplicamos uma transformação nos dados
        # Por exemplo, multiplicar todos os valores por 2
        return X * 2

In [None]:
import re

# Função para pré-processar o texto
def preprocess_text(text):
    text = text.lower()  # Converter para minúsculas
    text = re.sub(r'\W', ' ', text)  # Remover caracteres não alfanuméricos
    text = re.sub(r'\s+', ' ', text)  # Remover múltiplos espaços
    return text

df['processed_email'] = df['email'].apply(preprocess_text)
df.head()


Unnamed: 0,email,label,processed_email
0,From rpm-list-admin@freshrpms.net Mon Sep 30 ...,ham,from rpm list admin freshrpms net mon sep 30 1...
1,From fork-admin@xent.com Tue Oct 1 16:28:55 ...,ham,from fork admin xent com tue oct 1 16 28 55 20...
2,From fork-admin@xent.com Mon Sep 9 10:46:27 ...,ham,from fork admin xent com mon sep 9 10 46 27 20...
3,From rssfeeds@jmason.org Thu Oct 3 12:25:06 ...,ham,from rssfeeds jmason org thu oct 3 12 25 06 20...
4,From rpm-list-admin@freshrpms.net Wed Oct 9 ...,ham,from rpm list admin freshrpms net wed oct 9 10...


In [None]:
from sklearn.model_selection import train_test_split

X = df['processed_email']
y = df['label']

# Dividir em 80% treinamento e 20% teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape


((2401,), (601,))

In [None]:
# Pipeline: Transformação de texto -> Modelo de classificação
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),  # Etapa de vetorização
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))  # Classificador
])

In [None]:
# Treinar o modelo usando o pipeline
pipeline.fit(X_train, y_train)

# Avaliar a precisão do modelo
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9866888519134775
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       497
        spam       0.99      0.93      0.96       104

    accuracy                           0.99       601
   macro avg       0.99      0.97      0.98       601
weighted avg       0.99      0.99      0.99       601

