### Instalar dependências

In [83]:
!pip install spacy scikit-learn nltk
!python -m spacy download pt_core_news_sm
!pip install pandas

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m27.4 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


### Importar bibliotecas

In [None]:
import os
import sys

# Add the parent of the current working directory to the import search path.
# Presumably this is the project root that holds the `api` package imported
# further down; it relies on the notebook being run from its own folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(project_root)

import pickle
import pandas as pd
from pathlib import Path

from api.scripts.emailLematizer import EmailLematizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import FunctionTransformer


### Carregar dados de exemplo


In [None]:
# Absolute path of the current working directory. The CSV is looked up here,
# so this assumes the notebook is executed from inside its own folder.
notebook_dir = Path(".").resolve()

csv_path = notebook_dir / "emails_produtivo_improdutivo.csv"
df = pd.read_csv(csv_path)

# E-mail bodies and their labels, taken from the "email" and "label" columns.
texts, labels = df["email"].values, df["label"].values

### Separar treino e teste


In [86]:
# Hold out a quarter of the examples for evaluation; the fixed seed keeps the
# split identical across runs.
TEST_FRACTION = 0.25
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

### Criar pipeline TF-IDF + Classificador 


In [87]:

# Project-local helper; its `lemmatize_text` method is applied to every raw
# document before tokenization (implementation lives in api/scripts, not shown here).
preprocessor = EmailLematizer()

# NOTE: passing a custom `preprocessor` replaces scikit-learn's built-in
# preprocessing stage (lowercasing / accent stripping), so any such
# normalization has to happen inside `lemmatize_text`.
tfidf_step = TfidfVectorizer(preprocessor=preprocessor.lemmatize_text)
classifier_step = MultinomialNB()

# TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[("tfidf", tfidf_step), ("clf", classifier_step)])

### Treinar modelo


In [88]:
# Fit the vectorizer and the classifier on the training split. The call is the
# cell's last expression, so the fitted pipeline's repr is displayed below it.
pipeline.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,<bound method...7f9122e5a490>>
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


### Avaliar modelo


In [89]:
# Predict the held-out e-mails and print per-class precision / recall / F1.
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

 improdutivo       1.00      1.00      1.00        24
   produtivo       1.00      1.00      1.00        26

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50



### Salvar modelo em pickle 


In [90]:
# Persist the fitted pipeline so the API can load it.
# The path is relative to the current working directory, i.e. it assumes the
# notebook is run from its own folder.
model_path = Path("../api/models/email_classifier_pt.pkl")

# Create the target directory if it is missing; a plain open() would raise
# FileNotFoundError on a fresh checkout without ../api/models.
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: the pickle stores a reference to `preprocessor.lemmatize_text`, a bound
# method of `EmailLematizer`, so whoever unpickles this file must be able to
# import `api.scripts.emailLematizer` under the same module path.
with model_path.open("wb") as f:
    pickle.dump(pipeline, f)

print("Modelo salvo em 'email_classifier_pt.pkl'")

Modelo salvo em 'email_classifier_pt.pkl'
