In [2]:
import nltk
# Ensure nltk punkt tokenizer is downloaded (needed to split paragraphs into sentences)
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pandas as pd
import nltk
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# 1. Carregar o dataset
essays_df = pd.read_csv("C:/Users/moura/Projetos/Colab-Essays/dados/essays.csv", encoding="latin-1")

# 2. Pré-processamento

# Dividir os ensaios em sentenças
essays_df['sentences'] = essays_df['TEXT'].apply(nltk.sent_tokenize)

# Converter 'y' e 'n' para '1' e '0'
label_columns = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
binarizer = LabelBinarizer()
for col in label_columns:
    essays_df[col] = binarizer.fit_transform(essays_df[col])

# Obter embeddings das sentenças usando SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
essays_df['embeddings'] = essays_df['sentences'].apply(lambda sentences: [model.encode(sentence) for sentence in sentences])

# Agregar embeddings para representar o ensaio completo
essays_df['essay_embedding'] = essays_df['embeddings'].apply(lambda embeddings: sum(embeddings)/len(embeddings))

# 3. Dividir o dataset
X = list(essays_df['essay_embedding'])
y = essays_df[label_columns].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Construir e treinar o modelo
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 5. Validar o modelo
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_columns))


  from .autonotebook import tqdm as notebook_tqdm


Accuracy: 0.058704453441295545
              precision    recall  f1-score   support

        cEXT       0.60      0.53      0.56       272
        cNEU       0.55      0.64      0.59       232
        cAGR       0.57      0.67      0.62       264
        cCON       0.60      0.51      0.55       267
        cOPN       0.64      0.59      0.62       267

   micro avg       0.59      0.59      0.59      1302
   macro avg       0.59      0.59      0.59      1302
weighted avg       0.59      0.59      0.59      1302
 samples avg       0.57      0.58      0.54      1302



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pickle

# Após treinar o modelo
clf.fit(X_train, y_train)

# Salvar o modelo em um arquivo
with open("Essays_forest.pkl", "wb") as file:
    pickle.dump(clf, file)

