In [1]:
import pandas as pd
essays_df = pd.read_csv('C:/Users/moura/Projetos/Colab-Essays/dados/mypersonality_final.csv', encoding='latin-1')
essays_df = essays_df[['#AUTHID','STATUS', 'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']]

In [2]:
import pandas as pd
import nltk
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer


# 2. Pré-processamento

# Dividir os ensaios em sentenças
essays_df['sentences'] = essays_df['STATUS'].apply(nltk.sent_tokenize)

# Converter 'y' e 'n' para '1' e '0'
label_columns = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
binarizer = LabelBinarizer()
for col in label_columns:
    essays_df[col] = binarizer.fit_transform(essays_df[col])

# Obter embeddings das sentenças usando SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
essays_df['embeddings'] = essays_df['sentences'].apply(lambda sentences: [model.encode(sentence) for sentence in sentences])

# Agregar embeddings para representar o ensaio completo
essays_df['essay_embedding'] = essays_df['embeddings'].apply(lambda embeddings: sum(embeddings)/len(embeddings))

# 3. Dividir o dataset
X = list(essays_df['essay_embedding'])
y = essays_df[label_columns].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Construir e treinar o modelo
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 5. Validar o modelo
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_columns))


  from .autonotebook import tqdm as notebook_tqdm


Accuracy: 0.08518145161290322
              precision    recall  f1-score   support

        cEXT       0.52      0.14      0.22       853
        cNEU       0.62      0.05      0.09       729
        cAGR       0.58      0.68      0.63      1057
        cCON       0.56      0.28      0.38       931
        cOPN       0.76      1.00      0.86      1501

   micro avg       0.66      0.52      0.58      5071
   macro avg       0.61      0.43      0.43      5071
weighted avg       0.63      0.52      0.50      5071
 samples avg       0.68      0.51      0.55      5071



  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import pickle

# Após treinar o modelo
clf.fit(X_train, y_train)

# Salvar o modelo em um arquivo
with open("mypersonality_forest.pkl", "wb") as file:
    pickle.dump(clf, file)


Accuracy: 0.08518145161290322
              precision    recall  f1-score   support

        cEXT       0.52      0.14      0.22       853
        cNEU       0.62      0.05      0.09       729
        cAGR       0.58      0.68      0.63      1057
        cCON       0.56      0.28      0.38       931
        cOPN       0.76      1.00      0.86      1501

   micro avg       0.66      0.52      0.58      5071
   macro avg       0.61      0.43      0.43      5071
weighted avg       0.63      0.52      0.50      5071
 samples avg       0.68      0.51      0.55      5071

In [None]:
import pandas as pd
import nltk
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer

# 1. Carregar o dataset
essays_df = pd.read_csv("C:/Users/moura/Projetos/Colab-Essays/dados/essays.csv", encoding="latin-1")

# 2. Pré-processamento

# Dividir os ensaios em sentenças
essays_df['sentences'] = essays_df['TEXT'].apply(nltk.sent_tokenize)

# Converter 'y' e 'n' para '1' e '0'
label_columns = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']
binarizer = LabelBinarizer()
for col in label_columns:
    essays_df[col] = binarizer.fit_transform(essays_df[col])

# Obter embeddings das sentenças usando SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
essays_df['embeddings'] = essays_df['sentences'].apply(lambda sentences: [model.encode(sentence) for sentence in sentences])

# Agregar embeddings para representar o ensaio completo
essays_df['essay_embedding'] = essays_df['embeddings'].apply(lambda embeddings: sum(embeddings)/len(embeddings))

# 3. Dividir o dataset
X = list(essays_df['essay_embedding'])
y = essays_df[label_columns].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Construir, configurar e treinar o modelo com GridSearchCV
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

clf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 5. Validar o modelo
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=label_columns))
print("Melhores hiperparâmetros:", grid_search.best_params_)
