In [3]:
import joblib as jb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

preprocessedData = jb.load("preprocessed_data.lzma")
preprocessedData = preprocessedData.head(10_000)

X = preprocessedData['processed_text']
Y = preprocessedData['personality']

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)


# Datos de entrenamiento
bagOfWordsModel = TfidfVectorizer()
X_train = bagOfWordsModel.fit_transform(X_train)
print("X_train bag of words:")
print(X_train.shape)

# Datos pruebas
X_test = bagOfWordsModel.transform(X_test)
print("X_test bag of words:")
print(X_test.shape)


svd = TruncatedSVD(n_components=2000, n_oversamples=15, random_state=42)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)

print(X_train.shape)
print(X_test.shape)

X_train bag of words:
(8000, 8769)
X_test bag of words:
(2000, 8769)
(8000, 2000)
(2000, 2000)


In [4]:
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)

X_train.shape: (8000, 2000)
y_train.shape: (8000,)


In [5]:
del preprocessedData, X, Y

In [6]:
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train_encoded, dtype=torch.long)  # Usar torch.long para clases múltiples

input_size = 32
hidden_size = 64

class MyClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(2000, 3000)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(3000, 2000)
        self.act2 = nn.ReLU()
        self.hidden3 = nn.GRU(2000, 1000, batch_first=True)  # Corrección: especificar batch_first=True para la entrada
        self.hidden4 = nn.RNN(1000, 1200, batch_first=True)  # Corrección: especificar batch_first=True para la entrada
        self.act4 = nn.ReLU()
        self.hidden5 = nn.Linear(1200, 77)
        self.act5 = nn.ReLU()
        self.output = nn.Linear(77, 16)

    def forward(self, x):
        x = self.act1(self.hidden1(x))
        x = self.act2(self.hidden2(x))
        
        # Pasar a través de la capa GRU
        x, _ = self.hidden3(x)
        
        # Pasar a través de la capa RNN
        x, _ = self.hidden4(x)
        x = self.act4(x)
        
        x = self.act5(self.hidden5(x))
        x = self.output(x)
        return x


In [11]:
import torch

if torch.cuda.is_available():
    device = torch.device("cpu")
    print("GPU disponible")
else:
    device = torch.device("cpu")
    print("Usando CPU")

X_train = X_train.to(device)
y_train = y_train.to(device)

model = MyClassifier().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Convertir y_train a torch.long si aún no lo está
y_train = y_train.type(torch.long)


GPU disponible


In [12]:
n_epochs = 100
batch_size = 10
 
for epoch in range(n_epochs):
    for i in range(0, len(X_train), batch_size):
        Xbatch = X_train[i:i+batch_size]
        ybatch = y_train[i:i+batch_size]
        optimizer.zero_grad()
        y_pred = model(Xbatch)
        loss = loss_fn(y_pred, ybatch)
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')

KeyboardInterrupt: 

In [36]:
#label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)

X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.long)

In [41]:
# compute accuracy (no_grad is optional)
from sklearn.metrics import classification_report

model.eval()
with torch.no_grad():
    y_pred_probs = model(X_test)
    _, y_pred = torch.max(y_pred_probs, 1)

y_pred = y_pred.numpy()
y_true = y_test.numpy()
 
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       0.0
           2       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00    1005.0
           8       0.00      0.00      0.00     876.0
          16       0.00      0.00      0.00     119.0

    accuracy                           0.00    2000.0
   macro avg       0.00      0.00      0.00    2000.0
weighted avg       0.00      0.00      0.00    2000.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
