In [1]:
import pandas as pd
import numpy as np

# Load your dataset
df = pd.read_csv('MBTI.csv')  # Adjust the path to your dataset file

# Function to sample rows
def sample_rows(group, max_size=1000):
    if len(group) > max_size:
        return group.sample(n=max_size, random_state=1)  # Using a random state for reproducibility
    return group

# Group by 'type' and apply sampling function
sampled_df = df.groupby('type').apply(sample_rows).reset_index(drop=True)

# Save the sampled dataframe to a new CSV file
sampled_df.to_csv('MBTI_sampled.csv', index=False)

print("Sampled data saved to 'MBTI_sampled.csv'.")


  sampled_df = df.groupby('type').apply(sample_rows).reset_index(drop=True)


Sampled data saved to 'sampled_dataset.csv'.


In [10]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Load dataset
data = pd.read_csv('MBTI.csv')

# Simple tokenization and vocabulary mapping
def tokenize(text):
    return text.split()

# Build vocabulary
vocab = {}
def build_vocab(texts):
    global vocab
    idx = 0
    for text in texts:
        tokens = tokenize(text)
        for token in tokens:
            if token not in vocab:
                vocab[token] = idx
                idx += 1

build_vocab(data['posts'])

# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, vocab):
        self.texts = [torch.tensor([vocab.get(token, 0) for token in tokenizer(text)], dtype=torch.long) for text in texts]  # use vocab.get to handle unknown tokens
        self.labels = torch.tensor(labels, dtype=torch.long)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Tokenizer and label map
tokenizer = tokenize
label_map = {label: idx for idx, label in enumerate(data['type'].unique())}
labels = [label_map[label] for label in data['type']]

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(data['posts'], labels, test_size=0.2)

train_dataset = TextDataset(train_texts, train_labels, tokenizer, vocab)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, vocab)

# Custom collate function to pad sequences
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    label_list = torch.tensor(label_list, dtype=torch.long)
    return text_list, label_list

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_batch)

# Define the neural network model
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
    
    def forward(self, text):
        embedded = self.embedding(text)
        pooled = embedded.mean(dim=1)
        return self.fc(pooled)

# Model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
model = TextClassifier(len(vocab), 100, len(label_map)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.2f}')

# Function to predict the MBTI type from a sentence
def predict_mbti(sentence):
    model.eval()
    tokens = [vocab.get(token, 0) for token in tokenize(sentence)]  # handle unknown tokens
    padded_tokens = pad_sequence([torch.tensor(tokens)], batch_first=True, padding_value=0).to(device)
    with torch.no_grad():
        output = model(padded_tokens)
        predicted_index = output.argmax(1).item()
    return list(label_map.keys())[list(label_map.values()).index(predicted_index)]

# Example usage
input_sentence = ""
mbti_type = predict_mbti(input_sentence)
print(f"The predicted MBTI type is: {mbti_type}")


Using device: cuda
Epoch 1, Loss: 1.679336964836847
Epoch 2, Loss: 0.8950034956002487
Epoch 3, Loss: 0.6381449456330878
Epoch 4, Loss: 0.5209247043038835
Epoch 5, Loss: 0.4424021894429694
Accuracy on test set: 0.83
The predicted MBTI type is: ESFP
