In [73]:
# -*- coding: utf-8 -*-
"""MBTI Classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/18j7FyHgOVOlgiO7ryvw29EHsxOSBV6js
"""

from datasets import load_dataset , Dataset
import numpy as np
from transformers import DataCollatorWithPadding
raw_datasets = load_dataset("Legend0300/MBTI")


test_dataset = load_dataset("Legend0300/MBTItest")


Downloading readme:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 119/119 [00:00<00:00, 438B/s]


Generating train split: 0 examples [00:00, ? examples/s]

In [51]:
raw_datasets

raw_datasets["train"][0]["Type"]

test_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Type', 'Sentence', 'labels'],
        num_rows: 1
    })
})

In [64]:
def find_unique_labels(dataset, label_column):
    """
    Find unique labels in a dataset.

    Args:
    - dataset: The dataset containing the labels.
    - label_column: The column index or name containing the labels.

    Returns:
    - unique_labels: A list of unique labels.
    """
    if isinstance(label_column, str):
        labels = dataset[label_column]
    else:
        labels = [row[label_column] for row in dataset]
    unique_labels = list(set(labels))
    return unique_labels

def create_label_mappings(unique_labels):
    """
    Create label-to-id and id-to-label mappings.

    Args:
    - unique_labels: A list of unique labels.

    Returns:
    - label2id: A dictionary mapping labels to IDs.
    - id2label: A dictionary mapping IDs to labels.
    """
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    return label2id, id2label

In [74]:
# Example usage:
# Assuming you have a dataset named 'data' and the label column is named 'labels'
# Replace 'data' and 'labels' with your actual dataset and label column name.

unique_labels = find_unique_labels(raw_datasets["train"], 'Type')
label2id, id2label = create_label_mappings(unique_labels)

# Now you can use label2id and id2label dictionaries to map labels to IDs and vice versa.

unique_labels


['INTP',
 'INTJ',
 'INFP',
 'ISFP',
 'ENFJ',
 'ESFJ',
 'INFJ',
 'ISTP',
 'ISTJ',
 'ENFP',
 'ESFP',
 'ENTP',
 'ESTJ',
 'ESTP',
 'ENTJ',
 'ISFJ']

In [75]:
label2id

{'INTP': 0,
 'INTJ': 1,
 'INFP': 2,
 'ISFP': 3,
 'ENFJ': 4,
 'ESFJ': 5,
 'INFJ': 6,
 'ISTP': 7,
 'ISTJ': 8,
 'ENFP': 9,
 'ESFP': 10,
 'ENTP': 11,
 'ESTJ': 12,
 'ESTP': 13,
 'ENTJ': 14,
 'ISFJ': 15}

In [54]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [66]:
inputs = tokenizer(raw_datasets["train"][0]["Sentence"])
inputs.tokens()

inputs.word_ids()

def tokenize(examples):
    tokenized_inputs = tokenizer(
        examples["Sentence"]
    )
    return tokenized_inputs

In [76]:
raw_datasets["train"].column_names

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
)

tokenized_datasets_test = test_dataset.map(
    tokenize,
    batched=True,
)

tokenized_datasets

tokenized_datasets = tokenized_datasets.remove_columns(['Unnamed: 0', 'Type', 'Sentence'])

tokenized_datasets_test["train"]


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'Type', 'Sentence', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 1
})

In [57]:
import evaluate

metric = evaluate.load("seqeval")

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [58]:
from transformers import AutoModelForSequenceClassification
import torch.nn as nn

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=16,
    id2label=id2label,
    label2id=label2id,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [59]:
from transformers import TrainingArguments

args = TrainingArguments(
    "MBTI-Classifier",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=15,
    weight_decay=0.01
    # push_to_hub=True,
    # push_to_hub_model_id="MBTI-Classifier"
)


In [None]:
from transformers import Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],  # Assuming you have split your dataset
    eval_dataset=tokenized_datasets['train'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

In [77]:

# Make predictions
predictions = trainer.predict(tokenized_datasets_test["train"])

# `predictions` will contain the predicted label indices
# You can decode these indices to get the actual labels
predicted_label_id = np.argmax(predictions.predictions, axis=1)
predicted_label = id2label[predicted_label_id[0]]

print("Predicted MBTI Type:", predicted_label)

  0%|          | 0/1 [00:00<?, ?it/s]

Predicted MBTI Type: ENFP


In [70]:
tokenized_datasets['train']


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 461
})