In [8]:
# -*- coding: utf-8 -*-
"""MBTI Classification.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/18j7FyHgOVOlgiO7ryvw29EHsxOSBV6js
"""

from datasets import load_dataset , Dataset
import numpy as np
from transformers import DataCollatorWithPadding
raw_datasets = load_dataset("Legend0300/MBTI")


In [9]:
raw_datasets

raw_datasets["train"][0]["Type"]

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Type', 'Sentence', 'labels'],
        num_rows: 461
    })
})

In [10]:
def find_unique_labels(dataset, label_column):
    """
    Find unique labels in a dataset.

    Args:
    - dataset: The dataset containing the labels.
    - label_column: The column index or name containing the labels.

    Returns:
    - unique_labels: A list of unique labels.
    """
    if isinstance(label_column, str):
        labels = dataset[label_column]
    else:
        labels = [row[label_column] for row in dataset]
    unique_labels = list(set(labels))
    return unique_labels

def create_label_mappings(unique_labels):
    """
    Create label-to-id and id-to-label mappings.

    Args:
    - unique_labels: A list of unique labels.

    Returns:
    - label2id: A dictionary mapping labels to IDs.
    - id2label: A dictionary mapping IDs to labels.
    """
    label2id = {label: idx for idx, label in enumerate(unique_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    return label2id, id2label

In [11]:
# Example usage:
# Assuming you have a dataset named 'data' and the label column is named 'labels'
# Replace 'data' and 'labels' with your actual dataset and label column name.

unique_labels = find_unique_labels(raw_datasets["train"], 'Type')
label2id, id2label = create_label_mappings(unique_labels)

# Now you can use label2id and id2label dictionaries to map labels to IDs and vice versa.

unique_labels


['INTP',
 'INTJ',
 'INFP',
 'ISFP',
 'ENFJ',
 'ESFJ',
 'INFJ',
 'ISTP',
 'ISTJ',
 'ENFP',
 'ESFP',
 'ENTP',
 'ESTJ',
 'ESTP',
 'ENTJ',
 'ISFJ']

In [12]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [13]:
inputs = tokenizer(raw_datasets["train"][0]["Sentence"])
inputs.tokens()

inputs.word_ids()

def tokenize(examples):
    tokenized_inputs = tokenizer(
        examples["Sentence"]
    )
    return tokenized_inputs

In [14]:
raw_datasets["train"].column_names

tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
)

tokenized_datasets

tokenized_datasets = tokenized_datasets.remove_columns(['Unnamed: 0', 'Type', 'Sentence'])

tokenized_datasets["train"]


Map:   0%|          | 0/461 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 461
})

In [15]:
import evaluate

metric = evaluate.load("seqeval")

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [16]:
from transformers import AutoModelForSequenceClassification
import torch.nn as nn

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=16,
    id2label=id2label,
    label2id=label2id,
)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import TrainingArguments

args = TrainingArguments(
    "MBTI-Classifier",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=15,
    weight_decay=0.01
    # push_to_hub=True,
    # push_to_hub_model_id="MBTI-Classifier"
)


In [19]:
from transformers import Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets['train'],  # Assuming you have split your dataset
    eval_dataset=tokenized_datasets['train'],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 2.7093217372894287, 'eval_accuracy': 0.27331887201735355, 'eval_runtime': 0.5922, 'eval_samples_per_second': 778.486, 'eval_steps_per_second': 97.944, 'epoch': 1.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 2.417574405670166, 'eval_accuracy': 0.3492407809110629, 'eval_runtime': 0.6363, 'eval_samples_per_second': 724.49, 'eval_steps_per_second': 91.151, 'epoch': 2.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 2.142958402633667, 'eval_accuracy': 0.4403470715835141, 'eval_runtime': 0.5967, 'eval_samples_per_second': 772.542, 'eval_steps_per_second': 97.196, 'epoch': 3.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.8911203145980835, 'eval_accuracy': 0.6095444685466378, 'eval_runtime': 0.6602, 'eval_samples_per_second': 698.243, 'eval_steps_per_second': 87.848, 'epoch': 4.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.6778485774993896, 'eval_accuracy': 0.648590021691974, 'eval_runtime': 0.6745, 'eval_samples_per_second': 683.507, 'eval_steps_per_second': 85.994, 'epoch': 5.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.4826545715332031, 'eval_accuracy': 0.7158351409978309, 'eval_runtime': 0.633, 'eval_samples_per_second': 728.272, 'eval_steps_per_second': 91.626, 'epoch': 6.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.2909785509109497, 'eval_accuracy': 0.7722342733188721, 'eval_runtime': 0.6447, 'eval_samples_per_second': 715.017, 'eval_steps_per_second': 89.959, 'epoch': 7.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.1575720310211182, 'eval_accuracy': 0.8264642082429501, 'eval_runtime': 0.6952, 'eval_samples_per_second': 663.125, 'eval_steps_per_second': 83.43, 'epoch': 8.0}
{'loss': 1.9841, 'grad_norm': 9.162494659423828, 'learning_rate': 8.505747126436782e-06, 'epoch': 8.62}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 1.0275213718414307, 'eval_accuracy': 0.8394793926247288, 'eval_runtime': 0.5937, 'eval_samples_per_second': 776.515, 'eval_steps_per_second': 97.696, 'epoch': 9.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 0.9174889922142029, 'eval_accuracy': 0.8611713665943601, 'eval_runtime': 0.6229, 'eval_samples_per_second': 740.131, 'eval_steps_per_second': 93.118, 'epoch': 10.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 0.8409842848777771, 'eval_accuracy': 0.8806941431670282, 'eval_runtime': 0.643, 'eval_samples_per_second': 716.929, 'eval_steps_per_second': 90.199, 'epoch': 11.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 0.7882339358329773, 'eval_accuracy': 0.8850325379609545, 'eval_runtime': 0.6248, 'eval_samples_per_second': 737.881, 'eval_steps_per_second': 92.835, 'epoch': 12.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 0.7408113479614258, 'eval_accuracy': 0.8980477223427332, 'eval_runtime': 0.6232, 'eval_samples_per_second': 739.745, 'eval_steps_per_second': 93.07, 'epoch': 13.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 0.7174103856086731, 'eval_accuracy': 0.9002169197396963, 'eval_runtime': 0.673, 'eval_samples_per_second': 684.988, 'eval_steps_per_second': 86.181, 'epoch': 14.0}


  0%|          | 0/58 [00:00<?, ?it/s]

{'eval_loss': 0.7091834545135498, 'eval_accuracy': 0.89587852494577, 'eval_runtime': 0.6805, 'eval_samples_per_second': 677.453, 'eval_steps_per_second': 85.233, 'epoch': 15.0}
{'train_runtime': 139.3221, 'train_samples_per_second': 49.633, 'train_steps_per_second': 6.245, 'train_loss': 1.5442709385663613, 'epoch': 15.0}


TrainOutput(global_step=870, training_loss=1.5442709385663613, metrics={'train_runtime': 139.3221, 'train_samples_per_second': 49.633, 'train_steps_per_second': 6.245, 'train_loss': 1.5442709385663613, 'epoch': 15.0})

In [26]:

# Make predictions
predictions = trainer.predict(tokenized_datasets['train'])

# `predictions` will contain the predicted label indices
# You can decode these indices to get the actual labels
predicted_label_id = np.argmax(predictions.predictions, axis=1)
predicted_label = id2label[predicted_label_id[0]]

print("Predicted MBTI Type:", predicted_label)

  0%|          | 0/58 [00:00<?, ?it/s]

Predicted MBTI Type: ISTJ


In [23]:
tokenized_datasets['train']


Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 461
})