In [1]:
!pip install transformers
!pip install datasets
!pip install --upgrade pandas
!pip install evaluate



In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Configuration

In [3]:
import torch

In [4]:
data_path = "jutsus.jsonl" #@param {type:"string"}
text_column_name = "text" #@param {type:"string"}
label_column_name = "jutsu" #@param {type:"string"}

model_name = "distilbert-base-uncased" #@param {type:"string"}
test_size = 0.2 #@param {type:"number"}
num_labels = 3 #@param {type:"number"}

device = 'cuda' if torch.cuda.is_available() else 'cpu'

Prepare the Dataset

In [5]:
import pandas as pd

In [6]:
df = pd.read_json(data_path, lines=True)

In [7]:
df.head()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,All Weapons Above Heaven,Ninjutsu,This technique raises all the status boosts (S...
1,Air Raid Shot,Ninjutsu,"Kankurō's puppet, Karasu, soars into the air w..."
2,Akuta,"Ninjutsu, Kinjutsu, Hiden",Akuta is an Earth Release technique that's cre...
3,Air Lightning Bullet,"Taijutsu, Shurikenjutsu",The user punches the opponent twice with their...
4,Air Gold Dust Protective Wall,"Kekkei Genkai, Ninjutsu","Making use of his Gold Dust, the Fourth Kazeka..."


In [8]:
def simplify_justu(jutsu):
    if 'Genjutsu' in jutsu:
        return 'Genjutsu'
    if 'Taijutsu' in jutsu:
        return 'Taijutsu'
    if 'Ninjutsu' in jutsu:
        return 'Ninjutsu'

    return None

In [9]:
df['jutsu_type_simplified'] = df['jutsu_type'].apply(simplify_justu)

In [10]:
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    1860
Taijutsu     580
Genjutsu      93
Name: count, dtype: int64

In [11]:
df['text'] = df['jutsu_name']+'. '+df['jutsu_description']

In [12]:
df['jutsu'] = df['jutsu_type_simplified']

In [13]:
df= df[['text','jutsu']]

In [14]:

df = df.dropna()

Clean Dataset

In [15]:
from bs4 import BeautifulSoup

In [16]:
class Cleaner():
  def __init__(self):
    pass
  def put_line_breaks(self,text):
    text = text.replace('','\n')
    return text
  def remove_html_tags(self,text):
    cleantext = BeautifulSoup(text, "lxml").text
    return cleantext
  def clean(self,text):
    text = self.put_line_breaks(text)
    text = self.remove_html_tags(text)
    return text

In [17]:
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)

  cleantext = BeautifulSoup(text, "lxml").text


In [18]:
df['jutsu'].value_counts()

jutsu
Ninjutsu    1860
Taijutsu     580
Genjutsu      93
Name: count, dtype: int64

Label Encoder

In [19]:
from sklearn import preprocessing

In [20]:
# Initialize a LabelEncoder object
le = preprocessing.LabelEncoder()
# Fit the LabelEncoder to the list of labels in the specified column of the DataFrame
le.fit(df[label_column_name].tolist())
# Transform the labels in the specified column of the DataFrame using the fitted LabelEncoder
df['label'] = le.transform(df[label_column_name].tolist())

In [21]:
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,All Weapons Above Heaven. This technique raise...,Ninjutsu,A\nl\nl\n \nW\ne\na\np\no\nn\ns\n \nA\nb\no\nv...,1
1,"Air Raid Shot. Kankurō's puppet, Karasu, soars...",Ninjutsu,A\ni\nr\n \nR\na\ni\nd\n \nS\nh\no\nt\n.\n \nK...,1
2,Akuta. Akuta is an Earth Release technique tha...,Ninjutsu,A\nk\nu\nt\na\n.\n \nA\nk\nu\nt\na\n \ni\ns\n ...,1
3,Air Lightning Bullet. The user punches the opp...,Taijutsu,A\ni\nr\n \nL\ni\ng\nh\nt\nn\ni\nn\ng\n \nB\nu...,2
4,Air Gold Dust Protective Wall. Making use of h...,Ninjutsu,A\ni\nr\n \nG\no\nl\nd\n \nD\nu\ns\nt\n \nP\nr...,1


Class weights

In [22]:
from sklearn.utils.class_weight import compute_class_weight

In [23]:
# Compute class weights using the 'balanced' mode
class_weights = compute_class_weight('balanced',
                     classes=sorted(df['label'].unique().tolist()),
                     y=df['label'].tolist()).tolist()


In [24]:
class_weights

[9.078853046594983, 0.4539426523297491, 1.4557471264367816]

Train and Test Split

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Split the DataFrame into training and testing sets using train_test_split
df_train,df_test = train_test_split(df,test_size=test_size,stratify=df['label'])

In [27]:
from datasets import Dataset

In [28]:
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

Tokenizer

In [29]:
from transformers import AutoTokenizer

In [30]:
# Initialize the tokenizer using the specified model name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a preprocess function to tokenize the input text
def preprocess_function(examples):
    return tokenizer(examples["text_cleaned"], truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [31]:
tokenized_train = train_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2026 [00:00<?, ? examples/s]

In [32]:
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Create Model

In [33]:
from transformers import AutoModelForSequenceClassification

In [34]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the Model

In [35]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
import torch
from torch import nn

In [36]:
!pip install transformers[torch]



In [37]:
!pip install accelerate -U



In [38]:
# Create a data collator with padding using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [39]:
# Load the accuracy metric from the evaluation module
metric = evaluate.load("accuracy")
# Define a function to compute evaluation metrics
def compute_metrics(eval_pred):
    # Unpack the evaluation predictions
    logits, labels = eval_pred
    # Compute predictions by selecting the index wiht the highest logit value
    predictions = np.argmax(logits, axis=-1)
    # Compute the specified metric using the computed predictions and references labels
    return metric.compute(predictions=predictions, references=labels)

In [40]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels from inputs
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device=device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        # Return loss and outputs if specified
        return (loss, outputs) if return_outputs else loss

In [41]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    logging_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [42]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.1447,1.201992,0.228797
2,1.1012,1.198615,0.228797
3,1.1378,1.062501,0.228797
4,1.0892,1.131414,0.228797
5,1.1348,1.097139,0.733728


TrainOutput(global_step=1270, training_loss=1.1215345007228101, metrics={'train_runtime': 259.4356, 'train_samples_per_second': 39.046, 'train_steps_per_second': 4.895, 'total_flos': 1271058223685364.0, 'train_loss': 1.1215345007228101, 'epoch': 5.0})

In [43]:
trainer.save_model('jutsu_model')

Evaluating the model

In [44]:
from sklearn.metrics import classification_report

In [45]:
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        74
           1       0.73      1.00      0.85      1488
           2       0.00      0.00      0.00       464

    accuracy                           0.73      2026
   macro avg       0.24      0.33      0.28      2026
weighted avg       0.54      0.73      0.62      2026



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1) #preds[:3][1]
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.73      1.00      0.85       372
           2       0.00      0.00      0.00       116

    accuracy                           0.73       507
   macro avg       0.24      0.33      0.28       507
weighted avg       0.54      0.73      0.62       507



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
