In [1]:
import pandas as pd

In [21]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset


# load dataset

In [3]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated path only worked on Windows.
data_path = "../DATA/jutsus.jsonl"

# lines=True: the file is read as JSON Lines (one JSON object per line).
df = pd.read_json(data_path, lines=True)
df.tail()

Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
2915,Agonizing Thorn Technique (Simply Put a Kanchō),"Taijutsu, Ninjutsu",After punching and kicking the opponent a few ...
2916,Afterimage Clone,"Ninjutsu, Clone Techniques","Shisui uses the Body Flicker Technique, and mo..."
2917,Afterglow,Kenjutsu,Sasuke dashes toward his opponent and quickly ...
2918,Aerobatic Strike,"Scientific Ninja Tool Techniques, Taijutsu","The user sends the opponent in the air, where ..."
2919,Adamantine Technique: Cranium Crusher,"Ninjutsu, Bukijutsu, Cooperation Ninjutsu","Hiruzen jumps up high to reach his opponent, a..."


In [4]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu type string into one coarse class.

    The raw value may list several types (e.g. "Taijutsu, Ninjutsu"). The
    first class from the priority order Ninjutsu > Genjutsu > Taijutsu that
    occurs as a substring of the value wins.

    Args:
        jutsu: Raw jutsu type value, normally a string.

    Returns:
        "Ninjutsu", "Genjutsu" or "Taijutsu", or None when the value is not
        a string (e.g. NaN for a missing type) or names none of the three.
    """
    # A missing value reaches this function as a float NaN; a substring test
    # on it would raise TypeError, so treat any non-string as "no class".
    if not isinstance(jutsu, str):
        return None

    # Order matters: earlier entries take precedence on multi-type values.
    for category in ("Ninjutsu", "Genjutsu", "Taijutsu"):
        if category in jutsu:
            return category

    return None

In [5]:
# Derive the coarse class of every row from its raw type string.
df["jutsu_type_simplified"] = df["jutsu_type"].map(simplify_jutsu)

In [6]:
# Class balance of the simplified labels. value_counts() skips missing values
# by default, so rows for which simplify_jutsu returned nothing are not counted.
df['jutsu_type_simplified'].value_counts()

jutsu_type_simplified
Ninjutsu    2269
Taijutsu     397
Genjutsu      87
Name: count, dtype: int64

In [7]:
# Build the model input ("<name>. <description>") and the target column,
# keep only those two columns, then drop every row where either is missing.
df = (
    df.assign(
        text=lambda frame: frame["jutsu_name"] + '. ' + frame["jutsu_description"],
        jutsu=lambda frame: frame["jutsu_type_simplified"],
    )
    [["text", "jutsu"]]
    .dropna()
)

In [8]:
# Preview the first rows of the reduced frame (text and jutsu columns only).
df.head()

Unnamed: 0,text,jutsu
0,Adamantine Sealing Chains: Spiral Formation. K...,Ninjutsu
1,Adamantine Power: Acala. Hashirama kicks the o...,Ninjutsu
2,Adamantine Prison Wall. After using Transforma...,Ninjutsu
3,Adamantine Seal: Monkey Yang Suppression. Afte...,Ninjutsu
4,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",Taijutsu


In [9]:
from bs4 import BeautifulSoup

class Cleaner():
    """Turn HTML-flavoured text into plain text.

    The cleaner keeps no state; `clean` is the entry point and chains the
    two helper steps below.
    """

    def put_line_breaks(self, text):
        """Append a newline after every closing paragraph tag.

        This keeps paragraphs on separate lines once the tags are stripped.
        The previous pattern used a backslash instead of a forward slash, so
        it never matched a real closing tag.

        Args:
            text: Input string, possibly containing HTML.

        Returns:
            The string with a newline inserted after each "</p>".
        """
        return text.replace("</p>", "</p>\n")

    def remove_html_tag(self, text):
        """Return the text content of `text` with the HTML markup removed.

        Parsing is delegated to BeautifulSoup with the "html.parser" backend.
        """
        return BeautifulSoup(text, "html.parser").text

    def clean(self, text):
        """Run the full cleaning pipeline on one string.

        Steps: add paragraph line breaks, strip HTML tags, then trim leading
        and trailing whitespace.

        Args:
            text: Raw input string.

        Returns:
            The cleaned plain-text string.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tag(text)
        return text.strip()

In [10]:
# Column names used by the cleaning and label-encoding cells.
text_column_name = "text"    # raw text to be cleaned
label_column_name = "jutsu"  # class name to be encoded

In [11]:
# Run every raw text through the Cleaner pipeline and keep the result in a
# separate column so the original text stays available.
cleaner = Cleaner()
df["text_cleaned"] = df[text_column_name].map(cleaner.clean)

In [12]:
# Fit a scikit-learn LabelEncoder on the label column so that each distinct
# class name is assigned an integer id.
le=preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())

In [13]:
# Map each integer id back to its class name. The id is the position of the
# name in the fitted encoder's public `classes_` array (read directly rather
# than through the instance __dict__).
label_dict = dict(enumerate(le.classes_.tolist()))
label_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [14]:
# Store the encoder's integer class id for every row in a new 'label' column.
df['label'] = le.transform(df[label_column_name].tolist())

In [15]:
# Preview the frame again, now including the text_cleaned and label columns.
df.head()

Unnamed: 0,text,jutsu,text_cleaned,label
0,Adamantine Sealing Chains: Spiral Formation. K...,Ninjutsu,Adamantine Sealing Chains: Spiral Formation. K...,1
1,Adamantine Power: Acala. Hashirama kicks the o...,Ninjutsu,Adamantine Power: Acala. Hashirama kicks the o...,1
2,Adamantine Prison Wall. After using Transforma...,Ninjutsu,Adamantine Prison Wall. After using Transforma...,1
3,Adamantine Seal: Monkey Yang Suppression. Afte...,Ninjutsu,Adamantine Seal: Monkey Yang Suppression. Afte...,1
4,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",Taijutsu,"Acrobat. The Acrobat (荒繰鷺伐刀, Akurobatto) is a ...",2


In [16]:
test_size = 0.2
# Fixed seed: without it the shuffle differs on every run, so the train/test
# membership (and any metric computed on it) cannot be reproduced.
split_seed = 42

# stratify keeps the class proportions of 'label' the same in both splits.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    random_state=split_seed,
    stratify=df['label'],
)

In [17]:
# Checkpoint identifier that is passed to AutoTokenizer.from_pretrained below.
model_name = "distilbert/distilbert-base-uncased"

In [19]:
# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer=AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [20]:
def preprocess_function(tokenizer, examples):
    """Tokenize the cleaned text of a batch of examples.

    Args:
        tokenizer: Callable that accepts the texts plus a `truncation`
            keyword argument.
        examples: Mapping that holds the texts under the 'text_cleaned' key.

    Returns:
        Whatever the tokenizer returns for those texts with truncation on.
    """
    texts = examples['text_cleaned']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [22]:
# Wrap both pandas splits as `datasets.Dataset` objects.
# NOTE(review): df_train/df_test keep a non-default index after dropna and the
# split, so from_pandas presumably carries it over as an extra index column -
# confirm whether preserve_index=False is wanted here.
train_dataset=Dataset.from_pandas(df_train)
test_dataset=Dataset.from_pandas(df_test)

In [23]:
def _tokenize_batch(examples):
    """Tokenize one batch of rows with the notebook-level tokenizer."""
    return preprocess_function(tokenizer, examples)


# Apply the same tokenization callback to both splits, in batches.
tokenizer_train = train_dataset.map(_tokenize_batch, batched=True)
tokenizer_test = test_dataset.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/2202 [00:00<?, ? examples/s]

Map:   0%|          | 0/551 [00:00<?, ? examples/s]