In [41]:
import pandas as pd

In [42]:
# Load the jutsu dataset from a JSON Lines file (one JSON record per line).

data_path = '../data/jutsus.jsonl'

dataframe = pd.read_json(data_path, lines=True)
# Preview the first rows to check which columns were loaded.
dataframe.head()

Unnamed: 0,jutsu_name,jutsu_type,justsu_desc
0,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...
1,Armageddon Countdown Clock,Ninjutsu,"After performing the hand seal, a special cloc..."
2,Apricot Blossom Droop,"Taijutsu, Ninjutsu",Kurenai attacks her opponent with a combinatio...
3,Area Scanning Technique,Ninjutsu,By touching a solid surface with their hand or...
4,Aqua Shoot,Ninjutsu,The user gathers a small orb of water and kick...


In [43]:
def simplify_jutsu_data(jutsu):
    '''Collapse a jutsu type value into a single category.

    The categories are checked in a fixed priority order (Genjutsu, then
    Ninjutsu, then Taijutsu), so a value that mentions several of them, such
    as 'Taijutsu, Ninjutsu', maps to the first one in that order.

    Args:
        jutsu: The jutsu type, normally a string such as 'Ninjutsu, Kinjutsu'.
            Any container supporting ``in`` is accepted.

    Returns:
        'Genjutsu', 'Ninjutsu' or 'Taijutsu', or None when none of them is
        present or when ``jutsu`` is not a container (e.g. None or NaN for a
        missing value).
    '''

    for category in ('Genjutsu', 'Ninjutsu', 'Taijutsu'):
        try:
            if category in jutsu:
                return category
        except TypeError:
            # Missing values (None, float NaN) do not support `in`; treat
            # them as "no category" instead of crashing the whole apply().
            return None

    # Made explicit: rows without any of the three categories yield None.
    return None

In [44]:
# Collapse each (possibly multi-valued) jutsu_type into a single category.
dataframe['jutsu_simplified'] = dataframe['jutsu_type'].apply(simplify_jutsu_data)
# Preview the new column next to the original jutsu_type values.
dataframe.head(10)

Unnamed: 0,jutsu_name,jutsu_type,justsu_desc,jutsu_simplified
0,Arm of Shukaku,Ninjutsu,Gaara engulfs himself in a sand barrier as Shu...,Ninjutsu
1,Armageddon Countdown Clock,Ninjutsu,"After performing the hand seal, a special cloc...",Ninjutsu
2,Apricot Blossom Droop,"Taijutsu, Ninjutsu",Kurenai attacks her opponent with a combinatio...,Ninjutsu
3,Area Scanning Technique,Ninjutsu,By touching a solid surface with their hand or...,Ninjutsu
4,Aqua Shoot,Ninjutsu,The user gathers a small orb of water and kick...,Ninjutsu
5,Arhat Fist,"Taijutsu, Fighting Style",This fighting style relies on Jirōbō's brute s...,Taijutsu
6,Antlion Ninja Arts: Ephemeral,"Ninjutsu, Kinjutsu",This article is about the anime-only kinjutsu ...,Ninjutsu
7,Arm Growth Technique,Ninjutsu,"From the shoulders, the user is able to create...",Ninjutsu
8,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
9,Animal Path,"Kekkei Genkai, Ninjutsu, Dōjutsu",This article is about the ability. For other ...,Ninjutsu


In [45]:
# Count rows per simplified category to check the class balance.
dataframe['jutsu_simplified'].value_counts()

jutsu_simplified
Ninjutsu    2258
Taijutsu     397
Genjutsu     101
Name: count, dtype: int64

In [46]:
# Build the model input ("<name>. <description>") and the target column, keep
# only those two columns, and drop rows where either one is missing.
# Note: 'justsu_desc' is the column name exactly as it appears in the dataset.
dataframe = (
    dataframe
    .assign(
        text=dataframe['jutsu_name'] + '. ' + dataframe['justsu_desc'],
        jutsus=dataframe['jutsu_simplified'],
    )
    .loc[:, ['text', 'jutsus']]
    .dropna()
)

In [47]:
# Preview the first rows of the reduced (text, jutsus) frame.
dataframe.head()

Unnamed: 0,text,jutsus
0,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu
1,Armageddon Countdown Clock. After performing t...,Ninjutsu
2,Apricot Blossom Droop. Kurenai attacks her opp...,Ninjutsu
3,Area Scanning Technique. By touching a solid s...,Ninjutsu
4,Aqua Shoot. The user gathers a small orb of wa...,Ninjutsu


In [48]:
from bs4 import BeautifulSoup


class DataCleaner():
    '''Turn HTML-flavoured text into plain, whitespace-trimmed text.'''

    def __init__(self) -> None:
        '''The cleaner keeps no state, so there is nothing to initialise.'''
        pass


    def insert_line_breaks(self, text):
        '''Return ``text`` with a newline added after every closing ``</p>`` tag.

        This keeps paragraphs on separate lines once the tags themselves are
        stripped. The pattern is the real closing tag (forward slash); a
        backslash variant would never match HTML and is an invalid string
        escape in Python.
        '''
        return text.replace('</p>', '</p>\n')


    def remove_html_tags(self, text):
        '''Return the text content of ``text`` with the markup removed.

        Parsing is delegated to BeautifulSoup using the ``lxml`` parser.
        '''
        return BeautifulSoup(text, 'lxml').text


    def clean(self, text):
        '''Run the full pipeline on ``text`` and return the cleaned string.

        Steps, in order: add line breaks after paragraphs, strip HTML tags,
        then trim leading and trailing whitespace.
        '''
        text = self.insert_line_breaks(text=text)
        text = self.remove_html_tags(text=text)
        text = text.strip()
        return text

In [49]:
# Column holding the raw model input text.
text_column_name = 'text'
# Column holding the simplified jutsu category used as the target.
label_column_name = 'jutsus'

cleaner = DataCleaner()

# Run every text through the cleaner and store the result in a separate
# column, leaving the original text column unchanged.
dataframe['processed_text'] = dataframe[text_column_name].apply(cleaner.clean)

  return BeautifulSoup(text, 'lxml').text


In [50]:
# Display the frame to compare the raw text with the cleaned processed_text column.
dataframe

Unnamed: 0,text,jutsus,processed_text
0,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu,Arm of Shukaku. Gaara engulfs himself in a san...
1,Armageddon Countdown Clock. After performing t...,Ninjutsu,Armageddon Countdown Clock. After performing t...
2,Apricot Blossom Droop. Kurenai attacks her opp...,Ninjutsu,Apricot Blossom Droop. Kurenai attacks her opp...
3,Area Scanning Technique. By touching a solid s...,Ninjutsu,Area Scanning Technique. By touching a solid s...
4,Aqua Shoot. The user gathers a small orb of wa...,Ninjutsu,Aqua Shoot. The user gathers a small orb of wa...
...,...,...,...
2919,Absorption Sphere. Using the Jutsu Absorption ...,Ninjutsu,Absorption Sphere. Using the Jutsu Absorption ...
2920,Absolute: Fang Passing Fang. Kiba and Akamaru ...,Taijutsu,Absolute: Fang Passing Fang. Kiba and Akamaru ...
2921,100% Single Punch. Tsunade gathers large amoun...,Taijutsu,100% Single Punch. Tsunade gathers large amoun...
2922,100 Metre Punch. A shorter version of the 1000...,Taijutsu,100 Metre Punch. A shorter version of the 1000...


In [51]:
from sklearn import preprocessing

# Fit a label encoder on the class names so that each class gets an integer id
# as part of model preprocessing.

lbl_encoder = preprocessing.LabelEncoder()
lbl_encoder.fit(dataframe[label_column_name].tolist())

In [52]:
# Map each integer label id to its class name. `classes_` is the fitted
# encoder's public attribute, so there is no need to reach into `__dict__`.
lbl_dict = dict(enumerate(lbl_encoder.classes_.tolist()))
lbl_dict

{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [53]:
# Encode the class names as integer ids using the fitted encoder.
dataframe['label'] = lbl_encoder.transform(dataframe[label_column_name].tolist())

In [54]:
# Preview the frame with the new integer label column.
dataframe.head()

Unnamed: 0,text,jutsus,processed_text,label
0,Arm of Shukaku. Gaara engulfs himself in a san...,Ninjutsu,Arm of Shukaku. Gaara engulfs himself in a san...,1
1,Armageddon Countdown Clock. After performing t...,Ninjutsu,Armageddon Countdown Clock. After performing t...,1
2,Apricot Blossom Droop. Kurenai attacks her opp...,Ninjutsu,Apricot Blossom Droop. Kurenai attacks her opp...,1
3,Area Scanning Technique. By touching a solid s...,Ninjutsu,Area Scanning Technique. By touching a solid s...,1
4,Aqua Shoot. The user gathers a small orb of wa...,Ninjutsu,Aqua Shoot. The user gathers a small orb of wa...,1


In [55]:
from sklearn.model_selection import train_test_split

# Fraction of rows held out for evaluation.
test_size = 0.2
# Fixed seed so the train/test partition is identical on every run.
random_state = 42

# Stratify on the encoded label so both splits keep the class proportions.
dataframe_train, dataframe_test = train_test_split(
    dataframe,
    test_size=test_size,
    stratify=dataframe['label'],
    random_state=random_state,
)

In [56]:
# Check the class distribution in the training split.
dataframe_train['jutsus'].value_counts()

jutsus
Ninjutsu    1806
Taijutsu     317
Genjutsu      81
Name: count, dtype: int64

In [57]:
from transformers import AutoTokenizer

# Identifier of the pretrained model whose tokenizer is loaded below.
model_name = 'distilbert/distilbert-base-uncased'

# Load the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [58]:
def process_function(tokenizer, examples):
    '''Tokenize the ``processed_text`` entry of ``examples``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True)``.
        examples: Mapping that holds the text(s) under ``'processed_text'``.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    '''
    texts = examples['processed_text']
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [59]:
from datasets import Dataset

# Convert the pandas train/test frames into Hugging Face datasets so they can
# be tokenized with `Dataset.map`.

training_dataset = Dataset.from_pandas(dataframe_train)
testing_dataset = Dataset.from_pandas(dataframe_test)


def tokenize_batch(examples):
    '''Tokenize one batch of examples with the notebook's tokenizer.'''
    return process_function(tokenizer=tokenizer, examples=examples)


# One shared callable for both splits instead of two copy-pasted lambdas.
# batched=True passes `tokenize_batch` many rows per call.
tokenized_training_set = training_dataset.map(tokenize_batch, batched=True)
tokenized_testing_set = testing_dataset.map(tokenize_batch, batched=True)

Map: 100%|██████████| 2204/2204 [00:00<00:00, 8103.08 examples/s]
Map: 100%|██████████| 552/552 [00:00<00:00, 8118.06 examples/s]
