In [50]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from datasets import Dataset


## Load Dataset

In [12]:
# Location of the jutsu records, relative to the notebook's working directory.
data_path = "../data/jujutsu.jsonl"
# lines=True: the file is read as JSON Lines, i.e. one JSON object per line.
df = pd.read_json(data_path, lines=True)
df.head()



Unnamed: 0,jutsu_name,jutsu_type,jutsu_description
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...
1,Amenotejikara: Chidori,"Kekkei Genkai, Ninjutsu, Dōjutsu, Shurikenjutsu",Sasuke throws some lightning infused kunai at ...
2,Amenosubaruboshinomikoto: Ryūgū,Ninjutsu,The user extends their arm outward at the targ...
3,Amplification Summoning Technique,"Ninjutsu, Space–Time Ninjutsu",This is a summoning technique that bestows a s...
4,Amenotejikara: Instant,"Kekkei Genkai, Ninjutsu, Dōjutsu, Kenjutsu, Sp...",Sasuke attacks his opponent with sword slashes...


In [13]:
def simplify_jutsu(jutsu):
    """Collapse a raw jutsu-type value into one coarse class name.

    The classes are tried in a fixed priority order -- Genjutsu, then
    Ninjutsu, then Taijutsu -- and the first one contained in ``jutsu``
    wins, so "Ninjutsu, Genjutsu" maps to "Genjutsu".

    Args:
        jutsu: Raw ``jutsu_type`` value, e.g. "Kekkei Genkai, Ninjutsu".

    Returns:
        "Genjutsu", "Ninjutsu" or "Taijutsu"; ``None`` when none of them is
        contained in ``jutsu`` or when ``jutsu`` does not support the ``in``
        test at all (e.g. a missing value such as NaN or None).
    """
    try:
        for jutsu_class in ("Genjutsu", "Ninjutsu", "Taijutsu"):
            if jutsu_class in jutsu:
                return jutsu_class
    except TypeError:
        # Missing values (NaN / None) are not containers; report "no class"
        # instead of aborting the whole ``apply``.
        return None
    return None
    






In [14]:
# Reduce every raw jutsu_type value to one coarse class via simplify_jutsu.
df["jutsu_type_simplified"] = df["jutsu_type"].apply(simplify_jutsu)


In [15]:
# Preview the first rows, now including the jutsu_type_simplified column.
df.head()


Unnamed: 0,jutsu_name,jutsu_type,jutsu_description,jutsu_type_simplified
0,10 Hit Combo,Taijutsu,Lars punches the opponent before striking them...,Taijutsu
1,Amenotejikara: Chidori,"Kekkei Genkai, Ninjutsu, Dōjutsu, Shurikenjutsu",Sasuke throws some lightning infused kunai at ...,Ninjutsu
2,Amenosubaruboshinomikoto: Ryūgū,Ninjutsu,The user extends their arm outward at the targ...,Ninjutsu
3,Amplification Summoning Technique,"Ninjutsu, Space–Time Ninjutsu",This is a summoning technique that bestows a s...,Ninjutsu
4,Amenotejikara: Instant,"Kekkei Genkai, Ninjutsu, Dōjutsu, Kenjutsu, Sp...",Sasuke attacks his opponent with sword slashes...,Ninjutsu


In [16]:
# Class balance of the simplified labels (missing values are not counted by default).
df["jutsu_type_simplified"].value_counts()


jutsu_type_simplified
Ninjutsu    2263
Taijutsu     398
Genjutsu     101
Name: count, dtype: int64

In [18]:
# Build the modelling frame in one pass: the input text is the technique name
# followed by its description, the target is the simplified jutsu type. Only
# those two columns are kept, and rows with a missing value in either are dropped.
df = (
    df.assign(
        text=df["jutsu_name"] + " " + df["jutsu_description"],
        jutsus=df["jutsu_type_simplified"],
    )[["text", "jutsus"]]
    .dropna()
)


In [19]:
# Preview the reduced two-column (text, jutsus) frame.
df.head()

Unnamed: 0,text,jutsus
0,10 Hit Combo Lars punches the opponent before ...,Taijutsu
1,Amenotejikara: Chidori Sasuke throws some ligh...,Ninjutsu
2,Amenosubaruboshinomikoto: Ryūgū The user exten...,Ninjutsu
3,Amplification Summoning Technique This is a su...,Ninjutsu
4,Amenotejikara: Instant Sasuke attacks his oppo...,Ninjutsu


In [29]:
from bs4 import BeautifulSoup
class Cleaner:
    """Strip HTML markup from a text snippet and trim surrounding whitespace."""

    def put_line_breaks(self, text):
        """Insert a newline after every closing paragraph tag.

        Args:
            text: Input string, possibly containing HTML.

        Returns:
            ``text`` with each ``</p>`` followed by a newline, so paragraphs
            stay on separate lines once the tags are removed.
        """
        # The closing tag is spelled with a forward slash; matching on a
        # backslash never hits real HTML and is an invalid escape sequence.
        return text.replace("</p>", "</p>\n")

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with all markup removed.

        Parsing is delegated to BeautifulSoup using the ``lxml`` parser.
        """
        return BeautifulSoup(text, "lxml").text

    def clean(self, text):
        """Run the full cleaning pipeline on one string.

        Steps, in order: add line breaks after paragraphs, remove HTML tags,
        strip leading and trailing whitespace.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string.
        """
        text = self.put_line_breaks(text)
        text = self.remove_html_tags(text)
        return text.strip()
        



In [30]:
# Names of the raw-text column (input to cleaning) and of the class-name
# column (input to label encoding).
text_column_name = 'text'
label_column_name = 'jutsus'

In [31]:
# Run every text through Cleaner.clean and store the result in its own column,
# leaving the original text column untouched.
cleaner = Cleaner()
df['text_cleaned'] = df[text_column_name].apply(cleaner.clean)


  clean_text = BeautifulSoup(text, "lxml").text


In [32]:
# Preview the frame with the text_cleaned column added.
df.head()


Unnamed: 0,text,jutsus,text_cleaned
0,10 Hit Combo Lars punches the opponent before ...,Taijutsu,10 Hit Combo Lars punches the opponent before ...
1,Amenotejikara: Chidori Sasuke throws some ligh...,Ninjutsu,Amenotejikara: Chidori Sasuke throws some ligh...
2,Amenosubaruboshinomikoto: Ryūgū The user exten...,Ninjutsu,Amenosubaruboshinomikoto: Ryūgū The user exten...
3,Amplification Summoning Technique This is a su...,Ninjutsu,Amplification Summoning Technique This is a su...
4,Amenotejikara: Instant Sasuke attacks his oppo...,Ninjutsu,Amenotejikara: Instant Sasuke attacks his oppo...


### Encoding the labels for jutsus column


In [35]:
# Fit a LabelEncoder on the class names so that each distinct class gets an
# integer id.
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())









In [36]:
# Mapping from integer label id to class name, in the fitted encoder's class
# order. Uses the public classes_ attribute rather than the instance __dict__.
label_dict = dict(enumerate(le.classes_.tolist()))


In [37]:
# Show the id -> class-name mapping.
label_dict


{0: 'Genjutsu', 1: 'Ninjutsu', 2: 'Taijutsu'}

In [38]:
# Encode each row's class name as its integer id using the fitted encoder.
df['label'] = le.transform(df[label_column_name].tolist())


In [39]:
# Preview the frame with the integer label column added.
df.head()


Unnamed: 0,text,jutsus,text_cleaned,label
0,10 Hit Combo Lars punches the opponent before ...,Taijutsu,10 Hit Combo Lars punches the opponent before ...,2
1,Amenotejikara: Chidori Sasuke throws some ligh...,Ninjutsu,Amenotejikara: Chidori Sasuke throws some ligh...,1
2,Amenosubaruboshinomikoto: Ryūgū The user exten...,Ninjutsu,Amenosubaruboshinomikoto: Ryūgū The user exten...,1
3,Amplification Summoning Technique This is a su...,Ninjutsu,Amplification Summoning Technique This is a su...,1
4,Amenotejikara: Instant Sasuke attacks his oppo...,Ninjutsu,Amenotejikara: Instant Sasuke attacks his oppo...,1


In [45]:
### Splitting the data into training and testing data
test_size = 0.2
# Fixed seed so the train/test partition is the same on every run.
split_seed = 42
# Stratify on the label so both splits keep the class proportions of the full frame.
df_train, df_test = train_test_split(
    df,
    test_size=test_size,
    stratify=df['label'],
    random_state=split_seed,
)







In [46]:
### Model Building
# Checkpoint identifier; passed to AutoTokenizer.from_pretrained below.
model_name = "distilbert/distilbert-base-uncased"









In [48]:
# Load the tokenizer that belongs to the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [49]:
def preprocess_function(tokenizer, examples, text_column="text_cleaned"):
    """Tokenize the text column of one batch of examples.

    Args:
        tokenizer: Callable tokenizer; invoked as
            ``tokenizer(texts, truncation=True)``.
        examples: Mapping from column name to that column's values.
        text_column: Key in ``examples`` whose values are tokenized.
            Defaults to "text_cleaned".

    Returns:
        Whatever ``tokenizer`` returns for the selected texts.

    Raises:
        KeyError: If ``text_column`` is not present in ``examples``.
    """
    # truncation=True is forwarded so the tokenizer cuts over-long inputs.
    return tokenizer(examples[text_column], truncation=True)


In [51]:
# Convert Pandas Dataframe to Hugging Face Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

# Tokenize the dataset
# batched=True hands map() a batch of rows per call; the lambda binds the
# tokenizer as preprocess_function's first argument, since map() only passes
# the batch itself.
tokenized_train = train_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)
tokenized_test = test_dataset.map(lambda examples: preprocess_function(tokenizer, examples), batched=True)
























Map: 100%|██████████| 2209/2209 [00:00<00:00, 9160.93 examples/s]
Map: 100%|██████████| 553/553 [00:00<00:00, 10029.14 examples/s]
