<a href="https://colab.research.google.com/github/KazukiHirata-sun/ai_project_dev_2022/blob/main/section_2/BERT_Fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification of Japanese Sentences with BERT
[Fine-tune a pre-trained BERT model](https://towardsdatascience.com/what-exactly-happens-when-we-fine-tune-bert-f5dc32885d76) on a Japanese news dataset to classify articles by category.

## Installing the required libraries


In [None]:
# Install the libraries used by this notebook (run once per runtime).
!pip install transformers
# NOTE(review): `nlp` is not imported by any cell visible in this notebook — confirm it is still needed.
!pip install nlp
!pip install datasets
# presumably needed by BertJapaneseTokenizer for Japanese word segmentation — TODO confirm
!pip install fugashi
!pip install ipadic

## Connecting with Google Drive
Mount our Google Drive using the authorization code.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive/; the working folder used by later cells lives under it.
drive.mount("/content/drive/")

In [5]:
# Set up your own working folder.
# Keep the trailing "/": later cells build paths by plain string concatenation
# (e.g. workFolder + "text/", workFolder + "data/", workFolder + "model/").
workFolder = "/content/drive/MyDrive/bert_nlp/"

## Loading Dataset
Load a [news dataset](https://www.rondhuit.com/download.html#ldcc) stored on Google Drive.



In [None]:
import glob
import os

# Root folder of the raw corpus; every sub-directory is treated as one category.
raw_data_path = workFolder + "text/"

# Category directories. A category's label id is its position in `dirs`
# (i.e. the order returned by os.listdir), so any later label -> name lookup
# has to use the same ordering.
dir_files = os.listdir(path=raw_data_path)
dirs = [f for f in dir_files if os.path.isdir(os.path.join(raw_data_path, f))]

text_label_data = []  # rows of [article_text, label_id]
dir_count = 0
file_count = 0

# Characters removed from every article; built once instead of once per file.
strip_table = str.maketrans({"\n": "", "\t": "", "\r": "", "\u3000": ""})

for label, category in enumerate(dirs):
    files = glob.glob(raw_data_path + category + "/*.txt")
    dir_count += 1

    for file in files:
        # Skip LICENSE.txt so it is not treated as an article.
        if os.path.basename(file) == "LICENSE.txt":
            continue

        # Read explicitly as UTF-8 so decoding does not depend on the platform's default locale.
        with open(file, "r", encoding="utf-8") as f:
            # The first three lines of each file are skipped; the rest is the text.
            text = "".join(f.readlines()[3:])
        text = text.translate(strip_table)
        text_label_data.append([text, label])

        file_count += 1
        # "\r" rewrites the same output line, giving a running progress counter.
        print("\rfiles: " + str(file_count) + " dirs: " + str(dir_count), end="")

## Saving Data
Divide the data into training and test sets and save them as CSV files on Google Drive.

In [7]:
import csv
from sklearn.model_selection import train_test_split

# Split into training and test rows. A fixed random_state makes the split
# (and therefore the saved CSV files) reproducible from run to run.
news_train, news_test = train_test_split(text_label_data, shuffle=True, random_state=42)
data_path = workFolder + "data/"

# Create the output folder when it is missing; a no-op when it already exists.
os.makedirs(data_path, exist_ok=True)

# newline="" is what the csv module asks for on files handed to csv.writer, and
# UTF-8 is set explicitly so the Japanese text is encoded the same way everywhere.
with open(data_path + "news_train.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(news_train)

with open(data_path + "news_test.csv", "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerows(news_test)

## Loading Models and Tokenizers
Load a pre-trained Japanese model and its associated Tokenizer.

In [None]:
from transformers import BertForSequenceClassification, BertJapaneseTokenizer

# Name of the pre-trained Japanese BERT checkpoint shared by model and tokenizer.
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"

# Classification model with 9 output labels, moved to the GPU.
# `.cuda()` returns the module itself, so it can be chained onto the load.
sc_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=9).cuda()

# Tokenizer that belongs to the same checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)

## Loading Data Sets
Load the saved news data and tokenize it.

In [None]:
from datasets import load_dataset

def tokenize(batch):
    """Tokenize the "text" column of a batch, padding and truncating to at most 128 tokens."""
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=128)


def _load_split(csv_file):
    """Load one header-less [text, label] CSV file and return it tokenized as torch tensors."""
    # The CSV files have no header row, so the column names are supplied here.
    split = load_dataset("csv", data_files=csv_file, column_names=["text", "label"], split="train")
    # A single batch spanning the whole split, so padding=True pads every row to one common length.
    split = split.map(tokenize, batched=True, batch_size=len(split))
    # attention_mask has to be exposed together with input_ids: without it the
    # model would attend to the padding tokens added by tokenize().
    split.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return split


data_path = workFolder + "data/"

train_data = _load_split(data_path + "news_train.csv")
test_data = _load_split(data_path + "news_test.csv")

## Functions for evaluation
Use `accuracy_score` from `sklearn.metrics` to define the function that evaluates the model.

In [10]:
from sklearn.metrics import accuracy_score

def compute_metrics(result):
    """Return the accuracy of the predictions carried by `result`.

    Two attributes of `result` are read: `label_ids` (the reference labels)
    and `predictions` (scores whose arg-max over the last axis is taken as
    the predicted label). The result is a dict with the single key
    "accuracy".
    """
    predicted = result.predictions.argmax(-1)
    return {"accuracy": accuracy_score(result.label_ids, predicted)}

## Setting up a Trainer
Use the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) and [TrainingArguments](https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments) classes to set up a Trainer to train. 


In [11]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)

# Wire the model, both datasets and the metric function into one Trainer.
trainer = Trainer(
    model=sc_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

## Model training
Fine-tune the model using the settings above.

In [None]:
# Run fine-tuning with the Trainer configured above (sc_model on train_data).
trainer.train()

## Evaluating Models
The trainer's `evaluate()` method evaluates the model.

In [None]:
# Evaluate with the Trainer; its eval_dataset (test_data) and compute_metrics were set when it was built.
trainer.evaluate()

## Save Model
Saves a trained model.

In [None]:
# Folder under workFolder that receives the fine-tuned model and the tokenizer files.
model_path = workFolder + "model/"

# exist_ok=True creates the folder only when it is missing, with no separate
# check-then-create step.
os.makedirs(model_path, exist_ok=True)

# The tokenizer is saved into the same folder so both can be reloaded from model_path.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

## Loading a model
Loads a previously saved model.

In [None]:
# Restore the classifier saved in model_path and move it to the GPU
# (`.cuda()` returns the module, so the call is chained).
loaded_model = BertForSequenceClassification.from_pretrained(model_path).cuda()

# Restore the tokenizer that was saved into the same folder.
loaded_tokenizer = BertJapaneseTokenizer.from_pretrained(model_path)

## Japanese News Classification
Classify news using the loaded model.

In [None]:
import os
import torch

# Corpus root. Defined before its first use so this cell does not depend on a
# `raw_data_path` left over from an earlier cell.
raw_data_path = workFolder + "text/"

# Loading the data to be classified.
file = raw_data_path + "sports-watch/sports-watch-4764756.txt"
with open(file, "r", encoding="utf-8") as f:
    # Same preprocessing as the training data: skip the first three lines.
    sample_text = "".join(f.readlines()[3:])
sample_text = sample_text.translate(str.maketrans({"\n": "", "\t": "", "\r": "", "\u3000": ""}))

# # https://www.infoq.com/jp/articles/ai-devops-takeover/?itm_source=articles_about_ai-ml-data-eng&itm_medium=link&itm_campaign=ai-ml-data-eng
# sample_text = "開発者の多くにとって、DevOpsの次に何が来るかを予測することは、ある種の気晴らしになっています。この10年間、私たちは、私たちの業界が急速に変化するのを目の当たりにしてきました。その間には、プログラマの役割も根本から変わってきています。"

# Encode the text the same way the fine-tuning data was encoded: calling the
# tokenizer adds the special tokens ([CLS]/[SEP]) and truncates, which
# tokenize() + convert_tokens_to_ids() does not do. max_length matches the
# 128 tokens used for fine-tuning.
max_length = 128
inputs = loaded_tokenizer(sample_text, truncation=True, max_length=max_length, return_tensors="pt")
inputs = {name: tensor.cuda() for name, tensor in inputs.items()}

# Prediction (no gradients are needed for inference).
with torch.no_grad():
    y = loaded_model(**inputs)
# y[0] holds the class scores; .item() turns the single predicted index into a plain int.
pred = y[0].argmax(-1).item()

# Displaying the results
# Label ids were assigned from the position in this directory listing when the
# dataset was built, so the same listing maps the prediction back to a name.
dir_files = os.listdir(path=raw_data_path)
dirs = [f for f in dir_files if os.path.isdir(os.path.join(raw_data_path, f))]
print("結果は", dirs[pred])