In [None]:
!pip install transformers sentencepiece
!pip install datasets
!pip install fugashi unidic_lite
!pip install demoji neologdn

from google.colab  import drive


import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
import os
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForQuestionAnswering,
    QuestionAnsweringPipeline,
    TrainingArguments,
    Trainer,
    pipeline,
    DefaultDataCollator
)
# Intended to switch off the Weights & Biases integration before any training starts.
os.environ['WANDB_DISABLED'] = 'true'

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Device string handed to the pipeline: first GPU if present, else CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# NOTE(review): `path` is not defined in the visible cells — confirm it is set
# (model name or checkpoint directory) before this cell runs.
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForQuestionAnswering.from_pretrained(path)

# Question-answering pipeline built from the model/tokenizer loaded above.
qap = pipeline(
    'question-answering',
    model=model,
    tokenizer=tokenizer,
    device=device,
)


In [None]:
def squad_json_to_dataframe(input_file_path, record_path=['data', 'paragraphs', 'qas', 'answers'],dom_sep=False):
    """Load a SQuAD-style JSON file and split it into train/test DataFrames.

    Each row of the result is one question. The paragraph ``context`` is
    repeated onto every question that belongs to it, ``is_impossible`` is set
    to True when the question has no answers, and ``answers`` is reduced to
    the first answer dict (or a placeholder with empty text and
    ``answer_start`` of -1 when there is none).

    Args:
        input_file_path: Path of the JSON file to read (decoded as UTF-8).
        record_path: Key path down to the answers level; the last element is
            dropped to reach the question records and the last two to reach
            the paragraph records. The default list is never mutated.
        dom_sep: Unused; kept so existing callers keep working.

    Returns:
        A ``(train, test)`` tuple produced by ``train_test_split`` with
        ``test_size=0.2`` and ``random_state=2023``.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(input_file_path, encoding='utf-8') as f:
        file = json.load(f)

    # One row per question, and one row per paragraph.
    m = pd.json_normalize(file, record_path[:-1])
    r = pd.json_normalize(file, record_path[:-2])

    # Repeat every paragraph's context once per question it contains so the
    # contexts line up row-for-row with the question table.
    m['context'] = np.repeat(r['context'].values, r['qas'].str.len())

    # An empty answers list marks an unanswerable question.
    m['is_impossible'] = m['answers'].apply(lambda x: not bool(x))
    m['answers'] = m['answers'].apply(lambda x: x[0] if x else {"text":"","answer_start":-1})

    train, test = train_test_split(m, test_size=0.2, random_state=2023)
    return train, test

# Convert the data into the training format using the code shown in the
# Hugging Face tutorial:
# https://huggingface.co/docs/transformers/tasks/question_answering

# The maximum length of a feature (question and context)
max_length = 500

# The authorized overlap between two part of the context when splitting
doc_stride = 0

def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Reads the module-level ``tokenizer``, ``max_length`` and ``doc_stride``.

    Args:
        examples: Batch mapping with ``'question'`` and ``'context'`` (lists
            of strings) and ``'answers'`` (list of dicts holding ``'text'``
            and ``'answer_start'``, a character offset into the context).
            The batch is not modified.

    Returns:
        The tokenizer output with ``'offset_mapping'`` removed and
        ``'start_positions'`` / ``'end_positions'`` (token indices) added.
        A feature whose answer is not fully contained in the (possibly
        truncated) context is labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples['question']]
    # Only trailing whitespace is removed: `answer_start` is a character offset
    # into the original context, so dropping leading characters would shift it.
    contexts = [c.rstrip() for c in examples['context']]

    # Overflowing features are not requested, so each example is expected to
    # produce exactly one feature (the loop below indexes answers by feature).
    inputs = tokenizer(
        text=questions,
        text_pair=contexts,
        truncation='only_second',
        max_length=max_length,
        stride=doc_stride,
        # Character offsets are required below to map the answer to tokens.
        return_offsets_mapping=True,
        padding='max_length',
    )

    offset_mapping = inputs.pop('offset_mapping')
    answers = examples['answers']
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer['answer_start']
        end_char = answer['answer_start'] + len(answer['text'])
        sequence_ids = inputs.sequence_ids(i)
        n_tokens = len(sequence_ids)

        # Find the first and last token that belong to the context (sequence id 1)
        idx = 0
        while idx < n_tokens and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < n_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If there is no context, or the answer is not fully inside it
        # (including answers cut off by truncation), label it (0, 0)
        if (
            context_end < context_start
            or offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # last context token starting at or before the answer start...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ...and first context token ending at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions

    return inputs


In [None]:
# Per-model prediction DataFrames, one entry per fine-tuned model.
res = []
from transformers import PreTrainedTokenizerFast

# The split is the same for every model (squad_json_to_dataframe uses a fixed
# random_state), so load the data and build the datasets once, outside the loop.
train,test = squad_json_to_dataframe("****")
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

# NOTE(review): `model_paths` and `predict` are not defined in the visible
# cells — confirm they are defined before this cell runs.
for model_path in model_paths:

    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    # Must stay bound to the global name `tokenizer`: prepare_train_features
    # reads it from module scope.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("fine tuning on ", model_path)

    tokenized_train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        remove_columns=train_dataset.column_names,
        num_proc=3,
    )
    tokenized_test_dataset = test_dataset.map(
        prepare_train_features,
        batched=True,
        remove_columns=test_dataset.column_names,
        num_proc=3,
    )

    data_collator = DefaultDataCollator()

    # Give every model its own output directory, derived from the last
    # component of its path/identifier, so checkpoints from different models
    # are not mixed. Assumes model_path is a string — TODO confirm.
    # Two paths sharing the same last component would still share a directory.
    run_name = os.path.basename(os.path.normpath(model_path))

    training_args = TrainingArguments(
        output_dir=f'./outputs/{run_name}/',
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        evaluation_strategy="epoch",
        save_strategy ='epoch',
        save_total_limit = 2, # Keep at most 2 checkpoints. Older ones are deleted.
        load_best_model_at_end=True,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Pipeline around the fine-tuned model; presumably read by `predict` — verify.
    pipe = pipeline('question-answering', model=trainer.model,
                    tokenizer=trainer.tokenizer,device=device)

    # Run the predictor row by row over the held-out split; it is expected to
    # return two values per row, stored as predicted answer and probability.
    pred_df = test[['question', 'context', 'answers']].copy()
    pred_df['actual_answer'] = pred_df['answers'].apply(lambda x: x['text'])
    pred_df[['pred_answer',"prob"]] = pred_df.apply(
        predict, axis=1, result_type='expand')
    pred_df = pred_df.drop('answers', axis=1)
    res.append(pred_df)


In [None]:
def preprocess(text):
    """Clean a raw text string.

    Removes URLs, e-mail addresses and hyphenated phone numbers matching the
    patterns below, replaces line breaks with a single space, strips emoji
    via ``demoji`` and normalizes the result with ``neologdn``.

    Args:
        text: The input string.

    Returns:
        The cleaned string.
    """
    # Imported locally because no visible cell imports these modules, so the
    # function would otherwise fail with NameError on a fresh kernel.
    import re

    import demoji
    import neologdn

    # URLs
    text = re.sub(r"https?://[\w/:%#\$&\?\(\)~\.=\+\-]+", "", text)
    # E-mail addresses
    text = re.sub(r"[\w\-\._]+@[\w\-\._]+\.[A-Za-z]+", "", text)
    # Hyphenated numbers starting with 0 (landline-style phone numbers)
    text = re.sub(r"0([0-9]-[0-9]{4}|[0-9]{2}-[0-9]{3}|[0-9]{3}-[0-9]{2}|[0-9]{4}-[0-9])-[0-9]{4}", "", text)
    # Hyphenated numbers starting with 070/080/090 (mobile-style phone numbers)
    text = re.sub(r"0[789]0-[0-9]{4}-[0-9]{4}", "", text)
    # Line breaks become single spaces (CRLF first so it yields one space)
    text = text.replace("\r\n", " ").replace("\n", " ")

    text = demoji.replace(string=text, repl="")
    # Per the original author's note: half-width tilde is kept as is,
    # full-width tilde is converted to half-width.
    text = neologdn.normalize(text, tilde="normalize", remove_space=False)
    return text
