In [None]:
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q  sentencepiece
!pip install -q mecab-python3
!pip install -q fugashi
!pip install -q ipadic
!apt install aptitude swig
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y


In [None]:
import pandas as pd
import torch
from tqdm import tqdm

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
def _load_labeled_sentences(csv_url, label, label_name):
    """Read one sentence CSV, tag every row with its class, and drop `title`.

    Args:
        csv_url: Location of the CSV file to read.
        label: Integer class id written to the `label` column.
        label_name: Class name written to the `label_name` column.

    Returns:
        The loaded DataFrame with `label` and `label_name` appended and the
        `title` column removed.
    """
    frame = pd.read_csv(csv_url)
    frame['label'] = label
    frame['label_name'] = label_name
    return frame.drop('title', axis=1)


# Class 1: AI-generated summaries.
ai_sentence = _load_labeled_sentences(
    'https://raw.githubusercontent.com/N-OKAMOTO1031/datasets/main/detect_AI_generated_sentences/main_verification/llm_summary.csv',
    1,
    'ai_generated_sentence',
)

# Class 0: human-written (Wikipedia) summaries.
wikipedia_sentence = _load_labeled_sentences(
    'https://raw.githubusercontent.com/N-OKAMOTO1031/datasets/main/detect_AI_generated_sentences/main_verification/wikipedia_summary.csv',
    0,
    'human_generated_sentence',
)

# Stack both classes and discard rows that contain missing values.
sentence_df = pd.concat([ai_sentence, wikipedia_sentence]).dropna()

# Plain arrays consumed by the tokenisation step.
sentences = sentence_df.sentence.values
labels = sentence_df.label.values

In [None]:
from transformers import BertJapaneseTokenizer
# Load the tokenizer that matches the pretrained Japanese BERT checkpoint.
tokenizer_checkpoint = 'cl-tohoku/bert-base-japanese-whole-word-masking'
tokenizer = BertJapaneseTokenizer.from_pretrained(tokenizer_checkpoint)

In [None]:
input_ids = []
attention_masks = []

# Encode one sentence at a time so tqdm can report progress.
for sent in tqdm(sentences):
    encoded_dict = tokenizer(
        sent,
        add_special_tokens=True,
        max_length=512,
        # Pad short sentences up to max_length (replaces the deprecated
        # `pad_to_max_length=True`).
        padding='max_length',
        # Cut sentences longer than max_length. Without truncation an
        # over-long sentence yields a wider tensor, which breaks the
        # torch.cat calls below.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Token ids for this sentence.
    input_ids.append(encoded_dict['input_ids'])

    # Attention mask for this sentence.
    attention_masks.append(encoded_dict['attention_mask'])

# Concatenate the per-sentence tensors along dim 0.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Convert the label array to a tensor.
labels = torch.tensor(labels)

In [None]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Bundle token ids, attention masks and labels into a single dataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Hold out 10% for validation and 10% for testing; the remainder is for training.
val_size = int(0.1 * len(dataset))
test_size = int(0.1 * len(dataset))
train_size = len(dataset) - val_size - test_size

# Split with a fixed-seed generator so the train/validation/test partition is
# the same on every run (an unseeded split changes each time).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_datasets = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator=split_generator,
)

print('訓練データ数：{}'.format(train_size))
print('検証データ数: {} '.format(val_size))
print('testデータ数: {} '.format(test_size))

# Batch size shared by all three loaders.
batch_size = 32

# Training loader: batches are drawn in random order.
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

# Validation loader: batches are drawn in dataset order.
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

# Test loader: batches are drawn in dataset order.
test_dataloader = DataLoader(
            test_datasets,
            sampler = SequentialSampler(test_datasets),
            batch_size = batch_size
        )

In [None]:
from transformers import BertForSequenceClassification, BertConfig
# `transformers.AdamW` was deprecated and is no longer exported by recent
# transformers releases, so importing it raises ImportError. PyTorch's
# implementation is used under the same name instead. Note its defaults differ
# from the old transformers class (weight_decay=0.01 and eps=1e-8 here, versus
# weight_decay=0.0 and eps=1e-6 there).
from torch.optim import AdamW

# Load the pretrained Japanese BERT with a two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Move the model to the selected device (a no-op when device is "cpu").
model.to(device)

In [None]:
# Optimizer. Built from PyTorch's AdamW because `transformers.AdamW` is no longer
# available in recent transformers releases. `eps` and `weight_decay` are set
# explicitly to the defaults of the old transformers class so the update rule
# stays the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6, eps=1e-6, weight_decay=0.0)

# Training step
def train(model, dataloader):
    """Run one training epoch over `dataloader` and return the summed batch loss.

    Uses the module-level `optimizer` and `device`.

    Args:
        model: Model called with `(input_ids, token_type_ids=..., attention_mask=...,
            labels=...)` whose output exposes a `.loss` attribute.
        dataloader: Iterable of batches indexed as
            `(input_ids, attention_mask, labels)`.

    Returns:
        Sum of `loss.item()` over all batches (0 if `dataloader` is empty).
    """
    model.train()
    train_loss = 0
    # Iterate the loader that was passed in; the previous version ignored the
    # argument and always read the global `train_dataloader`.
    for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        loss = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     labels=b_labels).loss
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        train_loss += loss.item()
    return train_loss

# Evaluation step
def validation(model, dataloader):
    """Compute the summed batch loss of `model` over `dataloader` without training.

    Uses the module-level `device`.

    Args:
        model: Model called with `(input_ids, token_type_ids=..., attention_mask=...,
            labels=...)` whose output exposes a `.loss` attribute.
        dataloader: Iterable of batches indexed as
            `(input_ids, attention_mask, labels)`.

    Returns:
        Sum of `loss.item()` over all batches (0 if `dataloader` is empty).
    """
    model.eval()  # switch off training mode
    batch_losses = []
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in dataloader:
            ids = batch[0].to(device)
            mask = batch[1].to(device)
            targets = batch[2].to(device)
            outputs = model(ids,
                            token_type_ids=None,
                            attention_mask=mask,
                            labels=targets)
            batch_losses.append(outputs.loss.item())
    return sum(batch_losses)

In [None]:
# Run training: one pass over the training set and one over the validation set
# per epoch, recording the summed loss of each.
max_epoch = 50
train_loss_, valid_loss_ = [], []

for epoch in tqdm(range(max_epoch)):
    print(epoch)

    train_ = train(model, train_dataloader)
    valid_ = validation(model, validation_dataloader)

    print(train_)
    train_loss_.append(train_)

    print(valid_)
    valid_loss_.append(valid_)

In [None]:
# Display the training loss recorded for each epoch (sum of batch losses returned by train()).
train_loss_

In [None]:
# Display the validation loss recorded for each epoch (sum of batch losses returned by validation()).
valid_loss_

In [None]:
import pandas as pd
import numpy as np
model.eval()  # switch off training mode

# Collect one DataFrame per batch and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop is quadratic and leaves duplicate
# index labels across batches.
batch_frames = []
for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2]
    with torch.no_grad():
        # Forward pass only; no labels are passed, so no loss is computed.
        preds = model(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask)
    # Bring the logits back to the CPU once and reuse them for both the raw
    # scores and the argmax prediction.
    logits = preds[0].cpu().numpy()
    batch_frames.append(pd.DataFrame({
        'logit_0': logits[:, 0],
        'logit_1': logits[:, 1],
        # Predicted class = index of the larger logit.
        'pred_label': np.argmax(logits, axis=1),
        'true_label': b_labels.cpu().numpy(),
    }))

result_columns = ['logit_0', 'logit_1', 'pred_label', 'true_label']
if batch_frames:
    # ignore_index gives every test row a unique index label.
    prediction_result_df = pd.concat(batch_frames, ignore_index=True)
else:
    # Empty test loader: keep the expected columns so later cells that select
    # them do not raise KeyError.
    prediction_result_df = pd.DataFrame(columns=result_columns)

In [None]:
# Show the test rows whose predicted label differs from the true label.
is_misclassified = prediction_result_df['pred_label'].ne(prediction_result_df['true_label'])
prediction_result_df.loc[is_misclassified]

In [None]:
# Save the fine-tuned model weights and configuration.
model.save_pretrained('model/model')
# Save the tokenizer files. The previous version called model.save_pretrained
# here, which wrote a second copy of the model and never saved the tokenizer.
tokenizer.save_pretrained('model/tokenizer')