In [1]:
!pip install datasets
!pip install accelerate -U
!pip install tensorflow
!pip install transformers
!pip install gdown
!pip install nvidia-cublas-cu12
!pip install torchvision
!pip install torchaudio

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, AdamW, get_scheduler
import torch
import pandas as pd
from datasets import Dataset
import tensorflow as tf
from torch.utils.data import DataLoader
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import gdown
import os

In [3]:
device = torch.device("cuda")

# **Dataset Loading**




In [4]:
df = pd.read_csv('C:\\Users\\Lenovo\\Desktop\\dataset.csv')
df.head()
print(df.shape)

(95, 2)


In [5]:
df['encoded_class'] = df['class'].astype('category').cat.codes

In [6]:
class_counts = df['encoded_class'].value_counts()
# Identify classes with only one sample
single_sample_classes = class_counts[class_counts == 1].index

# Filter out texts belonging to those classes
df = df[~df['encoded_class'].isin(single_sample_classes)]

In [7]:
df['encoded_class'] = df['class'].astype('category').cat.codes
data_texts = df['text'].to_list()
data_labels = df['encoded_class'].to_list()

In [8]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data_texts, data_labels, test_size=0.2, random_state=42, stratify=data_labels
)

In [9]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy}

def preprocess_data(texts, labels, tokenizer):
    encodings = tokenizer(texts, truncation=True, padding=True)
    dataset = Dataset.from_dict({**encodings, 'labels': labels})
    return dataset

# **Model Training and Saving**

In [11]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['encoded_class'].unique()))
model.to(device)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
fold = 0
best_loss = float('inf')
accuracy_threshold = 0.8
best_model_dir = None
for train_index, val_index in kf.split(train_texts):
    fold += 1
    print(f"Training fold {fold}...")
    print((train_index, val_index))

    train_texts_fold = [train_texts[i] for i in train_index]
    train_labels_fold = [train_labels[i] for i in train_index]
    val_texts_fold = [train_texts[i] for i in val_index]
    val_labels_fold = [train_labels[i] for i in val_index]

    train_dataset = preprocess_data(train_texts_fold, train_labels_fold, tokenizer)
    val_dataset = preprocess_data(val_texts_fold, val_labels_fold, tokenizer)

    training_args = TrainingArguments(
        output_dir=f'./results/fold_{fold}',
        num_train_epochs=30,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        warmup_steps=10,
        weight_decay=0.01,
        logging_dir=f'./logs/fold_{fold}',
        evaluation_strategy="steps",
        eval_steps=5,
        save_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        learning_rate=2e-5,
    )

    optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)

    num_training_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs
    scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=num_training_steps
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        optimizers=(optimizer, scheduler),
        compute_metrics=compute_metrics
    )

    trainer.train()

    eval_metrics = trainer.evaluate()
    print(f"Fold {fold} evaluation metrics: {eval_metrics}")

    trainer.save_model(f'./results/best_model_fold_{fold}')
    tokenizer.save_pretrained(f'./results/best_model_fold_{fold}')

    if eval_metrics["eval_accuracy"] >= accuracy_threshold:
        if eval_metrics["eval_loss"] < best_loss:
            best_loss = eval_metrics["eval_loss"]
            best_model_dir = f'./results/best_model_fold_{fold}'

if best_model_dir:
    model = DistilBertForSequenceClassification.from_pretrained(best_model_dir)
    model.save_pretrained('./results/best_model')
    tokenizer.save_pretrained('./results/best_model')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training fold 1...
(array([ 1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 16, 17, 19, 20,
       21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
       57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71]), array([ 0,  4, 10, 18, 28, 34, 63, 72]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,2.46121,0.0
10,No log,2.462306,0.125
15,No log,2.450592,0.125
20,No log,2.431507,0.125
25,No log,2.420792,0.125
30,No log,2.411692,0.125
35,No log,2.416142,0.125
40,No log,2.388106,0.125
45,No log,2.343949,0.125
50,No log,2.323193,0.125


Fold 1 evaluation metrics: {'eval_loss': 0.5770097970962524, 'eval_accuracy': 0.875, 'eval_runtime': 0.6774, 'eval_samples_per_second': 11.81, 'eval_steps_per_second': 5.905, 'epoch': 30.0}
Training fold 2...
(array([ 0,  1,  2,  3,  4,  6,  7,  8, 10, 11, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57,
       58, 59, 60, 61, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72]), array([ 5,  9, 12, 22, 31, 45, 55, 65]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,0.274195,0.875
10,No log,0.275831,0.875
15,No log,0.316763,0.875
20,No log,0.332463,0.875
25,No log,0.355489,0.875
30,No log,0.57164,0.625
35,No log,0.41502,0.875
40,No log,0.301809,0.875
45,No log,0.268264,0.875
50,No log,0.287879,0.875


Fold 2 evaluation metrics: {'eval_loss': 0.037419840693473816, 'eval_accuracy': 1.0, 'eval_runtime': 0.5319, 'eval_samples_per_second': 15.041, 'eval_steps_per_second': 7.52, 'epoch': 30.0}
Training fold 3...
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34, 36,
       37, 38, 40, 41, 42, 43, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56,
       58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 71, 72]), array([16, 33, 35, 39, 44, 50, 57, 70]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,0.009552,1.0
10,No log,0.00978,1.0
15,No log,0.009933,1.0
20,No log,0.010474,1.0
25,No log,0.010769,1.0
30,No log,0.009905,1.0
35,No log,0.010309,1.0
40,No log,0.010367,1.0
45,No log,0.009678,1.0
50,No log,0.010146,1.0


Fold 3 evaluation metrics: {'eval_loss': 0.0013402083422988653, 'eval_accuracy': 1.0, 'eval_runtime': 0.5937, 'eval_samples_per_second': 13.474, 'eval_steps_per_second': 6.737, 'epoch': 30.0}
Training fold 4...
(array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 43, 44, 45, 46, 48, 50, 51, 52, 53, 54, 55,
       57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72]), array([ 7, 30, 42, 47, 49, 56, 61]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,0.000346,1.0
10,No log,0.000342,1.0
15,No log,0.000342,1.0
20,No log,0.000342,1.0
25,No log,0.000326,1.0
30,No log,0.000325,1.0
35,No log,0.000317,1.0
40,No log,0.000308,1.0
45,No log,0.000299,1.0
50,No log,0.000284,1.0


Fold 4 evaluation metrics: {'eval_loss': 5.434010017779656e-05, 'eval_accuracy': 1.0, 'eval_runtime': 0.4453, 'eval_samples_per_second': 15.721, 'eval_steps_per_second': 8.983, 'epoch': 30.0}
Training fold 5...
(array([ 0,  1,  2,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 16, 17, 18,
       20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
       38, 39, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56,
       57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72]), array([ 3, 13, 19, 25, 40, 53, 59]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,0.0001,1.0
10,No log,9.8e-05,1.0
15,No log,9.6e-05,1.0
20,No log,9.4e-05,1.0
25,No log,9.3e-05,1.0
30,No log,9.2e-05,1.0
35,No log,9.1e-05,1.0
40,No log,8.9e-05,1.0
45,No log,8.8e-05,1.0
50,No log,8.7e-05,1.0


Fold 5 evaluation metrics: {'eval_loss': 2.011178912653122e-05, 'eval_accuracy': 1.0, 'eval_runtime': 0.4424, 'eval_samples_per_second': 15.823, 'eval_steps_per_second': 9.042, 'epoch': 30.0}
Training fold 6...
(array([ 0,  1,  2,  3,  4,  5,  7,  9, 10, 11, 12, 13, 14, 15, 16, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37,
       39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
       56, 57, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 71, 72]), array([ 6,  8, 17, 36, 38, 58, 67]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,9e-06,1.0
10,No log,9e-06,1.0
15,No log,9e-06,1.0
20,No log,9e-06,1.0
25,No log,9e-06,1.0
30,No log,9e-06,1.0
35,No log,9e-06,1.0
40,No log,9e-06,1.0
45,No log,9e-06,1.0
50,No log,9e-06,1.0


Fold 6 evaluation metrics: {'eval_loss': 3.9509241105406545e-06, 'eval_accuracy': 1.0, 'eval_runtime': 0.4799, 'eval_samples_per_second': 14.585, 'eval_steps_per_second': 8.334, 'epoch': 30.0}
Training fold 7...
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 55, 56,
       57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72]), array([15, 26, 27, 41, 46, 54, 71]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,2e-06,1.0
10,No log,2e-06,1.0
15,No log,2e-06,1.0
20,No log,2e-06,1.0
25,No log,2e-06,1.0
30,No log,2e-06,1.0
35,No log,2e-06,1.0
40,No log,2e-06,1.0
45,No log,2e-06,1.0
50,No log,2e-06,1.0


Fold 7 evaluation metrics: {'eval_loss': 1.1750622661566013e-06, 'eval_accuracy': 1.0, 'eval_runtime': 0.456, 'eval_samples_per_second': 15.352, 'eval_steps_per_second': 8.773, 'epoch': 30.0}
Training fold 8...
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54,
       55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 67, 68, 70, 71, 72]), array([11, 24, 32, 48, 62, 66, 69]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,1e-06,1.0
10,No log,1e-06,1.0
15,No log,1e-06,1.0
20,No log,1e-06,1.0
25,No log,1e-06,1.0
30,No log,1e-06,1.0
35,No log,1e-06,1.0
40,No log,1e-06,1.0
45,No log,1e-06,1.0
50,No log,1e-06,1.0


Fold 8 evaluation metrics: {'eval_loss': 6.301060011537629e-07, 'eval_accuracy': 1.0, 'eval_runtime': 0.4505, 'eval_samples_per_second': 15.539, 'eval_steps_per_second': 8.879, 'epoch': 30.0}
Training fold 9...
(array([ 0,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35,
       36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55,
       56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 69, 70, 71, 72]), array([ 1, 29, 37, 43, 52, 64, 68]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,2e-06,1.0
10,No log,2e-06,1.0
15,No log,2e-06,1.0
20,No log,2e-06,1.0
25,No log,2e-06,1.0
30,No log,2e-06,1.0
35,No log,2e-06,1.0
40,No log,2e-06,1.0
45,No log,2e-06,1.0
50,No log,2e-06,1.0


Fold 9 evaluation metrics: {'eval_loss': 1.3794199276162544e-06, 'eval_accuracy': 1.0, 'eval_runtime': 0.5939, 'eval_samples_per_second': 11.786, 'eval_steps_per_second': 6.735, 'epoch': 30.0}
Training fold 10...
(array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 15, 16, 17, 18,
       19, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56,
       57, 58, 59, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72]), array([ 2, 14, 20, 21, 23, 51, 60]))




Step,Training Loss,Validation Loss,Accuracy
5,No log,1e-06,1.0
10,No log,1e-06,1.0
15,No log,1e-06,1.0
20,No log,1e-06,1.0
25,No log,1e-06,1.0
30,No log,1e-06,1.0
35,No log,1e-06,1.0
40,No log,1e-06,1.0
45,No log,1e-06,1.0
50,No log,1e-06,1.0


Fold 10 evaluation metrics: {'eval_loss': 9.196136261380161e-07, 'eval_accuracy': 1.0, 'eval_runtime': 0.4527, 'eval_samples_per_second': 15.463, 'eval_steps_per_second': 8.836, 'epoch': 30.0}


# **Model Loading**

In [15]:
best_model_dir = './best_model'
tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(best_model_dir)
model_fine_tuned = DistilBertForSequenceClassification.from_pretrained(best_model_dir)
#model_fine_tuned.to(device)

In [None]:
test = df['text'][20]     #index: 0 -> 92
print(test)

predict_input = tokenizer_fine_tuned.encode(
    test,
    truncation = True,
    padding = True,
    return_tensors = 'pt'
)

with torch.no_grad():
    output = model_fine_tuned(predict_input)[0]

prediction_value = torch.argmax(output, dim=1).numpy()[0]
df['class'] = df['class'].astype('category')

classes = df['class'].cat.categories

print(classes[prediction_value])

治験にご参加いただき ありがとうございます 治験への参加は、社会全体の健康増進と医学の進歩につながる 大変素晴らしい献身的な行動であると、私たちは確信しております。 このカードはアストラゼネカ社の治験番号：D081CC00006にご参加いただいた方のためものです。このカードは、一般の方々に 治験参加に関する情報を提供するために設立された独立非営利組織である「Center for Information & Study on Clinical Research Participation（CISCRP）」により作成されました。CISCRPは、治験の参加者募集や治験の実施には関与しておりません。 治験への参加 この治験にご参加いただいたあなたや、ご参加いた だいている多くの方々に、心より感謝申し上げます。 あなたには、アストラゼネカが依頼し、Breast International Group、Frontier Science & Technology Research Foundation、およびNRG Oncologyとの提携で調整された治験に参加いただきました。治験は、世界中の医師、科学者、 治験コーディネーター、参加者の皆様等から成る大規模なチームによって実施されます。あなたの参加のおかげで、 この薬オラパリブが、すでに手術や化学療法を受けた成人の乳がん患者さんに役立つかどうかを調べることができます。 治験が完了すると、結果の概要をwww.trialsummaries.comで閲覧いただけるようになります。詳細については、こちらのウェ ブサイトでご確認ください。登録手続きをしていただくと、治験結果の概要が準備でき次第、電子メールで通知いたします。 インターネットを使えない場合、または結果を印刷したものが必要な場合は、治験スタッフにお知らせください。治験に参加 されるすべての方々が、医学の進歩のために重要な役割を担っていることを誇りに思っていただけるよう願っています。 入手可能になった際、治験結果の要約を受け取るには、 www.trialsummaries.com にアクセスします Dコード D081CC00006を検索 電子メール通知を ご登録ください JA-JP 第1.0版 2023年8月28日 www.astrazeneca.com 
C14_01


# **Model Testing**

In [16]:
def calculate_accuracy(preds, labels):
    return accuracy_score(labels, preds)

In [17]:
test_dataset = preprocess_data(test_texts, test_labels, tokenizer_fine_tuned)
data_collator = DataCollatorWithPadding(tokenizer_fine_tuned)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)
model_fine_tuned.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {k: v for k, v in batch.items() if k != 'labels'}
        labels = batch['labels']
        outputs = model_fine_tuned(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

accuracy = calculate_accuracy(all_preds, all_labels)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8421052631578947
