In [19]:
# pip install --upgrade datasets

In [20]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/NLP/'
# base_path = ''

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [22]:
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

from transformers import EarlyStoppingCallback

SEED = 42

In [23]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    print(df)
    texts = df.text.values
    labels = df.label.values
    return texts, labels

In [24]:
texts, labels = load_data(base_path+"dataset/cleaned/paragraph_marking/merge.csv")
train_texts, temp_texts, train_labels, temp_labels = train_test_split(texts, labels, test_size=0.2, random_state=SEED)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=SEED)

                                                   text  label
0     Early-Onset Familial BRIEF COMMUNICATIONS dire...      0
1     Methods: Two sisters had early-onset parkinson...      0
2     Results: No mutations were found in the genes ...      0
3                          Ann Neurol 2006;59:859 – 862      0
4     Mitochondrial involvement in the pathogenesis ...      0
...                                                 ...    ...
4193  Lehmann D, Kornhuber ME, Clajus C, Alston CL, ...      0
4194  Ferrari G, Lamantea E, Donati A, Filosto M, Br...      0
4195  Laforet P, Lombes A, Eymard B, Danan C, Cheval...      0
4196  Publisher’s Note: All claims expressed in this...      0
4197  Copyright © 2022 Lang-Orsini and Gonzalez-Pere...      0

[4198 rows x 2 columns]


In [25]:
train_data = Dataset.from_dict({"text": train_texts, "label": train_labels})
val_data = Dataset.from_dict({"text": val_texts, "label": val_labels})
test_data = Dataset.from_dict({"text": test_texts, "label": test_labels})

In [26]:
model_name = "prajjwal1/bert-tiny"
# model_name = "bert-base-cased"
# model_name = "allenai/scibert_scivocab_uncased"
# model_name = "dmis-lab/biobert-base-cased-v1.1"
# model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

In [28]:
train_dataset = train_data.map(tokenize_function, batched=True)
val_dataset = val_data.map(tokenize_function, batched=True)
test_dataset = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/3358 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

Map:   0%|          | 0/420 [00:00<?, ? examples/s]

In [29]:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [30]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='macro')
    return {"accuracy": acc,"f1": f1}

In [31]:
training_args = TrainingArguments(
    output_dir=base_path+"./model/SC",
    num_train_epochs=15,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=SEED,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Training the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3701,0.29568,0.869048,0.587434
2,0.2554,0.228721,0.914286,0.822527
3,0.156,0.321966,0.902381,0.793463
4,0.1037,0.420134,0.864286,0.776744


TrainOutput(global_step=840, training_loss=0.2212991510118757, metrics={'train_runtime': 59.0677, 'train_samples_per_second': 852.75, 'train_steps_per_second': 53.329, 'total_flos': 17065181921280.0, 'train_loss': 0.2212991510118757, 'epoch': 4.0})

In [33]:
outputs = trainer.predict(test_dataset)
predictions_bert = np.argmax(outputs.predictions, axis=1)

In [34]:
tn, fp, fn, tp = confusion_matrix(test_dataset["label"], predictions_bert).ravel()
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

accuracy = accuracy_score(test_dataset["label"], predictions_bert)
print(f"Accuracy: {accuracy:.4f}")

precision = precision_score(test_dataset["label"], predictions_bert)
print(f"Precision: {precision:.4f}")

recall = recall_score(test_dataset["label"], predictions_bert)
print(f"Recall: {recall:.4f}")

f1 = f1_score(test_dataset["label"], predictions_bert)
print(f"F1-Score: {f1:.4f}")

Confusion Matrix: TN=336, FP=6, FN=35, TP=43
Accuracy: 0.9024
Precision: 0.8776
Recall: 0.5513
F1-Score: 0.6772


In [35]:
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

probabilities = softmax(outputs.predictions)
v_list = [0.5,0.4,0.3,0.2,0.1,0.05,0.03,0.01]
acc = []
pre = []
rec = []
f1_list = []
for i in v_list:
    pre_label = []
    for j in probabilities:
        if j[1]>i:
            pre_label.append(1)
        else:
            pre_label.append(0)

    accuracy = accuracy_score(test_dataset["label"], pre_label)
    acc.append(accuracy)

    precision = precision_score(test_dataset["label"], pre_label)
    pre.append(precision)

    recall = recall_score(test_dataset["label"], pre_label)
    rec.append(recall)

    f1 = f1_score(test_dataset["label"], pre_label)
    f1_list.append(f1)

print(acc)
print(pre)
print(rec)
print(f1_list)

[0.9023809523809524, 0.9023809523809524, 0.9095238095238095, 0.9023809523809524, 0.9023809523809524, 0.8642857142857143, 0.7833333333333333, 0.18571428571428572]
[0.8775510204081632, 0.8490566037735849, 0.8448275862068966, 0.7681159420289855, 0.7126436781609196, 0.591304347826087, 0.45806451612903226, 0.18571428571428572]
[0.5512820512820513, 0.5769230769230769, 0.6282051282051282, 0.6794871794871795, 0.7948717948717948, 0.8717948717948718, 0.9102564102564102, 1.0]
[0.6771653543307087, 0.6870229007633588, 0.7205882352941176, 0.7210884353741497, 0.7515151515151515, 0.7046632124352331, 0.6094420600858369, 0.3132530120481928]


In [36]:
pre_res = []
for k in probabilities:
    if k[1]>0.1:
        pre_res.append(1)
    else:
        pre_res.append(0)

tn, fp, fn, tp = confusion_matrix(test_dataset["label"], pre_res).ravel()
print(f"Confusion Matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

accuracy = accuracy_score(test_dataset["label"], pre_res)
print(f"Accuracy: {accuracy:.4f}")

precision = precision_score(test_dataset["label"], pre_res)
print(f"Precision: {precision:.4f}")

recall = recall_score(test_dataset["label"], pre_res)
print(f"Recall: {recall:.4f}")

f1 = f1_score(test_dataset["label"], pre_res)
print(f"F1-Score: {f1:.4f}")

Confusion Matrix: TN=317, FP=25, FN=16, TP=62
Accuracy: 0.9024
Precision: 0.7126
Recall: 0.7949
F1-Score: 0.7515
