# importing the libraries

In [7]:
!pip install datasets #installing  modules
!pip install datasets transformers
!pip install matplotlib
!pip install rouge



In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.translate.bleu_score import sentence_bleu
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import default_data_collator
from huggingface_hub import HfApi, HfFolder
from rouge import Rouge
from huggingface_hub import login
from transformers import pipeline
from datasets import load_dataset
import matplotlib.pyplot as plt
from datasets import Dataset
import transformers
import pandas as pd
import numpy
import torch
import re

In [9]:
# Fetch the train split of the "pqa_labeled" configuration of PubMedQA
dataset = load_dataset(path="pubmed_qa", name="pqa_labeled", split="train")

# Materialise the Hugging Face dataset as a pandas DataFrame for preprocessing
df = dataset.to_pandas()

ValueError: Invalid pattern: '**' can only be an entire path component

In [None]:
# Preview the first five rows of the raw frame
df.head(n=5)

# preprocessing


---
First, remove the columns the model does not need
(in this case `pubid`, `long_answer`, and `context`).


In [None]:
# Remove the columns the classifier does not use.
# Re-assigning (instead of inplace=True) and passing errors='ignore' keeps this cell
# re-runnable: executing it a second time no longer raises KeyError.
df = df.drop(columns=['pubid', 'long_answer', 'context'], errors='ignore')

In [None]:
# Preview the first five rows after dropping the unused columns
df.head(n=5)

In [None]:
# Rename the label column for readability ('question' already has the desired name)
df = df.rename(columns={'final_decision': 'answer'})
df.head()

In [None]:
# 1. Remove records whose question or answer is missing
df = df.dropna(subset=['question', 'answer'])


def _normalize_text(column):
    """Return a cleaned copy of a string column.

    The text is lower-cased, every character that is not a letter, digit or
    whitespace is removed, runs of whitespace are collapsed to a single space
    and leading/trailing whitespace is trimmed.
    """
    return (
        column.str.lower()                                 # 2. lower-case everything
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)    # 3. remove special characters
        .str.replace(r'\s+', ' ', regex=True)              #    collapse the extra spaces left behind
        .str.strip()                                       #    trim both ends
    )


# Build the cleaned columns with vectorised string methods instead of per-row re.sub calls
df = df.assign(
    question=_normalize_text(df['question']),
    answer=_normalize_text(df['answer']),
)

# 4. Remove duplicate rows (if any)
df = df.drop_duplicates(subset=['question', 'answer'])

df.head()

# split Data

In [None]:
# First cut: 80% of the rows go to training, the remaining 20% are held out
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)

# Second cut: halve the held-out rows into validation and test (10% of the data each)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Report how many rows ended up in each split
for split_name, split_frame in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_name} size:", len(split_frame))

# Persist the test split to disk
test_df.to_csv("test.csv")

# Plotting the Final Answer Distribution

In [None]:
import matplotlib.pyplot as plt

def plot_answer_pie(df, df_name):
    """Draw a pie chart of the values in the ``answer`` column of ``df``.

    Every slice is annotated with its percentage and absolute count, and the
    total number of answers is written in the centre of the chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``answer`` column.
    df_name : str
        Name of the frame, used as the prefix of the chart title.
    """
    # Colours are looked up per label rather than per slice position, so an answer
    # keeps its colour even when the frequency ranking differs between splits.
    label_colors = {'yes': 'red', 'no': 'green', 'maybe': 'orange'}
    fallback_color = 'gray'  # used for any label outside the three expected ones

    # Count the occurrences of each unique answer (sorted by frequency, descending)
    answer_counts = df['answer'].value_counts()
    # Total number of answers, used for the absolute counts and the centre text
    total_answers = answer_counts.sum()
    slice_colors = [label_colors.get(label, fallback_color) for label in answer_counts.index]

    plt.figure(figsize=(6, 6))
    plt.pie(
        answer_counts.values,
        labels=answer_counts.index,
        # autopct receives the slice percentage; convert it back to an absolute count
        autopct=lambda p: '{:.1f}%\n({:,.0f})'.format(p, p * total_answers / 100),
        colors=slice_colors,
        startangle=140,
    )

    # Total number of answers at the centre of the pie
    plt.text(0, 0, f'Total: {total_answers}', ha='center', va='center', fontsize=10, fontweight='bold')

    plt.title(f'{df_name} Final Answer Distribution')

    # Equal aspect ratio keeps the pie circular
    plt.axis('equal')

    plt.show()

In [None]:
# Answer distribution of the training split
plot_answer_pie(df=train_df, df_name='Train_df')

In [None]:
# Answer distribution of the validation split
plot_answer_pie(df=val_df, df_name='Validation_df')

In [None]:
# Answer distribution of the test split
plot_answer_pie(df=test_df, df_name='Test_df')

# Plotting the Distribution of Question Lengths

In [None]:
# Number of characters in every question of the full frame
question_lengths = df["question"].map(len)

# Histogram of the question lengths
plt.figure(figsize=(8, 5))
plt.hist(question_lengths, color='skyblue', edgecolor='black', bins=20)
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.title("Main DF Question Lengths")
plt.grid(True)
plt.show()

In [None]:
# Number of characters in every question of the training split
question_lengths = train_df["question"].map(len)

# Histogram of the question lengths
plt.figure(figsize=(8, 5))
plt.hist(question_lengths, color='skyblue', edgecolor='black', bins=20)
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.title("train_df Question Lengths")
plt.grid(True)
plt.show()

In [None]:
# Number of characters in every question of the test split
question_lengths = test_df["question"].map(len)

# Histogram of the question lengths
plt.figure(figsize=(8, 5))
plt.hist(question_lengths, color='skyblue', edgecolor='black', bins=20)
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.title("test_df Question Lengths")
plt.grid(True)
plt.show()

In [None]:
# Number of characters in every question of the validation split
question_lengths = val_df["question"].map(len)

# Histogram of the question lengths
plt.figure(figsize=(8, 5))
plt.hist(question_lengths, color='skyblue', edgecolor='black', bins=20)
plt.xlabel("Number of Characters")
plt.ylabel("Frequency")
plt.title("val_df Question Lengths")
plt.grid(True)
plt.show()

# Load the DistilBERT Model


In [None]:
# Checkpoint that will be fine-tuned
model_name = "distilbert-base-uncased"

# Tokenizer: turns text into the numeric inputs the model understands
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence classifier with 3 output classes (yes, no, maybe)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Expected warning, safe to ignore at this point:
#   "Some weights of DistilBertForSequenceClassification were not initialized from the model
#    checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias',
#    'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
#    You should probably TRAIN this model on a down-stream task to be able to use it for
#    predictions and inference."

# Fine-Tuning

In [None]:
# Answer text -> class id used for training (0: yes, 1: no, 2: maybe).
# Must stay the inverse of the `id2label` mapping used in the evaluation cell.
_LABEL2ID = {"yes": 0, "no": 1, "maybe": 2}


def tokenize_function(examples):
    """Tokenize a batch of questions and attach numeric labels for the Trainer.

    Parameters
    ----------
    examples : mapping
        Batch with a "question" entry (list of texts) and an "answer" entry
        (list of "yes" / "no" / "maybe" strings).

    Returns
    -------
    The output of the module-level ``tokenizer`` for the questions, with an
    additional "labels" entry holding one integer class id per example.

    Raises
    ------
    ValueError
        If an answer is not one of the known labels. Previously every
        unexpected value was silently trained as class 2 ("maybe").
    """
    answers = examples["answer"]

    # Fail loudly on unexpected labels instead of silently mapping them to "maybe"
    unknown = sorted({label for label in answers if label not in _LABEL2ID}, key=repr)
    if unknown:
        raise ValueError(f"Unexpected answer label(s): {unknown}; expected one of {list(_LABEL2ID)}")

    # Tokenize the 'question' column, padded/truncated to the model's maximum length
    tokenized_inputs = tokenizer(examples["question"], padding="max_length", truncation=True)
    # The Trainer expects the targets under the key 'labels', as integers
    tokenized_inputs["labels"] = [_LABEL2ID[label] for label in answers]
    return tokenized_inputs

In [None]:
# Wrap the training and validation DataFrames in Hugging Face Dataset objects
train_dataset, val_dataset = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

In [None]:
# Run the tokenization function over both datasets, one batch of rows at a time
train_dataset, val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Before you run this cell, make sure that you have an account [here](https://wandb.ai/authorize?ref=models)

In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    # where results and logs are written
    output_dir="./",
    logging_dir="./logs",
    logging_steps=10,                 # log every 10 steps
    # schedule
    eval_strategy="epoch",            # evaluate at the end of each epoch
    num_train_epochs=3,
    # batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,                # regularization
)

# Trainer wiring: model, arguments, training data and validation data
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Start the fine-tuning process
trainer.train()

In [None]:
# Saving the model locally (uncomment the lines below to use)
# model.save_pretrained("./saved_model")
# tokenizer.save_pretrained("./saved_model")

# Now we want to save our model to our Hugging Face account
To upload our model to the Hugging Face Hub, we first need to log in to our Hugging Face account using the login() function.
Before doing this, make sure you have created an access token from your Hugging Face account. Once the token is generated, copy it and paste it when prompted during the login process.



In [None]:
#login()

In [None]:
#uploading the model and tokenizer

# model.push_to_hub("khojoii/finetuned_distilbert_Medical_Model") #("your_username/your_model_name")
# tokenizer.push_to_hub("khojoii/finetuned_distilbert_Medical_Model")# ("your_username/your_model_name")

# Model evaluation

In [None]:
# Class id -> answer text; inverse of the mapping used in tokenize_function
id2label = {0: 'yes', 1: 'no', 2: 'maybe'}
EVAL_BATCH_SIZE = 32  # number of questions scored per forward pass

# Put the model in inference mode explicitly (disables dropout) instead of relying on
# whatever mode it was left in after training.
model.eval()

questions = test_df["question"].tolist()
true_labels = test_df["answer"].tolist()
preds = []

# Score the test questions in batches rather than one row at a time
for start in range(0, len(questions), EVAL_BATCH_SIZE):
    batch_questions = questions[start:start + EVAL_BATCH_SIZE]

    # padding=True pads each batch only to its longest question
    inputs = tokenizer(batch_questions, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # Highest-scoring class per question, converted back to its answer text
    preds.extend(id2label[class_id] for class_id in torch.argmax(logits, dim=1).tolist())

print("Accuracy:", accuracy_score(true_labels, preds))
# zero_division=0: a class that is never predicted gets precision 0 without a warning
print(classification_report(true_labels, preds, zero_division=0))

# The evaluation report:

The model's overall accuracy is about 55%, which can be considered either good or bad depending on the context and expectations. The model performs well on the 'yes' class, but struggles with the 'no' class and completely fails to predict the 'maybe' class. This is likely due to the small proportion of 'maybe' examples in the dataset, an imbalance that skews the model's predictions toward the larger classes.


