## 1. Read data

In [None]:
import pandas as pd
import numpy as np

from google.colab import drive
drive.mount('/content/drive/')

# Root of the shared SMM4H Task 1 folder on the mounted Drive; every data file
# read in this cell lives underneath it.
TASK1_DIR = '/content/drive/MyDrive/LHS 712_Group_Task1/SMM4H-Task1'


def read_headerless(relative_path, sep='\t'):
    """Read a header-less delimited file stored under ``TASK1_DIR``.

    Args:
        relative_path: File path relative to ``TASK1_DIR``.
        sep: Field delimiter (tab by default).

    Returns:
        A DataFrame with default integer column labels (``header=None``).
    """
    return pd.read_csv(f'{TASK1_DIR}/{relative_path}', sep=sep, header=None)


# Train_2024: gold annotations and the tweets they refer to.
train_gold_annotations = read_headerless('Train_2024/gold_annotations_complete.tsv')
train_tweets = read_headerless('Train_2024/tweets.tsv')

# Dev_2024: loaded into the test_* variables.
test_spans_norm = read_headerless('Dev_2024/gold_annotations_complete.tsv')
test_gold_annotations_for_evaluation = read_headerless('Dev_2024/gold_annotations_for_evaluation.tsv')
test_tweets = read_headerless('Dev_2024/tweets.tsv')

# sample_submission.tsv is the sample submission file for the dev set. Each row of this file contains a tweet ID and all the PT IDs from that tweet.
sample_submission = read_headerless('Sample_submissions/sample_submission.tsv')

# seen_concepts.txt contains all the unique preferred term IDs from the train set.
unique_id = read_headerless('Dev_2024/seen_concepts.txt', sep=',')

# llt.asc is the MedDRA file we used to annotate adverse drug events. Its first three columns contain the lowest level term (LLT) ID, the term text, and the preferred term (PT) ID, respectively.
class Meddra:
    """One MedDRA term: its PT ID, its LLT ID and the term text."""

    def __init__(self, ptid, lltid, text):
        # Plain value holder: the three fields are stored unchanged.
        self.ptid, self.lltid, self.text = ptid, lltid, text

def get_meddra_dict(meddra_llt):
    """Parse a MedDRA ``llt.asc`` file into PT and LLT lookup tables.

    Each usable line is ``$``-separated, with the LLT ID, the term text and the
    PT ID in its first three fields. Lines with fewer than three fields are
    skipped.

    Args:
        meddra_llt: Path of the ``llt.asc`` file to read.

    Returns:
        ``(pt_dict, llt_dict)``. ``llt_dict`` maps every LLT ID to its
        ``Meddra`` entry; ``pt_dict`` holds only the entries whose LLT ID
        equals their PT ID, keyed by PT ID.
    """
    pt_dict, llt_dict = {}, {}
    # The context manager closes the file even if parsing raises.
    with open(meddra_llt, 'r') as handle:
        for line in handle:
            # Drop the line terminator so a line with exactly three fields does
            # not leave a trailing newline glued to the PT ID.
            elems = line.rstrip('\r\n').split("$")
            if len(elems) > 2:
                lltid, text, ptid = elems[0], elems[1], elems[2]
                entry = Meddra(ptid, lltid, text)
                if ptid == lltid:
                    pt_dict[ptid] = entry
                llt_dict[lltid] = entry
    return pt_dict, llt_dict

pt_dict, llt_dict = get_meddra_dict('/content/drive/MyDrive/LHS 712_Group_Task1/SMM4H-Task1/Resource/llt.asc')

# Flatten the Meddra entries into row dicts; only the values are needed, the
# dictionary keys repeat the IDs already stored on each entry.
pt_data = [
    {'PTID': entry.ptid, 'LLTID': entry.lltid, 'Text': entry.text}
    for entry in pt_dict.values()
]
llt_data = [
    {'LLTID': entry.lltid, 'PTID': entry.ptid, 'Text': entry.text}
    for entry in llt_dict.values()
]

# One table per lookup: PT-level terms and all LLT-level terms.
pt_df = pd.DataFrame(pt_data)
llt_df = pd.DataFrame(llt_data)

Mounted at /content/drive/


## 2. Data cleaning and processing


In [None]:
# Both annotation files were read without a header row, so label their columns here.
train_columns = ['ID', 'ADE', 'Start', 'End', 'Text', 'Code']
evaluation_columns = ['ID', 'Text', 'Code']

train_gold_annotations.columns = train_columns
test_gold_annotations_for_evaluation.columns = evaluation_columns

In [None]:
# Lookup from LLT ID to PT ID, keyed by the LLT ID as a string.
llt_to_pt_mapping = pd.Series(llt_df.PTID.values,index=llt_df.LLTID.astype(str)).to_dict()

# Replace every annotated code with its PT ID; codes missing from the lookup
# become NaN. Both frames are cast to str first so the lookup also matches when
# pandas parsed the code column as integers.
train_gold_annotations['Code'] = train_gold_annotations['Code'].astype(str).map(llt_to_pt_mapping) # 1711 (author's note; presumably the row count)
test_gold_annotations_for_evaluation['Code'] = test_gold_annotations_for_evaluation['Code'].astype(str).map(llt_to_pt_mapping) # 87 (author's note; presumably the row count)

In [None]:
# Preview the training annotations after the LLT -> PT code mapping.
train_gold_annotations

Unnamed: 0,ID,ADE,Start,End,Text,Code
0,SMM4H2022yW4mXhmCE9gh1B7b,ADE,28,37,allergies,10013700
1,SMM4H2022afMxPjtHKyItF0n3,ADE,31,46,HURT YOUR Liver,10067125
2,SMM4H2022dc1lWHnZeBy441B8,ADE,48,50,AD,10003736
3,SMM4H2022dc1lWHnZeBy441B8,ADE,88,93,focus,10013496
4,SMM4H2022ExV2GXXaqhGjJNFM,ADE,11,15,died,10011906
...,...,...,...,...,...,...
1706,SMM4H2022Hj9Q5uULQp9gmUL3,ADE,48,54,orgasm,10002652
1707,SMM4H2022BMwmZ6uAtOiMe6nn,ADE,91,116,never have another orgasm,10002652
1708,SMM4H2022ZfPkntGTjmTHbUOc,ADE,65,69,coma,10041349
1709,SMM4H2022otIg7pDmo1eeihoT,ADE,72,91,gain so much weight,10047899


In [None]:
# No-op: a bare expression that is not the cell's last statement is not displayed.
test_gold_annotations_for_evaluation
# NOTE(review): this df_test is replaced in the next cell, once 'Code' has been
# renamed to 'PTID'; re-running this cell after that rename raises a KeyError.
df_test = test_gold_annotations_for_evaluation[["Text","Code"]]
# Disabled experiment: integer-encode the codes with a LabelEncoder.
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df_test['Label'] = le.fit_transform(df_test['Code'])
# df_test = df_test[["Text","Label"]]

In [None]:
# Rename the gold code column to PTID (it holds PT IDs after the LLT -> PT
# mapping). rename() without inplace rebinds the name instead of mutating the
# frame in place.
test_gold_annotations_for_evaluation = test_gold_annotations_for_evaluation.rename(columns={'Code': 'PTID'})

# .copy() makes df_test an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df_test = test_gold_annotations_for_evaluation[["Text", "PTID"]].copy()
df_test

Unnamed: 0,Text,PTID
0,nerves,10052897
1,muscle spasms,10028334
2,gaining,10047899
3,gain like 50 pounds,10047899
4,frontal headache,10019211
...,...,...
82,cotton mouth,10013781
83,couldn't fall asleep,10022035
84,OCD,10029898
85,addicted,10013663


In [None]:
# df_test['Code_new'] = le.inverse_transform(df_test['Label'])
# df_test

In [None]:
# Training table = annotated tweet spans + every MedDRA LLT term, each paired
# with its PT ID. Rows without a PT ID are dropped from df_train.
train_gold_annotations.rename(columns={'Code': 'PTID'}, inplace=True)

text_ptid_columns = ['Text', 'PTID']
df_training_all = pd.concat(
    [train_gold_annotations[text_ptid_columns], llt_df[text_ptid_columns]],
    ignore_index=True,
)
df_train = df_training_all[["Text","PTID"]].dropna(subset=['PTID'])

In [None]:
# Preview the combined training table (tweet spans plus MedDRA LLT terms).
df_train

Unnamed: 0,Text,PTID
0,allergies,10013700
1,HURT YOUR Liver,10067125
2,AD,10003736
3,focus,10013496
4,died,10011906
...,...,...
81213,Gallbladder removal,10008611
81214,Herpetic keratouveitis,10062004
81215,Muscular back pain,10003988
81216,Computerized tomogram spine,10081777


In [None]:
# Number of distinct PT IDs among the training examples.
train_ptids = df_train['PTID']
unique_ptid_count = train_ptids.nunique()
print(f"Unique PTID count: {unique_ptid_count}")


Unique PTID count: 23389


In [None]:
# # prompt: Using dataframe df_training_all: want to make label numerical class
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# df_training_all['Label'] = le.fit_transform(df_training_all['PTID'])
# training_data = df_training_all[["Text","Label"]]
# training_data

In [None]:
# # To reverse the encoding and retrieve original labels
# training_data['Label_back'] = le.inverse_transform(df_training_all['Label'])
# # Display the dataframe to verify the changes
# training_data

In [None]:
# Same count taken on the MedDRA LLT table alone, for comparison with df_train.
llt_ptids = llt_df['PTID']
unique_ptid_count = llt_ptids.nunique()
print(f"Unique PTID count: {unique_ptid_count}")

Unique PTID count: 23389


In [None]:
# Spot check: LLT rows whose preferred term is 10011921 (the saved output is empty).
has_target_ptid = llt_df['PTID'].eq("10011921")
llt_df[has_target_ptid]

Unnamed: 0,LLTID,PTID,Text


In [None]:
# Spot check: training examples labelled with PT 10013573.
# NOTE(review): the saved output shows an `encoded_PTID` column that is only
# added in the label-encoding cell further down, so this cell was last run out
# of order.
has_target_ptid = df_train['PTID'].eq("10013573")
df_train[has_target_ptid]

Unnamed: 0,Text,PTID,encoded_PTID
112,dizzy,10013573,2814
153,dizzy,10013573,2814
578,DIZZING,10013573,2814
706,dizziness,10013573,2814
821,dizzy,10013573,2814
844,dizzy,10013573,2814
845,lightheadeded,10013573,2814
966,dizzy,10013573,2814
1028,dizzy,10013573,2814
1032,lightheaded,10013573,2814


## 3. Train a model and use it to predict

In [None]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

In [None]:
model_name = "dmis-lab/biobert-v1.1"
tokenizer = BertTokenizer.from_pretrained(model_name)

# One output unit per distinct PT ID in the training table. Deriving the count
# from the data keeps the classifier head the same size as the label set that
# the LabelEncoder is fitted on (the value was previously hard-coded as 23389).
num_labels = int(df_train['PTID'].nunique())
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Map PT IDs to consecutive integer class indices.
label_encoder = LabelEncoder()

# Fit on the training labels. assign() returns a new frame, so no column is
# written into a frame that pandas may treat as a copy of another one
# (avoids SettingWithCopyWarning).
df_train = df_train.assign(encoded_PTID=label_encoder.fit_transform(df_train['PTID']))

# Encode the test labels with the same fitted encoder. transform() raises
# ValueError for a PT ID that never occurred in df_train.
df_test = df_test.assign(encoded_PTID=label_encoder.transform(df_test['PTID']))

# Save the encoder to disk so predictions can be decoded back to PT IDs later.
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [None]:
# Pull texts and encoded labels out as arrays. No split happens here: df_train
# supplies the training arrays and df_test the validation arrays.
train_texts, train_labels = df_train['Text'].values, df_train['encoded_PTID'].values
val_texts, val_labels = df_test['Text'].values, df_test['encoded_PTID'].values

In [None]:
# Tokenize the data
def tokenize_data(texts):
    """Tokenize a collection of strings with the notebook-level ``tokenizer``.

    Args:
        texts: The strings to tokenize. Anything exposing ``.tolist()`` (NumPy
            array, pandas Series) is converted with it; any other iterable,
            such as a plain list or tuple, is converted with ``list()``.

    Returns:
        Whatever ``tokenizer`` returns when called with ``padding=True``,
        ``truncation=True`` and ``return_tensors="pt"``.
    """
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    return tokenizer(
        text_list,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

# Tokenize the training and validation data
# Each call tokenizes its whole array in a single tokenizer call.
train_encodings = tokenize_data(train_texts)
val_encodings = tokenize_data(val_texts)

In [None]:
# Wrap the encoded label arrays as tensors (rebinding the same two names).
train_labels, val_labels = torch.tensor(train_labels), torch.tensor(val_labels)

In [None]:
from torch.utils.data import DataLoader, TensorDataset

def _to_tensor_dataset(encodings, labels):
    """Bundle input IDs, attention masks and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)


# One dataset per split, each item being (input_ids, attention_mask, label).
train_dataset = _to_tensor_dataset(train_encodings, train_labels)
val_dataset = _to_tensor_dataset(val_encodings, val_labels)

# Only the training loader shuffles; validation keeps the original row order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

# torch.optim.AdamW replaces transformers.AdamW, which Hugging Face deprecated
# in favour of the PyTorch implementation. eps and weight_decay are passed
# explicitly with the old class's default values so the optimisation setup
# stays the same (PyTorch's own defaults differ).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Number of full passes over train_loader.
epochs = 8

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move it to the model's device.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model return the classification loss.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")



Epoch 1, Loss: 5.829444418278458
Epoch 2, Loss: 4.666947816708249
Epoch 3, Loss: 3.8096489133340814
Epoch 4, Loss: 3.1865171073975
Epoch 5, Loss: 2.6857368718933254
Epoch 6, Loss: 2.323192537999045
Epoch 7, Loss: 2.018549897620498
Epoch 8, Loss: 1.7672935557702654


### Evaluate

In [None]:
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model.eval()
predictions, true_labels = [], []

# Inference only: no_grad() avoids building the autograd graph for any of the
# tensors produced in the loop.
with torch.no_grad():
    for batch in val_loader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        pred_labels = logits.argmax(dim=1)
        predictions.extend(pred_labels.cpu().numpy())
        true_labels.extend(b_labels.cpu().numpy())

accuracy = accuracy_score(true_labels, predictions)
# zero_division=0 scores classes with no predicted or no true samples as 0,
# exactly as the default "warn" setting does, but without emitting an
# UndefinedMetricWarning for each affected metric.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    true_labels, predictions, average='macro', zero_division=0
)

print("Validation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Validation Metrics:
Accuracy: 0.6897
Precision: 0.4948
Recall: 0.5123
F1 Score: 0.4963


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Persist the fine-tuned weights and the tokenizer files in the same folder.
output_dir = './biobert_for_meddra'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('./biobert_for_meddra/tokenizer_config.json',
 './biobert_for_meddra/special_tokens_map.json',
 './biobert_for_meddra/vocab.txt',
 './biobert_for_meddra/added_tokens.json')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import numpy as np
import joblib

# Reload the fine-tuned model and tokenizer from the folder they were saved to
model_path = './biobert_for_meddra'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Load the label encoder that was dumped with joblib after fitting
label_encoder = joblib.load('label_encoder.pkl')

# Load the texts to classify (presumably GPT-extracted ADR spans, going by the file name - confirm).
# NOTE(review): this rebinds `train_tweets`, which held the training tweets read in the first cell.
train_tweets = pd.read_csv('/content/drive/MyDrive/LHS 712_Group_Task1/SMM4H-Task1/GPT_results_for_ADRtext_1by1_new.csv')
test_texts = train_tweets['ADRtexts']  # text column that is fed to the classifier
tweet_ids = train_tweets['TweetID']  # tweet ID belonging to each text row

# Prepare the data for prediction
def prepare_data(texts):
    """Tokenize ``texts`` (an object exposing ``.tolist()``) into padded,
    truncated PyTorch tensors using the notebook-level ``tokenizer``."""
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

def predict(texts, batch_size=32):
    """Predict a MedDRA ID for every text, running the model in mini-batches.

    The input is processed ``batch_size`` rows at a time so that memory use no
    longer grows with the total number of texts. Padding is applied per batch
    by ``prepare_data``.

    Args:
        texts: pandas Series or NumPy array of strings (must support slicing
            and, per slice, ``.tolist()`` as required by ``prepare_data``).
        batch_size: Maximum number of texts per forward pass.

    Returns:
        The labels produced by ``label_encoder.inverse_transform`` for the
        arg-max class of each text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Set up the device and make sure the model lives on it.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    batch_predictions = []
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        # .iloc keeps the slice positional for a Series whatever its index is.
        chunk = texts.iloc[start:stop] if hasattr(texts, 'iloc') else texts[start:stop]

        encodings = prepare_data(chunk)
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)

        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits

        # Class index with the highest logit, moved back to the CPU.
        batch_predictions.append(logits.argmax(dim=1).cpu().numpy())

    if batch_predictions:
        predictions = np.concatenate(batch_predictions)
    else:
        # Empty input: nothing to concatenate, decode an empty index array.
        predictions = np.array([], dtype=int)

    # Decode class indexes to actual class labels (MedDRA IDs).
    return label_encoder.inverse_transform(predictions)

# The loaded texts wrap each span in square brackets (e.g. "[panic attack]"),
# while the training texts contain none. Strip the brackets before inference so
# the classifier sees input shaped like what it was trained on.
model_inputs = test_texts.str.replace(r'\[|\]', '', regex=True)

# Making predictions
predicted_meddra_ids = predict(model_inputs)

# Mapping tweet IDs to predicted MEDDRA IDs. 'ADRtext' keeps the text exactly
# as loaded; it is cleaned in a later cell.
results = pd.DataFrame({
    'Tweet ID': tweet_ids,
    'ADRtext': test_texts,
    'Predicted MEDDRA ID': predicted_meddra_ids
})


In [None]:
# Inspect the raw predictions (the ADR text still carries its square brackets).
results

Unnamed: 0,Tweet ID,ADRtext,Predicted MEDDRA ID
0,SMM4H2024MIGD1X8bP8fcNmo,[panic attack],10033664
1,SMM4H2024h7F6jxDd1ImBVcO,[make me dizzy],10013573
2,SMM4H20248mtEiEmGomSl9un,No adverse drug effects detected.,10067482
3,SMM4H2024dWqY183jjokvCO1,[extra shattered],10021588
4,SMM4H2024dWqY183jjokvCO1,[can't keep my eyes open],10041349
...,...,...,...
1017,SMM4H2024KcZPiaCotLLuuPQ,[headaches],10019211
1018,SMM4H2024KcZPiaCotLLuuPQ,[generally feel],10025482
1019,SMM4H202408DS8zgsUbIMUBu,[gain weight],10047899
1020,SMM4H2024mIAQr6RPpsKDVBQ,[get the noises when I'm tired],10022437


In [None]:
# Strip the square brackets that wrap each ADR span.
results['ADRtext'] = results['ADRtext'].str.replace(r'\[|\]', '', regex=True)

# Placeholder messages that stand for "nothing found"; rows carrying one of
# them are not real ADR mentions and are removed.
no_ade_messages = [
    "No adverse drug effects detected.",
    "No adverse drug effects mentioned.",
    "No adverse drug effects detected in the text.",
    "No adverse drug effects mentioned in square brackets.",
]
is_placeholder = results['ADRtext'].isin(no_ade_messages)
results = results[~is_placeholder]

In [None]:
# Inspect the cleaned predictions: brackets stripped and "no adverse drug effects" rows removed.
results

Unnamed: 0,Tweet ID,ADRtext,Predicted MEDDRA ID
0,SMM4H2024MIGD1X8bP8fcNmo,panic attack,10033664
1,SMM4H2024h7F6jxDd1ImBVcO,make me dizzy,10013573
3,SMM4H2024dWqY183jjokvCO1,extra shattered,10021588
4,SMM4H2024dWqY183jjokvCO1,can't keep my eyes open,10041349
5,SMM4H2024dWqY183jjokvCO1,fatigue,10016256
...,...,...,...
1016,SMM4H2024KcZPiaCotLLuuPQ,bad stomach,10013946
1017,SMM4H2024KcZPiaCotLLuuPQ,headaches,10019211
1018,SMM4H2024KcZPiaCotLLuuPQ,generally feel,10025482
1019,SMM4H202408DS8zgsUbIMUBu,gain weight,10047899


In [None]:
# Write the predictions as a tab-separated file with no header row and no index column.
results.to_csv(
    'prediction_4_21.tsv',
    sep='\t',
    index=False,
    header=False,
)
