## Train the model using the SimCSE framework

In [None]:
# install SimCSE required dependencies
# The `simcse` pip package provides the `SimCSE` class imported below for inference.
!pip install simcse
!pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html
# NOTE(review): rdflib is not used by any cell visible in this notebook - confirm it is still needed.
!pip install rdflib
# The repository checkout provides `train.py`, which is run later to fine-tune the encoders.
# NOTE(review): the clone target `simcse` is relative to the current directory, while the
# requirements file below is read from the absolute path /content/simcse - these only agree
# when the notebook is started from /content.
!git clone https://github.com/princeton-nlp/SimCSE simcse
!pip install -r /content/simcse/requirements.txt
!pip install dill==0.3.2
# NOTE(review): transformers and datasets are installed unpinned after requirements.txt, so pip
# may replace versions installed by the line above - confirm this is intended.
!pip install transformers
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
from simcse import SimCSE
from tqdm import tqdm

import pandas as pd

import random
import csv
import torch

from datasets import Dataset, load_dataset, Split

# Reproducibility: feed one shared seed to every random number generator the
# notebook relies on (PyTorch CUDA, PyTorch CPU, NumPy, Python's `random`).
seed = 7631
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# Ask cuDNN to stick to deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def get_seen_unseen_split(train_df, test_df, label_col):
    """Partition ``test_df`` by whether each row's label also occurs in ``train_df``.

    Args:
        train_df: object whose ``train_df[label_col]`` yields the training labels.
        test_df: object exposing ``filter(predicate)``, where the predicate
            receives one row and the row supports ``row[label_col]``.
        label_col: name of the label column.

    Returns:
        A ``(seen, unseen)`` tuple: the rows of ``test_df`` whose label appears
        among the training labels, and the rows whose label does not.
    """
    train_labels = set(train_df[label_col])

    def label_is_known(row):
        return row[label_col] in train_labels

    def label_is_new(row):
        return row[label_col] not in train_labels

    return test_df.filter(label_is_known), test_df.filter(label_is_new)

In [None]:
# Load the two ADE corpora (all of their splits) and the full list of
# preferred terms, merged into a single split.
smm4h20 = load_dataset('KevinSpaghetti/smm4h20')
cadec = load_dataset('KevinSpaghetti/cadec')
all_pts = load_dataset('KevinSpaghetti/all_pts', split=Split.ALL)

# Lookup tables between the two columns of `all_pts`:
#   pt_vocab:       term  -> label
#   index_to_label: label -> term   (despite its name, the values are terms)
pt_vocab = {term: label for term, label in zip(all_pts['term'], all_pts['label'])}
index_to_label = {label: term for label, term in zip(all_pts['label'], all_pts['term'])}
print(len(pt_vocab))



  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/2 [00:00<?, ?it/s]



24571


In [None]:
from collections import defaultdict

class AutoAugmentDataset(Dataset):
    '''
    A dataset that uses the labels of the classes to construct the
    (anchor, positive) pairs used in contrastive learning.
    Can also return the (anchor, positive, negative) triple.

    Indexing returns a dict with the following keys:
    {
        'anchor': ...,          # the example stored at the requested index
        'positive': ...,        # a random example that shares the anchor's label
        'label': ...,           # the anchor's label (only if return_labels)
        'negative': ...,        # a random example with another label (only if return_negative)
        'label_negative': ...,  # the negative's label (only if return_negative and return_labels)
    }

    The positive is drawn from all examples with the anchor's label, so it can
    be the anchor itself.
    '''

    def __init__(self, examples, labels, return_negative = True, return_labels = True):
        """ Inits the dataset from two parallel sequences. Positives are drawn
        from examples that share the anchor's label, negatives are drawn from
        examples with any other label.

        Args:
            examples (Sequence): the values
            labels (Sequence): the label of each example, aligned with examples
            return_negative (bool): whether to return negatives in the pair
            return_labels (bool): whether to return the label associated with the positive and negative examples

        Raises:
            ValueError: if examples and labels do not have the same length
        """
        # NOTE(review): the base class __init__ is intentionally not called here,
        # exactly as in the original code; only __len__/__getitem__ are provided.
        self.examples = list(examples)
        self.labels = list(labels)
        if len(self.examples) != len(self.labels):
            raise ValueError(
                f"examples and labels must have the same length, "
                f"got {len(self.examples)} and {len(self.labels)}"
            )

        self.return_negative = return_negative
        self.return_labels = return_labels

        # label -> indices of every example carrying that label; built once so
        # that positive sampling does not have to scan the whole dataset.
        self._positive_indices_for_label = defaultdict(list)
        for index, label in enumerate(self.labels):
            self._positive_indices_for_label[label].append(index)

    def __len__(self):
        '''Number of examples (and therefore of anchors) in the dataset.'''
        return len(self.examples)

    def get_positive(self, anchor):
        '''Get a random example whose label is `anchor` (a label value, not an example).'''
        pos_idx = random.choice(self._positive_indices_for_label[anchor])
        return self.examples[pos_idx]

    def get_negative(self, anchor):
        '''Get a random example whose label differs from `anchor` (a label value).

        Returns:
            tuple: (negative_label, negative_example)

        Raises:
            ValueError: if no example has a label other than `anchor`
                (this includes the empty dataset)
        '''
        # Without this guard the rejection loop below would never terminate
        # when every example carries the requested label.
        if len(self._positive_indices_for_label.get(anchor, ())) >= len(self):
            raise ValueError(f"no negative example available for label {anchor!r}")

        # Rejection sampling: redraw until the drawn example has another label.
        neg_idx = random.randrange(0, len(self))
        while self.labels[neg_idx] == anchor:
            neg_idx = random.randrange(0, len(self))

        return (self.labels[neg_idx], self.examples[neg_idx])

    def __getitem__(self, idx):
        '''Build the contrastive sample for the example at position `idx`.'''
        anchor, label = self.examples[idx], self.labels[idx]

        result = {
            'anchor': anchor,
            'positive': self.get_positive(label)
        }

        if self.return_labels: result['label'] = label

        if not self.return_negative:
            return result

        # Negatives are selected by label, so the label (not the anchor text)
        # must be passed here.
        neg_label, neg_example = self.get_negative(label)

        result['negative'] = neg_example
        if self.return_labels: result['label_negative'] = neg_label

        return result


In [None]:
# Wrap each training split as (anchor, positive) pairs: the ADE mention is the
# example and its preferred term is the label. Negatives are switched off.
smm4h20_contrastive = AutoAugmentDataset(
    examples=smm4h20['train']['ade'],
    labels=smm4h20['train']['term_PT'],
    return_negative=False,
)
cadec_contrastive = AutoAugmentDataset(
    examples=cadec['train']['ade'],
    labels=cadec['train']['term_PT'],
    return_negative=False,
)

In [None]:
def create_dataset(df):
    """Materialise an indexable collection of pair dicts as a two-column frame.

    Args:
        df: object supporting ``len(df)`` and ``df[i]``, where every item is a
            mapping with at least the keys ``'anchor'`` and ``'positive'``.

    Returns:
        pd.DataFrame with the columns ``'anchor'`` and ``'positive'``, one row
        per item, in index order.
    """
    # Every index is read exactly once, in order, so that both columns of a row
    # come from the same item.
    pairs = [df[position] for position in range(len(df))]
    return pd.DataFrame({
        'anchor': [pair['anchor'] for pair in pairs],
        'positive': [pair['positive'] for pair in pairs],
    })

# Write one CSV of (anchor, positive) pairs per corpus, with a header row and
# without the pandas index column; these files are the training input for train.py.
# NOTE(review): the paths are relative to the current working directory and
# `to_csv` does not create `./content` - confirm the directory exists before running.
create_dataset(smm4h20_contrastive).to_csv('./content/smm4h20.csv', header=True, index=False)
create_dataset(cadec_contrastive).to_csv('./content/cadec.csv', header=True, index=False)

In [None]:
# Change the working directory to the SimCSE checkout so that `train.py` can be run directly.
# Relative paths used in the cells after this one are resolved from this directory.
%cd ./content/simcse

/content/simcse


In [None]:
# Fine-tune SciBERT with SimCSE's train.py, once per corpus, with identical hyper-parameters.
#
# Paths: this cell runs after `%cd ./content/simcse`, so relative paths are resolved from the
# SimCSE checkout. The training CSVs were written to `./content/<name>.csv` *before* that
# `%cd`, i.e. into the parent of the current directory, hence `--train_file ../<name>.csv`.
# `--output_dir` stays relative to the checkout; the model-loading cell reads the same path.

# CADEC
!python train.py \
    --model_name_or_path allenai/scibert_scivocab_uncased \
    --train_file ../cadec.csv \
    --output_dir ./content/model/cadec-result \
    --num_train_epochs 32 \
    --per_device_train_batch_size 128 \
    --learning_rate 5e-5 \
    --max_seq_length 32 \
    --pooler_type cls \
    --overwrite_output_dir \
    --temp 0.1 \
    --do_train
# SMM4H 2020
!python train.py \
    --model_name_or_path allenai/scibert_scivocab_uncased \
    --train_file ../smm4h20.csv \
    --output_dir ./content/model/smm4h20-result \
    --num_train_epochs 32 \
    --per_device_train_batch_size 128 \
    --learning_rate 5e-5 \
    --max_seq_length 32 \
    --pooler_type cls \
    --overwrite_output_dir \
    --temp 0.1 \
    --do_train

Downloading and preparing dataset csv/default to /content/simcse/./data/csv/default-327bd6d370818323/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...
Downloading data files: 100% 1/1 [00:00<00:00, 7049.25it/s]
Extracting data files: 100% 1/1 [00:00<00:00, 1239.45it/s]
Dataset csv downloaded and prepared to /content/simcse/./data/csv/default-327bd6d370818323/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.
100% 1/1 [00:00<00:00, 887.12it/s]
[INFO|file_utils.py:1272] 2022-10-31 12:03:29,966 >> https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp0tq1v7ob
Downloading: 100% 385/385 [00:00<00:00, 325kB/s]
[INFO|file_utils.py:1276] 2022-10-31 12:03:30,327 >> storing https://huggingface.co/allenai/scibert_scivocab_uncased/resolve/main/config.json in cache at /root/.cache/hugg

In [None]:
# Load each fine-tuned encoder from the output directory written by train.py and
# index every preferred term, so that ADE mentions can later be matched with `search`.
# NOTE(review): the device is hard-coded to 'cuda' here although a `device` variable with a
# CPU fallback is defined near the top of the notebook - this cell needs a GPU as written.
smm4h20_model = SimCSE("./content/model/smm4h20-result")
smm4h20_model.build_index(all_pts['term'], device='cuda')

cadec_model = SimCSE("./content/model/cadec-result")
cadec_model.build_index(all_pts['term'], device='cuda')

Some weights of BertModel were not initialized from the model checkpoint at /content/model/smm4h20-result and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 384/384 [00:16<00:00, 23.55it/s]
Some weights of BertModel were not initialized from the model checkpoint at /content/model/cadec-result and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 384/384 [00:16<00:00, 22.66it/s]


In [None]:
%%capture
seen, unseen = get_seen_unseen_split(smm4h20['train'], smm4h20['test'], label_col='term_PT')

# change to seen to get seen split results
smm4h20_results = smm4h20_model.search(unseen['ade'], top_k=5, device='cuda', threshold=0.01)
real_meddras = unseen['term_PT']

to_classify = len(unseen)
correctly_classified = 0
correct_meddra_in_top_5 = 0
wrongly_classified = 0

predicted_meddras = []
for result_list in smm4h20_results:
    res = []
    for (meddra, _) in result_list:
        res.append(meddra)
    predicted_meddras.append(res)

for (real_meddra, predicted_meddra_options) in tqdm(zip(real_meddras, predicted_meddras)):
    if predicted_meddra_options:
        if predicted_meddra_options[0] == real_meddra:
            correctly_classified += 1
        if real_meddra in predicted_meddra_options:
            correct_meddra_in_top_5 += 1
        else:
            wrongly_classified += 1
    else:
        wrongly_classified += 1




In [None]:
# Report the number of evaluated mentions, then top-1 and top-5 accuracy.
# Both values are printed as fractions of `to_classify` (not multiplied by 100);
# the divisions raise ZeroDivisionError if the evaluated split is empty.
print(to_classify)
print(f"correct%: {correctly_classified / to_classify}, top5% {correct_meddra_in_top_5 / to_classify}")

32
correct%: 0.28125, top5% 0.34375


In [None]:
%%capture
seen, unseen = get_seen_unseen_split(cadec['train'], cadec['test'], label_col='term_PT')
# change to seen to get seen split results
cadec_results = cadec_model.search(unseen['ade'], top_k=5, device='cuda', threshold=0.01)
real_meddras = unseen['term_PT']

to_classify = len(unseen)
correctly_classified = 0
correct_meddra_in_top_5 = 0
wrongly_classified = 0

predicted_meddras = []
for result_list in cadec_results:
    res = []
    for (meddra, _) in result_list:
        res.append(meddra)
    predicted_meddras.append(res)

for (real_meddra, predicted_meddra_options) in tqdm(zip(real_meddras, predicted_meddras)):
    if predicted_meddra_options:
        if predicted_meddra_options[0] == real_meddra:
            correctly_classified += 1
        if real_meddra in predicted_meddra_options:
            correct_meddra_in_top_5 += 1
        else:
            wrongly_classified += 1
    else:
        wrongly_classified += 1




In [None]:
# Report the number of evaluated mentions, then top-1 and top-5 accuracy.
# Both values are printed as fractions of `to_classify` (not multiplied by 100);
# the divisions raise ZeroDivisionError if the evaluated split is empty.
print(to_classify)
print(f"correct%: {correctly_classified / to_classify}, top5% {correct_meddra_in_top_5 / to_classify}")

38
correct%: 0.2894736842105263, top5% 0.47368421052631576
