In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import numpy as np
from sklearn.metrics import f1_score,precision_score,recall_score
import datasets
import os
import json

In [11]:

# Both configurations below are loaded from the same dataset repository.
SEMEVAL_DATASET = "bigbio/sem_eval_2024_task_2"

# Labelled statements (train / validation splits).
annotations = datasets.load_dataset(SEMEVAL_DATASET, name="sem_eval_2024_task_2_source")
# Clinical trial records; only the 'train' split is kept.
raw_texts = datasets.load_dataset(SEMEVAL_DATASET, name="sem_eval_2024_task_2_ct")['train']

Downloading builder script:   0%|          | 0.00/9.67k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.32M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [12]:
# Display the annotation splits with their columns and row counts.
annotations

DatasetDict({
    train: Dataset({
        features: ['id', 'type', 'section_id', 'primary_id', 'secondary_id', 'statement', 'label'],
        num_rows: 1700
    })
    validation: Dataset({
        features: ['id', 'type', 'section_id', 'primary_id', 'secondary_id', 'statement', 'label'],
        num_rows: 200
    })
})

In [13]:
# Display the columns and row count of the clinical trial records.
raw_texts

Dataset({
    features: ['clinical_trial_id', 'intervention', 'eligibility', 'results', 'adverse_events'],
    num_rows: 999
})

In [22]:
# Peek at the first training annotation to see the fields of a single example.
annotations['train'][0]

{'id': '00466f98-52b8-41f3-9bf1-2edaad950be9',
 'type': 'Comparison',
 'section_id': 'Intervention',
 'primary_id': 'NCT02504424',
 'secondary_id': 'NCT03708393',
 'statement': 'the primary trial and the secondary trial both require the patients to activate the interventions by remote control.',
 'label': 'Contradiction'}

In [14]:
# Index every clinical trial record by its trial id so that annotations can
# look up their primary / secondary trials directly.
id_to_clinical_trial_record = {
  record['clinical_trial_id']: record
  for record in raw_texts
}

In [27]:
# Spot-check the lookup table with one trial id.
id_to_clinical_trial_record["NCT00001832"]

{'clinical_trial_id': 'NCT00001832',
 'intervention': ['INTERVENTION 1: ',
  '  Abl Cells IV + Cyclophosphamide 30 mg/kg',
  '  Phase 1 Cyclophosphamide Dose Escalation: Fludarabine 5x25mg/m^2 + Cyclophosphamide 2x30mg/kg + Cells intravenous (IV)',
  'INTERVENTION 2: ',
  '  Abl Cells IV + Cyclophosphamide 60 mg/kg',
  '  Phase 1 Cyclophosphamide Dose Escalation: Fludarabine 5x25mg/m2 + Cyclophosphamide 2x60mg/kg + Cells intravenous (IV)'],
 'eligibility': ['INCLUSION CRITERIA',
  '  Patients must have evaluable metastatic melanoma that is refractory to standard therapy.',
  '  Age greater than or equal to 16 years.',
  '  Patients of both genders must be willing to practice birth control for four months after receiving the preparative regimen.',
  '  Clinical performance status of Eastern Cooperative Oncology Group (ECOG) 0, 1 at entry to the trial and at the time of chemotherapy induction.',
  '  Absolute neutrophil count greater than 1000/mm^3.',
  '  Platelet count greater than 100

In [15]:
#Mean cosine distance above which a statement is predicted to be a contradiction
CONTRADICTION_THRESHOLD = 0.9

def _section_distances(trial_id, section_name, statement):
  """Cosine distances between a statement and every entry of one trial section.

  Args:
    trial_id: key into id_to_clinical_trial_record.
    section_name: key of the section inside the trial record.
    statement: the statement text to compare against the section entries.

  Returns:
    The matrix returned by cosine_distances for the statement (single row)
    against the section entries (one column per entry).
  """
  section_entries = id_to_clinical_trial_record[trial_id][section_name]
  #The TF-IDF vocabulary is fitted on the section entries only
  vectorizer = TfidfVectorizer().fit(section_entries)
  statement_features = vectorizer.transform([statement])
  entry_features = vectorizer.transform(section_entries)
  #Cosine *distance* (not similarity): larger values mean less lexical overlap
  return cosine_distances(statement_features, entry_features)

results = {}

for instance in annotations["validation"]:
  #Turn the section id into a record key: lower-case, spaces replaced by underscores
  section_name = instance["section_id"].lower().replace(" ", "_")

  primary_scores = _section_distances(instance["primary_id"], section_name, instance["statement"])
  if instance["type"] == "Comparison":
    #Comparison statements also refer to a secondary trial: score it the same way
    secondary_scores = _section_distances(instance["secondary_id"], section_name, instance["statement"])
    #Pool the distances of all entries from both trials and average them
    score = np.average(np.concatenate([primary_scores[0], secondary_scores[0]]))
  else:
    score = np.average(primary_scores)

  #An average distance above the threshold is predicted as a contradiction
  if score > CONTRADICTION_THRESHOLD:
    prediction = "Contradiction"
  else:
    prediction = "Entailment"
  results[instance['id']] = prediction

In [16]:
def main():
    """Print F1, precision and recall of the predictions on the validation split.

    Both the predictions in ``results`` and the gold labels are binarised:
    "Entailment" becomes 1 and every other value becomes 0.
    """
    # One pass over the split, pairing each prediction with its gold label.
    outcome_pairs = [
        (results[example['id']] == "Entailment", example["label"] == "Entailment")
        for example in annotations["validation"]
    ]
    results_pred = [1 if predicted else 0 for predicted, _ in outcome_pairs]
    gold_labels = [1 if gold else 0 for _, gold in outcome_pairs]

    # Compute all three metrics before printing anything.
    f_score, p_score, r_score = [
        metric(gold_labels, results_pred)
        for metric in (f1_score, precision_score, recall_score)
    ]

    print('F1:{:f}'.format(f_score))
    print('precision_score:{:f}'.format(p_score))
    print('recall_score:{:f}'.format(r_score))

# Run the evaluation when this code is executed as the main module.
if __name__ == '__main__':
    main()

F1:0.502415
precision_score:0.485981
recall_score:0.520000


In [20]:
# Echo the predictions, then save them as indented JSON.
print(results)
with open("results123.json", 'w') as predictions_file:
    json.dump(results, predictions_file, indent=4)

{'00822101-fa21-4a26-bf45-3b1f535da005': 'Contradiction', '0131ebe3-746b-45a5-ba60-fdd326974fec': 'Contradiction', '015dd869-db90-45a9-a951-9dc7dc2b2e01': 'Entailment', '016943fa-57ea-4293-8131-03292ea1836c': 'Contradiction', '045ad91e-44fa-42cc-ac22-a7aab26a9009': 'Entailment', '06f798d1-2956-4903-83e9-58a41fa425f7': 'Entailment', '06fad978-1dc3-46f5-b45f-5ac6577f28b9': 'Entailment', '07042b0f-d422-49c3-8d53-81319927a4b3': 'Entailment', '080da381-b5dd-46d3-99c4-18112da153c1': 'Entailment', '09fc4356-6154-427e-b710-ed5df6470aff': 'Contradiction', '0a079cca-abde-42cf-9cec-a76e7a380e14': 'Entailment', '0a6d1b4c-244e-44e2-a229-62e4cbdfa979': 'Contradiction', '0b6cc8e3-69ee-4a91-b93d-2ad3fddce65f': 'Contradiction', '0cef8c8e-7986-46c7-a597-c5733a9899c0': 'Contradiction', '0ddb07ae-6e70-436d-8723-f609e59c57da': 'Entailment', '0e609fad-9a28-4fb6-90b6-32a731e3b02c': 'Entailment', '0ea29bd5-20be-40ea-8ad7-705d283d1d48': 'Contradiction', '1180dd58-61b8-4c48-ab00-7d458e68c85b': 'Contradiction', 

Compute F1 score, Precision, and Recall. Note that in the final evaluation, systems will be ranked by Faithfulness and Consistency, which cannot be computed on the training and development sets.