## GOAL 1 : Have a list of conept ID and their synonyms to play with it for MCQ questions

In [1]:
# Define the input file path
input_file = 'MRCONSO_ENG.txt'

# Initialize a dictionary to hold the concept IDs and their synonyms/preferred terms
concept_dict = {}

# Open the filtered file and process each line
with open(input_file, 'r', encoding='utf-8') as file:
    for line in file:
        # Split the line by '|'
        fields = line.strip().split('|')
        
        # Extract the relevant fields
        concept_id = fields[0]  # CUI
        term = fields[14]  # The synonym or preferred term
        
        # Add the term to the dictionary under the corresponding concept ID
        if concept_id in concept_dict:
            concept_dict[concept_id].append(term)
        else:
            concept_dict[concept_id] = [term]



# Optionally, write the dictionary to an output file for future use
# output_file = 'path/to/concept_terms.txt'
# with open(output_file, 'w', encoding='utf-8') as out_file:
#     for concept_id, terms in concept_dict.items():
#         out_file.write(f'{concept_id}: {", ".join(terms)}\n')

# print(f"Concept terms written to {output_file}")


### NOW I have a dictionnary with the unique concept ID and their correspondig synonyms


In [None]:
# Print the resulting dictionary (or process it further as needed)
for concept_id, terms in concept_dict.items():
    print(f'{concept_id}: {terms}')
    
'''
The output will look like this:
C0000005: ['2-Methoxyestradiol', '2-Methoxyoestradiol', '2-MeO-E2', '2-MeO-Estradiol', '2-Methoxy-17 beta-estradiol', '2-Methoxy-17 beta-oestradiol', '2-Methoxy-17beta-estradiol', '2-Methoxy-17beta-oestradiol', '2-Methoxyestradiol']
C0000008: ['N-Methylaspartate', 'N-Methyl-D-aspartate', 'N-Methyl-D-aspartic acid', 'N-Methyl-D-aspartate', 'N-Methyl-D-aspartic acid', 'N-Methyl-D-aspartate']
'''

### Let's now filter those concept with only drug semantic types : drug_semantic_types = [
    'T121',  # Pharmacologic Substance
    'T200',  # Clinical Drug
]

In [4]:
# how many unique concept IDs are there?
print(f"Number of unique concept IDs: {len(concept_dict)}")

Number of unique concept IDs: 3210943


In [5]:
# Define the input file path for MRSTY.RRF
mrsty_file = 'MRSTY.RRF'

# Define the desired drug semantic types
drug_semantic_types = ['T121', 'T200']

# Step 1: Create a set of concept IDs with the desired semantic types
drug_concept_ids = set()

# Read the MRSTY.RRF file and collect concept IDs with desired semantic types
with open(mrsty_file, 'r', encoding='utf-8') as file:
    for line in file:
        fields = line.strip().split('|')
        concept_id = fields[0]
        semantic_type = fields[1]
        if semantic_type in drug_semantic_types:
            drug_concept_ids.add(concept_id)

# Step 2: Filter the concept dictionary to include only concept IDs in drug_concept_ids
filtered_concept_dict = {concept_id: terms for concept_id, terms in concept_dict.items() if concept_id in drug_concept_ids}

# # Print the filtered dictionary (or process it further as needed)
# for concept_id, terms in filtered_concept_dict.items():
#     print(f'{concept_id}: {terms}')

# # Optionally, write the filtered dictionary to an output file for future use
# output_file = 'path/to/filtered_concept_terms.txt'
# with open(output_file, 'w', encoding='utf-8') as out_file:
#     for concept_id, terms in filtered_concept_dict.items():
#         out_file.write(f'{concept_id}: {", ".join(terms)}\n')

# print(f"Filtered concept terms written to {output_file}")


In [6]:
# size of the filtered dictionary
print(f"Number of unique concept IDs with drug semantic types: {len(filtered_concept_dict)}")

Number of unique concept IDs with drug semantic types: 326214


In [8]:
# print the first 10 concept IDs and their terms
for i, (concept_id, terms) in enumerate(filtered_concept_dict.items()):
    print(f'{concept_id}: {terms}')
    if i == 9:
        break

C0000005: ['(131)I-Macroaggregated Albumin', '(131)I-MAA']
C0000039: ['1,2-dipalmitoylphosphatidylcholine', '1,2-dipalmitoylphosphatidylcholine', '1,2-Dipalmitoylphosphatidylcholine', '1,2 Dipalmitoylphosphatidylcholine', '1,2-Dihexadecyl-sn-Glycerophosphocholine', '1,2 Dihexadecyl sn Glycerophosphocholine', '1,2-Dipalmitoyl-Glycerophosphocholine', '1,2 Dipalmitoyl Glycerophosphocholine', 'Dipalmitoylphosphatidylcholine', 'Dipalmitoylphosphatidylcholine', 'Dipalmitoylphosphatidylcholine', 'Dipalmitoylphosphatidylcholine', 'Dipalmitoylphosphatidylcholine', 'Dipalmitoylglycerophosphocholine', 'Dipalmitoyllecithin', '3,5,9-Trioxa-4-phosphapentacosan-1-aminium, 4-hydroxy-N,N,N-trimethyl-10-oxo-7-((1-oxohexadecyl)oxy)-, inner salt, 4-oxide', 'Dipalmitoyl Phosphatidylcholine', 'Phosphatidylcholine, Dipalmitoyl']
C0000096: ['1-Methyl-3-isobutylxanthine', '1 Methyl 3 isobutylxanthine', '3-Isobutyl-1-methylxanthine', '3 Isobutyl 1 methylxanthine', 'IBMX', 'Isobutyltheophylline', '1H-Purine-2,6-

In [7]:
# store the filtered dictionary in a file
output_file = 'filtered_by_concept_terms.txt'
with open(output_file, 'w', encoding='utf-8') as out_file:
    for concept_id, terms in filtered_concept_dict.items():
        out_file.write(f'{concept_id}: {", ".join(terms)}\n')

## GOAL 2 : Retrieve relationships between those 

In [9]:
# Define the input file path for MRREL.RRF
mrrel_file = 'MRREL.RRF'

# Initialize a set to hold unique relationship types
unique_relationships = set()

# Read the MRREL.RRF file and collect unique relationship types
with open(mrrel_file, 'r', encoding='utf-8') as file:
    for line in file:
        fields = line.strip().split('|')
        relationship_type = fields[7]  # Relationship type is in the 8th field (index 7)
        unique_relationships.add(relationship_type)




I have all the uniuqe relationships

In [15]:
output_file = 'unique_relationships.txt'
with open(output_file, 'w', encoding='utf-8') as out_file:
    for relationship in unique_relationships:
        out_file.write(f'{relationship}\n')

print(f"Unique relationship types written to {output_file}")

Unique relationship types written to unique_relationships.txt


#### We are interested by only some relations so we filter the MRREL file : 

In [1]:
# Define the input and output file paths
mrrel_file = 'MRREL.RRF'
output_file = 'filtered_MRREL_by_relation.txt'

# Define the desired semantic relationship types
desired_relationship_types = {
    'has_contraindicated_class',
    'contraindicated_mechanism_of_action_of',
    'contraindicated_with_disease',
    'has_contraindicated_drug',
    'contraindicated_class_of',
    'has_contraindicated_class',
    'has_contraindicated_physiologic_effect',
    'has_contraindicated_mechanism_of_action',
    "effect_may_be_inhibited_by"
}

# Open the output file for writing
with open(output_file, 'w', encoding='utf-8') as out_file:
    # Read the MRREL.RRF file and filter lines based on the desired relationship types
    with open(mrrel_file, 'r', encoding='utf-8') as file:
        for line in file:
            fields = line.strip().split('|')
            relationship_type = fields[7]  # Relationship type is in the 8th field (index 7)
            if relationship_type in desired_relationship_types:
                out_file.write(line)

print(f"Filtered relationship types written to {output_file}")


Filtered relationship types written to filtered_MRREL_by_relation.txt


In [2]:
# check the len of the filtered relationship types
filtered_relationships = 'filtered_MRREL_by_relation.txt'
with open(filtered_relationships, 'r', encoding='utf-8') as file:
    count = sum(1 for line in file)
    
print(f"Number of filtered relationship types: {count}") 

Number of filtered relationship types: 27714


####  Now we have the concept ids of intereest and the relation type of interest but maybe some relations points out concept that we are not interested so we'll remove them 

In [39]:
# Define file paths
filtered_by_concept_file = 'filtered_by_concept_terms.txt'
filtered_mrrel_file = 'filtered_MRREL_by_relation.txt'
output_file = 'MR_REL_relation_interest_121_200.txt'

# Load concept IDs from filtered_by_concept_terms.txt into a set
filtered_concept_ids = set()
with open(filtered_by_concept_file, 'r', encoding='utf-8') as concept_file:
    for line in concept_file:
        concept_id = line.split(':')[0].strip()  # Assuming format is 'concept_id: term1, term2, ...'
        filtered_concept_ids.add(concept_id)

# Open the output file for writing
with open(output_file, 'w', encoding='utf-8') as out_file:
    # Read filtered_MRREL_by_relation.txt and filter lines based on concept IDs
    with open(filtered_mrrel_file, 'r', encoding='utf-8') as rel_file:
        for line in rel_file:
            fields = line.strip().split('|')
            concept1_id = fields[0]
            concept2_id = fields[4]
            if concept1_id in filtered_concept_ids and concept2_id in filtered_concept_ids:
                out_file.write(line)

print(f"Filtered relationships of interest written to {output_file}")


Filtered relationships of interest written to MR_REL_relation_interest_121_200.txt


In [40]:
# size of the filtered relationship types
filtered_relationships = 'MR_REL_relation_interest_121_200.txt'
with open(filtered_relationships, 'r', encoding='utf-8') as file:
    count = sum(1 for line in file)
    
print(f"Number of filtered relationship types: {count}")

Number of filtered relationship types: 3062


### Finally we only have 4 relationships

In [41]:
# Define file path for the filtered relationships of interest
filtered_relationships_file = 'MR_REL_relation_interest_121_200.txt'

# Set to store unique relationship types
unique_relationship_types = set()

# Read the filtered relationships file and count unique relationship types
with open(filtered_relationships_file, 'r', encoding='utf-8') as file:
    for line in file:
        fields = line.strip().split('|')
        relationship_type = fields[7]  # Assuming relationship type is in the 8th field (index 7)
        unique_relationship_types.add(relationship_type)

# Count of distinct relationship types
distinct_relationship_count = len(unique_relationship_types)

print(f"Number of distinct relationship types: {distinct_relationship_count}")
# what are the distinct relationship types?
for i, relationship_type in enumerate(unique_relationship_types):
    print(relationship_type)
    if i == 9:
        break


Number of distinct relationship types: 4
contraindicated_with_disease
has_contraindicated_drug
contraindicated_class_of
has_contraindicated_class


In [36]:
# show the first 10 relationships
with open(filtered_relationships_file, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        print(line.strip())
        if i == 9:
            break

C0001655|A31744212|SCUI|RO|C3494354|A20911324|SCUI|has_contraindicated_drug|R182890245||MED-RT|MED-RT||N|N||
C0002371|A31694655|SCUI|RO|C0304227|A15579450|SCUI|has_contraindicated_drug|R182877397||MED-RT|MED-RT||N|N||
C0002372|A31694657|SCUI|RO|C0304227|A15579450|SCUI|has_contraindicated_drug|R182909004||MED-RT|MED-RT||N|N||
C0003596|A31646471|SCUI|RO|C2936526|A18460829|SCUI|has_contraindicated_drug|R182900089||MED-RT|MED-RT||N|N||
C0010620|A31724503|SCUI|RO|C0016365|A31710755|SCUI|effect_may_be_inhibited_by|R176752452||MED-RT|MED-RT||Y|N||
C0010620|A31724503|SCUI|RO|C0733380|A31705980|SCUI|effect_may_be_inhibited_by|R176830217||MED-RT|MED-RT||Y|N||
C0016365|A31710755|SCUI|RO|C0010620|A31724503|SCUI|may_inhibit_effect_of|R176807000||MED-RT|MED-RT||N|N||
C0031507|A31643837|SCUI|RO|C0288165|A0914523|SCUI|has_contraindicated_drug|R182899561||MED-RT|MED-RT||N|N||
C0040778|A31698854|SCUI|RO|C0006644|A0013998|SCUI|has_contraindicated_drug|R182899648||MED-RT|MED-RT||N|N||
C0050559|A31686328|S

# Now we can generate sentences 

In [30]:
import random
import re

# Define file paths
filtered_concept_file = 'filtered_by_concept_terms.txt'
filtered_relationships_file = 'MR_REL_relation_interest_121_200.txt'
output_file = 'contraindication_sentence.txt'

# Dictionary to store concept IDs and their synonyms
concept_synonyms = {}

# Function to check if a string is purely numeric
def is_numeric(s):
    return s.isdigit()

# Read concept IDs and synonyms from filtered_by_concept_terms.txt
with open(filtered_concept_file, 'r', encoding='utf-8') as concept_file:
    for line in concept_file:
        parts = line.strip().split(':')
        concept_id = parts[0].strip()
        synonyms = [syn.strip() for syn in parts[1].split(',') if not is_numeric(syn.strip())]
        concept_synonyms[concept_id] = synonyms

# List to store generated sentences
generated_sentences = []

# Read filtered relationships and generate sentences
with open(filtered_relationships_file, 'r', encoding='utf-8') as rel_file:
    for line in rel_file:
        fields = line.strip().split('|')
        concept1_id = fields[0]
        concept2_id = fields[4]
        relationship_type = fields[7]

        if concept1_id in concept_synonyms and concept2_id in concept_synonyms:
            concept1_synonym = random.choice(concept_synonyms[concept1_id])
            concept2_synonym = random.choice(concept_synonyms[concept2_id])

            sentence = f"{concept1_synonym}, {relationship_type}, {concept2_synonym}"
            generated_sentences.append(sentence)

# Write generated sentences to output file
with open(output_file, 'w', encoding='utf-8') as out_file:
    for sentence in generated_sentences:
        out_file.write(f"{sentence}\n")

print(f"Generated contraindication sentences written to {output_file}")


Generated contraindication sentences written to contraindication_sentence.txt


In [43]:
notes :  - tester les autres relations  de Jack
          -voir les autres liens de jack (les databases ? ) qui sont sensés fournir plus de détails
         

SyntaxError: invalid syntax (Temp/ipykernel_10940/1360469868.py, line 1)