# Filter the MRCONSO_ENG.txt file, keeping only concepts whose type in MRSTY.RRF is T121 or T200 => filtered_MRCONSO_ENG.txt

In [2]:
# Input and output locations
mrsty_file = 'MRSTY.RRF'
mrconso_file = 'MRCONSO_ENG.txt'
output_file = 'filtered_MRCONSO_ENG.txt'

# Step 1: map each concept ID (field 0 of MRSTY.RRF) to its type code (field 1),
# keeping only concepts whose type is one of the desired ones
desired_types = {'T121', 'T200'}
concept_types = {}

with open(mrsty_file, 'r', encoding='utf-8') as sty_rows:
    for row in sty_rows:
        columns = row.strip().split('|')
        cui, type_code = columns[0], columns[1]
        if type_code not in desired_types:
            continue
        concept_types[cui] = type_code

# Step 2: copy through, unchanged, every MRCONSO_ENG.txt row whose concept ID
# (field 0) was retained in step 1
with open(mrconso_file, 'r', encoding='utf-8') as source, \
        open(output_file, 'w', encoding='utf-8') as sink:
    sink.writelines(
        row for row in source
        if row.strip().split('|')[0] in concept_types
    )

print(f"Filtered entries have been written to {output_file}")


Filtered entries have been written to filtered_MRCONSO_ENG.txt


In [3]:
# Tally the lines of the output file one at a time (the file is never held in memory)
line_count = 0
with open(output_file, 'r', encoding='utf-8') as handle:
    for line_count, _row in enumerate(handle, start=1):
        pass

print(f"Number of lines in the output file: {line_count}")

Number of lines in the output file: 1007713


In [4]:
# Preview the first three lines of the output file; '...' is printed after
# each of the first two lines read
with open(output_file, 'r', encoding='utf-8') as preview:
    for index, row in enumerate(preview):
        print(row.strip())
        if index < 2:
            print('...')
        else:
            break
    

        

C0000005|ENG|P|L0000005|PF|S0007492|Y|A26634265||M0019694|D012711|MSH|PEP|D012711|(131)I-Macroaggregated Albumin|0|N|256|
...
C0000005|ENG|S|L0270109|PF|S0007491|Y|A26634266||M0019694|D012711|MSH|ET|D012711|(131)I-MAA|0|N|256|
...
C0000039|ENG|P|L0000039|PF|S17175117|N|A28315139|9194921|1926948||RXNORM|IN|1926948|1,2-dipalmitoylphosphatidylcholine|0|N|256|


# Build a dictionary that maps each concept to all of its atom IDs (AUIs) from filtered_MRCONSO_ENG.txt => concept_aui_dict.txt
Dictionary format: concept -> list of AUIs, e.g. C000011111 [A79809U9, A1678...]


In [5]:
# Source file and destination for the concept -> AUI mapping
filtered_mrconso_file = 'filtered_MRCONSO_ENG.txt'
output_dict_file = 'concept_aui_dict.txt'

# Maps each concept ID to the list of AUIs found on its rows, in file order
concept_aui_dict = {}

# Each row contributes one (CUI, AUI) pair: CUI is field 0, AUI is field 7
with open(filtered_mrconso_file, 'r', encoding='utf-8') as source:
    for row in source:
        columns = row.strip().split('|')
        cui, atom_id = columns[0], columns[7]
        concept_aui_dict.setdefault(cui, []).append(atom_id)

# Persist the mapping for future use: one "CUI: AUI, AUI, ..." line per concept
with open(output_dict_file, 'w', encoding='utf-8') as sink:
    sink.writelines(
        f"{cui}: {', '.join(atom_ids)}\n"
        for cui, atom_ids in concept_aui_dict.items()
    )

print(f"Concept-AUI dictionary has been written to {output_dict_file}")


Concept-AUI dictionary has been written to concept_aui_dict.txt


In [9]:
# Preview the first three entries of the concept-AUI dictionary file;
# '...' is printed after each of the first two entries read
with open(output_dict_file, 'r', encoding='utf-8') as preview:
    for index, entry in enumerate(preview):
        print(entry.strip())
        if index < 2:
            print('...')
        else:
            break

# Tally the entries (one per line) in the concept-AUI dictionary file
line_count = 0
with open(output_dict_file, 'r', encoding='utf-8') as entries:
    for line_count, _entry in enumerate(entries, start=1):
        pass

print(f"Number of entries in the concept-AUI dictionary: {line_count}")
        
        

C0000005: A26634265, A26634266
...
C0000039: A28315139, A28572604, A0016515, A1317708, A26674543, A1317687, A26661070, A1317707, A0049238, A18399186, A23513030, A32917594, A26596366, A26607235, A26604498, A0528280, A26631676, A0100864
...
C0000096: A0526764, A0526362, A26647665, A0528180, A26666443, A26658464, A0527273
Number of entries in the concept-AUI dictionary: 326214


There are 326214 unique concepts

# TAKE THE LIST OF RELATIONSHIPS : 
induced_by
contraindicated_mechanism_of_action_of
has_risk_factor
may_be_prevented_by
risk_factor_of
may_prevent
has_pharmacokinetics
enzyme_metabolizes_chemical_or_drug
used_by
induces
has_contraindicated_mechanism_of_action
chemical_or_drug_plays_role_in_biological_process
has_related_factor
time_modifier_of
negatively_regulates
modified_by
associated_with
has_contraindicated_drug
uses_substance
has_excluded_associated_finding
chemical_or_drug_is_metabolized_by_enzyme
modifies
has_contraindicated_class
effect_may_be_inhibited_by
clinically_associated_with
has_contraindicated_physiologic_effect
related_to
substance_used_by
may_inhibit_effect_of
positively_regulates
is_object_guidance_for
uses
contraindicated_physiologic_effect_of

We will find all concept-to-concept relations from the list above in which both concepts appear in filtered_MRCONSO_ENG.txt => matched_concept_pairs.txt

In [11]:
# Define the file paths
mrrel_file = 'MRREL.RRF'
filtered_mrconso_file = 'filtered_MRCONSO_ENG.txt'
output_pairs_file = 'matched_concept_pairs.txt'

# Relationship labels to keep. These are matched against field 7 of MRREL.RRF,
# which is the RELA column (additional relationship label), not REL (field 3).
specified_relationships = {
    'induced_by',
    'contraindicated_mechanism_of_action_of',
    'has_risk_factor',
    'may_be_prevented_by',
    'risk_factor_of',
    'may_prevent',
    'has_pharmacokinetics',
    'enzyme_metabolizes_chemical_or_drug',
    'used_by',
    'induces',
    'has_contraindicated_mechanism_of_action',
    'chemical_or_drug_plays_role_in_biological_process',
    'has_related_factor',
    'time_modifier_of',
    'negatively_regulates',
    'modified_by',
    'associated_with',
    'has_contraindicated_drug',
    'uses_substance',
    'has_excluded_associated_finding',
    'chemical_or_drug_is_metabolized_by_enzyme',
    'modifies',
    'has_contraindicated_class',
    'effect_may_be_inhibited_by',
    'clinically_associated_with',
    'has_contraindicated_physiologic_effect',
    'related_to',
    'substance_used_by',
    'may_inhibit_effect_of',
    'positively_regulates',
    'is_object_guidance_for',
    'uses',
    'contraindicated_physiologic_effect_of'
}

# Load the unique concept IDs (field 0) from the filtered MRCONSO file into a set
with open(filtered_mrconso_file, 'r', encoding='utf-8') as infile:
    unique_concepts = {line.strip().split('|')[0] for line in infile}

# Distinct (CUI1, CUI2, relationship) triples; the set removes duplicate rows
matched_pairs = set()

# Read the MRREL.RRF file and keep the rows that carry one of the specified
# relationship labels and whose two concepts are both in the filtered set
with open(mrrel_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        fields = line.strip().split('|')
        relationship = fields[7]  # RELA

        # Test the label first so non-matching rows skip the concept look-ups
        if relationship not in specified_relationships:
            continue

        concept1 = fields[0]  # CUI1
        concept2 = fields[4]  # CUI2
        if concept1 in unique_concepts and concept2 in unique_concepts:
            matched_pairs.add((concept1, concept2, relationship))

# Write the matched pairs in sorted order: plain set iteration order varies
# between runs (string hash randomization), which made the output file and
# every downstream listing non-reproducible.
with open(output_pairs_file, 'w', encoding='utf-8') as outfile:
    for concept1, concept2, relationship in sorted(matched_pairs):
        outfile.write(f"{concept1}|{concept2}|{relationship}\n")

print(f"Matched concept pairs have been written to {output_pairs_file}")


Matched concept pairs have been written to matched_concept_pairs.txt


In [13]:
# Tally the lines of the matched-pairs file one at a time
line_count = 0
with open(output_pairs_file, 'r', encoding='utf-8') as pairs:
    for _pair_row in pairs:
        line_count += 1

print(f"Number of lines in the output file: {line_count}")

Number of lines in the output file: 2465


In [14]:
# Tally how many matched concept pairs carry each relationship label
# (the label is the third '|'-separated field of every row)
relationship_counts = {}
with open(output_pairs_file, 'r', encoding='utf-8') as pairs:
    for row in pairs:
        label = row.strip().split('|')[2]
        relationship_counts[label] = relationship_counts.get(label, 0) + 1

print("Relationship counts:")
for label, total in relationship_counts.items():
    print(f"{label}: {total}")
    
    

Relationship counts:
associated_with: 856
has_contraindicated_class: 1507
related_to: 50
has_contraindicated_drug: 24
uses_substance: 10
substance_used_by: 10
may_be_prevented_by: 2
may_inhibit_effect_of: 2
effect_may_be_inhibited_by: 2
may_prevent: 2


## Now the atom-to-atom relations

In [15]:
# Define the file paths
mrrel_file = 'MRREL.RRF'
filtered_mrconso_file = 'filtered_MRCONSO_ENG.txt'
output_atoms_file = 'atom_to_atom_relationships.txt'

# Relationship labels to keep. These are matched against field 7 of MRREL.RRF,
# which is the RELA column (additional relationship label), not REL (field 3).
specified_relationships = {
    'induced_by',
    'contraindicated_mechanism_of_action_of',
    'has_risk_factor',
    'may_be_prevented_by',
    'risk_factor_of',
    'may_prevent',
    'has_pharmacokinetics',
    'enzyme_metabolizes_chemical_or_drug',
    'used_by',
    'induces',
    'has_contraindicated_mechanism_of_action',
    'chemical_or_drug_plays_role_in_biological_process',
    'has_related_factor',
    'time_modifier_of',
    'negatively_regulates',
    'modified_by',
    'associated_with',
    'has_contraindicated_drug',
    'uses_substance',
    'has_excluded_associated_finding',
    'chemical_or_drug_is_metabolized_by_enzyme',
    'modifies',
    'has_contraindicated_class',
    'effect_may_be_inhibited_by',
    'clinically_associated_with',
    'has_contraindicated_physiologic_effect',
    'related_to',
    'substance_used_by',
    'may_inhibit_effect_of',
    'positively_regulates',
    'is_object_guidance_for',
    'uses',
    'contraindicated_physiologic_effect_of'
}

# Map each concept ID to the list of its AUIs, in file order
concept_aui_dict = {}

# Read the filtered MRCONSO_ENG.txt file and populate the dictionary
with open(filtered_mrconso_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        fields = line.strip().split('|')
        concept_id = fields[0]  # CUI is at index 0
        aui = fields[7]         # AUI is at index 7
        concept_aui_dict.setdefault(concept_id, []).append(aui)

# Create a set of all AUIs for quick look-up
all_auis = {aui for auis in concept_aui_dict.values() for aui in auis}

# Distinct (AUI1, AUI2, relationship) triples; the set removes duplicate rows
matched_atom_pairs = set()

# Read the MRREL.RRF file and keep the rows that carry one of the specified
# relationship labels and whose two atoms both come from the filtered file
with open(mrrel_file, 'r', encoding='utf-8') as infile:
    for line in infile:
        fields = line.strip().split('|')
        relationship = fields[7]  # RELA is at index 7

        # Test the label first so non-matching rows skip the AUI look-ups
        if relationship not in specified_relationships:
            continue

        aui1 = fields[1]  # AUI1 is at index 1
        aui2 = fields[5]  # AUI2 is at index 5
        if aui1 in all_auis and aui2 in all_auis:
            matched_atom_pairs.add((aui1, aui2, relationship))

# Write the matched relationships in sorted order: plain set iteration order
# varies between runs (string hash randomization), which made the output file
# and every downstream listing non-reproducible.
with open(output_atoms_file, 'w', encoding='utf-8') as outfile:
    for aui1, aui2, relationship in sorted(matched_atom_pairs):
        outfile.write(f"{aui1}|{aui2}|{relationship}\n")

print(f"Matched atom-to-atom relationships have been written to {output_atoms_file}")


Matched atom-to-atom relationships have been written to atom_to_atom_relationships.txt


In [16]:
# Tally the lines of the atom-to-atom output file
with open(output_atoms_file, 'r', encoding='utf-8') as atom_rows:
    line_count = sum(1 for _row in atom_rows)

print(f"Number of lines in the output file: {line_count}")

Number of lines in the output file: 2811


In [17]:
# Tally how many atom-to-atom rows carry each relationship label
# (the label is the third '|'-separated field of every row)
relationship_counts = {}
with open(output_atoms_file, 'r', encoding='utf-8') as atom_rows:
    for row in atom_rows:
        label = row.strip().split('|')[2]
        try:
            relationship_counts[label] += 1
        except KeyError:
            # First time this label is seen
            relationship_counts[label] = 1

print("Relationship counts:")

for label, total in relationship_counts.items():
    print(f"{label}: {total}")
    


Relationship counts:
has_contraindicated_class: 1507
associated_with: 1202
related_to: 50
substance_used_by: 10
effect_may_be_inhibited_by: 2
uses_substance: 10
has_contraindicated_drug: 24
may_prevent: 2
may_be_prevented_by: 2
may_inhibit_effect_of: 2
