In [22]:
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [23]:
def load_rxnconso(file_path):
    """
    Load the RXNCONSO.RRF file into a pandas DataFrame.

    Args:
    file_path (str): The path to the RXNCONSO.RRF file.

    Returns:
    pd.DataFrame: A DataFrame containing the RXNCONSO data.
    """
    column_names = [
        "RXCUI",
        "LAT",
        "TS",
        "LUI",
        "STT",
        "SUI",
        "ISPREF",
        "RXAUI",
        "SAUI",
        "SCUI",
        "SDUI",
        "SAB",
        "TTY",
        "CODE",
        "STR",
        "SRL",
        "SUPPRESS",
        "CVF",
    ]
    df = pd.read_csv(file_path, sep="|", names=column_names, index_col=False)
    return df

In [24]:
# Load the data
file_path = "data/RXNCONSO.RRF"
rxnconso_df = load_rxnconso(file_path)

rxnconso_df.head()

  df = pd.read_csv(file_path, sep="|", names=column_names, index_col=False)


Unnamed: 0,RXCUI,LAT,TS,LUI,STT,SUI,ISPREF,RXAUI,SAUI,SCUI,SDUI,SAB,TTY,CODE,STR,SRL,SUPPRESS,CVF
0,3,ENG,,,,,,8717795,,58488005,,SNOMEDCT_US,PT,58488005,"1,4-alpha-Glucan branching enzyme",,N,
1,3,ENG,,,,,,8717796,,58488005,,SNOMEDCT_US,FN,58488005,"1,4-alpha-Glucan branching enzyme (substance)",,N,
2,3,ENG,,,,,,8717808,,58488005,,SNOMEDCT_US,SY,58488005,"Amylo-(1,4,6)-transglycosylase",,N,
3,3,ENG,,,,,,8718164,,58488005,,SNOMEDCT_US,SY,58488005,Branching enzyme,,N,
4,19,ENG,,,,,,10794494,,112116001,,SNOMEDCT_US,SY,112116001,17-hydrocorticosteroid,,N,


In [25]:
# print number of unique RxCUI values, brand names and generic names
print("Number of unique RxCUI values:", rxnconso_df["RXCUI"].nunique())
print(
    "Number of unique brand names:",
    rxnconso_df[rxnconso_df["TTY"] == "BN"]["STR"].nunique(),
)
print(
    "Number of unique generic names:",
    rxnconso_df[rxnconso_df["TTY"] == "IN"]["STR"].nunique(),
)

Number of unique RxCUI values: 392422
Number of unique brand names: 29923
Number of unique generic names: 53153


#### Term types information

Click the drop down toggle below for more information on the term types used in this task.

<details>
<summary>Click to expand</summary>
The unique term types (TTY) in the RxNorm database represent various types of names and identifiers for drugs and related concepts. Here is an explanation of what each abbreviation means:

- **Preferred Term**: The main name for a concept, which is usually the most commonly used name.
- **FN - Fully Specified Name**: A complete and detailed name for a concept.
- **SY - Synonym**: An alternative name for a concept.
- **BN - Brand Name**: The proprietary name given by a manufacturer.
- **IN - Ingredient Name**: The generic name for a drug ingredient.
- **SU - Substance**: A more general term than an ingredient, often used for chemicals and compounds.
- **GN - Generic Name**: The non-proprietary name assigned to a drug or drug component.
- **FSY - Formulated Synonym**: A synonym specific to a particular formulation.
- **MS - Multum Specific Drug Name**: A name specific to Multum, a drug database.
- **PIN - Precise Ingredient**: A more specific term for a particular ingredient.
- **TMSY - Tall Man Lettering Synonym**: A synonym that uses Tall Man lettering to reduce confusion with similar drug names.
- **SYGB - Synonym Generated by Algorithm**: A synonym generated by an algorithm.
- **RXN_PT - RxNorm Preferred Term**: A preferred term specific to RxNorm.
- **DF - Dose Form**: The form in which the drug is administered, such as tablet, capsule, etc.
- **PTGB - Preferred Term Generated by Algorithm**: A preferred term generated by an algorithm.
- **RXN_IN - RxNorm Ingredient Name**: An ingredient name specific to RxNorm.
- **CDA - Clinical Drug Alias**: An alias for a clinical drug.
- **CDC - Clinical Drug Component**: A component of a clinical drug.
- **CDD - Clinical Drug Description**: A description of a clinical drug.
- **SC - Semantic Clinical Drug Component**: A clinical drug component with semantic information.
- **AB - Abbreviation**: A short form or abbreviation for a concept.
- **CD - Clinical Drug**: A drug concept used in a clinical setting.
- **ET - Entry Term**: A term used for entry in a database.
- **BD - Brand Drug**: A branded version of a drug.
- **MIN - Multiple Ingredient**: A drug that contains multiple ingredients.
- **SCDF - Semantic Clinical Dose Form**: The dose form of a clinical drug with semantic information.
- **SBDF - Semantic Branded Dose Form**: The dose form of a branded drug with semantic information.
- **SCD - Semantic Clinical Drug**: A clinical drug concept with semantic information.
- **DP - Display Name**: A name used for display purposes.
- **PSN - Precise Synonym**: A synonym that is more precise than a general synonym.
- **SBD - Semantic Branded Drug**: A branded drug concept with semantic information.
- **SCDC - Semantic Clinical Drug Component**: A clinical drug component with detailed semantic information.
- **MTH_RXN_DP - Metathesaurus RxNorm Display Name**: A display name from the Metathesaurus specific to RxNorm.
- **MTH_RXN_CD - Metathesaurus RxNorm Clinical Drug**: A clinical drug concept from the Metathesaurus specific to RxNorm.
- **MTH_RXN_BD - Metathesaurus RxNorm Brand Drug**: A brand drug concept from the Metathesaurus specific to RxNorm.
- **MTH_RXN_CDC - Metathesaurus RxNorm Clinical Drug Component**: A clinical drug component from the Metathesaurus specific to RxNorm.
- **SBDC - Semantic Branded Drug Component**: A branded drug component with detailed semantic information.
- **GPCK - Generic Pack**: A generic version of a pack (a combination of drugs or components).
- **BPCK - Branded Pack**: A branded version of a pack (a combination of drugs or components).
- **DFG - Dose Form Group**: A group of dose forms.
- **SCDG - Semantic Clinical Dose Group**: A dose form group with semantic information specific to clinical drugs.
- **SBDG - Semantic Branded Dose Group**: A dose form group with semantic information specific to branded drugs.
- **SCDFP - Semantic Clinical Dose Form Precise**: A precise dose form of a clinical drug with semantic information.
- **SBDFP - Semantic Branded Dose Form Precise**: A precise dose form of a branded drug with semantic information.
- **SCDGP - Semantic Clinical**
</details>


In [26]:
# print unique Term Types
print("Unique Term Types:", rxnconso_df["TTY"].unique())

# for each term type, print number of unique strings
for term_type in rxnconso_df["TTY"].unique():
    print(
        f"Number of unique strings for term type '{term_type}':",
        rxnconso_df[rxnconso_df["TTY"] == term_type]["STR"].nunique(),
    )

Unique Term Types: ['PT' 'FN' 'SY' 'BN' 'IN' 'SU' 'GN' 'FSY' 'MS' 'PIN' 'TMSY' 'SYGB'
 'RXN_PT' 'DF' 'PTGB' 'RXN_IN' 'CDA' 'CDC' 'CDD' 'SC' 'AB' 'CD' 'ET' 'BD'
 'MIN' 'SCDF' 'SBDF' 'SCD' 'DP' 'PSN' 'SBD' 'SCDC' 'MTH_RXN_DP'
 'MTH_RXN_CD' 'MTH_RXN_BD' 'MTH_RXN_CDC' 'SBDC' 'GPCK' 'BPCK' 'DFG' 'SCDG'
 'SBDG' 'SCDFP' 'SBDFP' 'SCDGP']
Number of unique strings for term type 'PT': 47174
Number of unique strings for term type 'FN': 45228
Number of unique strings for term type 'SY': 102356
Number of unique strings for term type 'BN': 29923
Number of unique strings for term type 'IN': 53153
Number of unique strings for term type 'SU': 22799
Number of unique strings for term type 'GN': 4093
Number of unique strings for term type 'FSY': 4385
Number of unique strings for term type 'MS': 15073
Number of unique strings for term type 'PIN': 3467
Number of unique strings for term type 'TMSY': 26770
Number of unique strings for term type 'SYGB': 736
Number of unique strings for term type 'RXN_PT': 979
N

In [27]:
def pair_brand_generic_names(df):
    """
    Pair brand and generic names from the RXNCONSO DataFrame.

    Args:
    df (pd.DataFrame): The DataFrame containing RXNCONSO data.

    Returns:
    pd.DataFrame: A DataFrame with paired brand and generic names.
    """
    # Filter out relevant TTY (Term Type) for brand and generic names
    brand_names = df[df["TTY"] == "BN"].copy()  # BN: Brand Name
    generic_names = df[
        df["TTY"].isin(["IN", "PT"])
    ].copy()  # IN: Ingredient Name, PT: Preferred Term

    # Normalize strings for comparison
    brand_names["BrandName_normalized"] = brand_names["STR"].str.strip().str.lower()
    generic_names["GenericName_normalized"] = (
        generic_names["STR"].str.strip().str.lower()
    )

    # Ensure unique generic names per RxCUI, prioritizing PT over IN
    generic_names.sort_values(by=["RXCUI", "TTY"], inplace=True)
    generic_names_unique = generic_names.drop_duplicates(subset=["RXCUI"], keep="first")

    # Merge brand and generic names on RXCUI
    paired_names = pd.merge(
        brand_names, generic_names_unique, on="RXCUI", suffixes=("_brand", "_generic")
    )

    # Keep only the necessary columns
    paired_names = paired_names[["RXCUI", "STR_brand", "STR_generic"]]
    paired_names.columns = ["rxcui", "brand", "generic"]

    # Remove rows where normalized BrandName and GenericName are identical
    paired_names = paired_names[
        paired_names["brand"].str.strip().str.lower()
        != paired_names["generic"].str.strip().str.lower()
    ]
    # Remove rows where the brand name contains the generic name
    paired_names = paired_names[
        ~paired_names.apply(
            lambda row: row["generic"].strip().lower() in row["brand"].strip().lower(),
            axis=1,
        )
    ]

    # confirm that the brand and generic names are lowercase
    paired_names["brand"] = paired_names["brand"].str.lower()
    paired_names["generic"] = paired_names["generic"].str.lower()

    return paired_names

In [28]:
# Pair brand and generic names
paired_names = pair_brand_generic_names(rxnconso_df)
print(paired_names.tail(10))

        rxcui                                              brand  \
2134  1997443                    cytomegalovirus immune globulin   
2135  2001357                      acetaminophen-benzhydrocodone   
2136  2004040            insulin aspart-insulin aspart protamine   
2140  2064610  ethinyl estradiol-levonorgestrel with ferrous ...   
2149  2467748                water for injection, bacteriostatic   
2151  2467748                       sterile diluent for berinert   
2152  2467748                       water for injection, sterile   
2156  2535748                             ezetimibe-rosuvastatin   
2158  2557234                          casirivimab and imdevimab   
2159  2586016                                      ga 68 psma-11   

                                                generic  
2134              human cytomegalovirus immune globulin  
2135                      acetaminophen/benzhydrocodone  
2136  insulin aspart protamine- and insulin aspart-c...  
2140  ethinyl estra

In [29]:
# print number of unique RxCUI values, brand names and generic names
print("Number of unique RxCUI values:", paired_names["rxcui"].nunique())
print("Number of unique brand names:", paired_names["brand"].nunique())
print("Number of unique generic names:", paired_names["generic"].nunique())

Number of unique RxCUI values: 467
Number of unique brand names: 532
Number of unique generic names: 467


In [30]:
# Check if any RxCUI has more than one unique generic name
for rxcui, group in paired_names.groupby("rxcui"):
    if len(group["generic"].unique()) > 1:
        print(f"RxCUI {rxcui} has multiple generic names:")
        print(group)
        print()

In [31]:
# Check if any RxCUI has more than one unique brand name
for rxcui, group in paired_names.groupby("rxcui"):
    if len(group["brand"].unique()) > 1:
        print(f"RxCUI {rxcui} has multiple brand names:")
        print(group)
        print()

RxCUI 161 has multiple brand names:
   rxcui            brand        generic
6    161  apap (obsolete)  acetaminophen
8    161      paracetamol  acetaminophen

RxCUI 2418 has multiple brand names:
     rxcui       brand          generic
122   2418          d3  cholecalciferol
124   2418  vitamin d3  cholecalciferol

RxCUI 3143 has multiple brand names:
     rxcui                   brand     generic
164   3143  dehydroepiandrosterone  prasterone
165   3143                    dhea  prasterone

RxCUI 4018 has multiple brand names:
     rxcui                  brand         generic
207   4018             calciferol  ergocalciferol
209   4018  vitamin d2 (obsolete)  ergocalciferol
210   4018             vitamin d2  ergocalciferol

RxCUI 5499 has multiple brand names:
     rxcui                 brand            generic
282   5499  peroxide d'hydrogene  hydrogen peroxide
283   5499  peroxyde d'hydrogene  hydrogen peroxide

RxCUI 6205 has multiple brand names:
     rxcui                     bra

In [32]:
# remove any row where name contains obsolete
paired_names = paired_names[~paired_names["brand"].str.contains("obsolete")]
paired_names = paired_names[~paired_names["generic"].str.contains("obsolete")]

In [33]:
# print number of unique RxCUI values, brand names and generic names
print("Number of unique RxCUI values:", paired_names["rxcui"].nunique())
print("Number of unique brand names:", paired_names["brand"].nunique())
print("Number of unique generic names:", paired_names["generic"].nunique())

Number of unique RxCUI values: 464
Number of unique brand names: 524
Number of unique generic names: 464


In [34]:
paired_names.head(100)

Unnamed: 0,rxcui,brand,generic
1,74,paba,aminobenzoic acid
2,94,5-htp,oxitriptan
8,161,paracetamol,acetaminophen
16,272,active carbon,activated charcoal
22,376,corticotropin,"corticotropin, repository"
51,1151,vitamin c,ascorbic acid
52,1191,acetylsalicylic acid,aspirin
54,1199,astringent,astringents
72,1406,benzoin,benzoin resin
83,1754,bromide,bromides


In [35]:
# Save the paired names to a CSV file
paired_names.to_csv("paired_brand_generic_names.csv", index=False)