In [1]:
import pandas as pd

## Drugs Datasets

### Load Datasets

#### Wikidata Dataset

In [2]:
drugs1_df = pd.read_csv("Wikidata/drugs.tsv", delimiter="\t")

# FORMAT
drugs1_df.Name = drugs1_df.Name.str.lower().str.strip()

drugs1_df.drop_duplicates(subset=["DrugBank ID", "Name"], inplace=True)

drugs1_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1542 entries, 0 to 1545
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   DrugBank ID    1542 non-null   object
 1   Name           1542 non-null   object
 2   SMILES         1308 non-null   object
 3   InChI          1308 non-null   object
 4   Generic Names  1308 non-null   object
 5   Wikidata URL   1542 non-null   object
dtypes: object(6)
memory usage: 84.3+ KB


In [3]:
drugs1_df.nunique().compare(drugs1_df.count(), keep_equal=True, result_names=("nunique", "count"))

Unnamed: 0,nunique,count
DrugBank ID,1540,1542
SMILES,1286,1308


In [4]:
drugs1_df[drugs1_df["DrugBank ID"].duplicated(keep=False)].groupby("DrugBank ID").apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,DrugBank ID,Name,SMILES,InChI,Generic Names,Wikidata URL
DrugBank ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
727,665,727,pharmaceutical preparation of nitroglycerin,,,,http://www.wikidata.org/entity/Q3221044
727,986,727,nitroglycerin,C(C(CO[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-],InChI=1S/C3H5N3O9/c7-4(8)13-1-3(15-6(11)12)2-1...,NG ; Nitroglycerin ; Nitrostat ; glyceryl trin...,http://www.wikidata.org/entity/Q162867
1356,276,1356,lithium,,,,http://www.wikidata.org/entity/Q568
1356,1065,1356,lithium compounds,,,,http://www.wikidata.org/entity/Q152763


In [5]:
drugs1_df[drugs1_df["Name"].duplicated(keep=False)].groupby("Name").apply(lambda x: x)

Unnamed: 0_level_0,DrugBank ID,Name,SMILES,InChI,Generic Names,Wikidata URL
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [6]:
drugs1_df_duplicated_ID = drugs1_df[drugs1_df["DrugBank ID"].duplicated(keep=False)].groupby("DrugBank ID").agg({"Name": lambda x: list(x.astype(str)), "Generic Names": lambda x: list(x.astype(str))})
drugs1_df_duplicated_ID["Generic Names"] = drugs1_df_duplicated_ID["Generic Names"].add(drugs1_df_duplicated_ID.Name).apply(lambda x :" ; ".join(x))
drugs1_df_duplicated_ID = drugs1_df_duplicated_ID.explode("Name")

mapping_dict = drugs1_df_duplicated_ID.set_index("Name")["Generic Names"].to_dict()
drugs1_df["Generic Names"] = drugs1_df.Name.map(mapping_dict).fillna(drugs1_df["Generic Names"])
drugs1_df[drugs1_df["DrugBank ID"].duplicated(keep=False)].groupby("DrugBank ID").apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,DrugBank ID,Name,SMILES,InChI,Generic Names,Wikidata URL
DrugBank ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
727,665,727,pharmaceutical preparation of nitroglycerin,,,nan ; NG ; Nitroglycerin ; Nitrostat ; glycery...,http://www.wikidata.org/entity/Q3221044
727,986,727,nitroglycerin,C(C(CO[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-],InChI=1S/C3H5N3O9/c7-4(8)13-1-3(15-6(11)12)2-1...,nan ; NG ; Nitroglycerin ; Nitrostat ; glycery...,http://www.wikidata.org/entity/Q162867
1356,276,1356,lithium,,,nan ; nan ; lithium ; lithium compounds,http://www.wikidata.org/entity/Q568
1356,1065,1356,lithium compounds,,,nan ; nan ; lithium ; lithium compounds,http://www.wikidata.org/entity/Q152763


In [7]:
# REMOVE DUPLICATES
drugs1_df.drop_duplicates(subset=["DrugBank ID"], inplace=True)
drugs1_df.drop_duplicates(subset=["Name"], inplace=True)

drugs1_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1540 entries, 0 to 1545
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   DrugBank ID    1540 non-null   object
 1   Name           1540 non-null   object
 2   SMILES         1307 non-null   object
 3   InChI          1307 non-null   object
 4   Generic Names  1309 non-null   object
 5   Wikidata URL   1540 non-null   object
dtypes: object(6)
memory usage: 84.2+ KB


#### PharmGKB Dataset

In [8]:
drugs2_df = pd.read_csv("PharmGKB/primary_data/drugs/drugs.tsv", delimiter="\t")

# FORMAT
drugs2_df.Name = drugs2_df.Name.str.lower().str.strip()

drugs2_df.drop_duplicates(subset=["PharmGKB Accession Id", "Name"], inplace=True)

drugs2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3550 entries, 0 to 3549
Data columns (total 24 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   PharmGKB Accession Id             3550 non-null   object
 1   Name                              3550 non-null   object
 2   Generic Names                     1574 non-null   object
 3   Trade Names                       1670 non-null   object
 4   Brand Mixtures                    254 non-null    object
 5   Type                              3550 non-null   object
 6   Cross-references                  2138 non-null   object
 7   SMILES                            1681 non-null   object
 8   InChI                             1681 non-null   object
 9   Dosing Guideline                  3550 non-null   object
 10  External Vocabulary               3055 non-null   object
 11  Clinical Annotation Count         3550 non-null   int64 
 12  Variant Annotation C

In [9]:
drugs2_df.nunique().compare(drugs2_df.count(), keep_equal=True, result_names=("nunique", "count"))

Unnamed: 0,nunique,count
Generic Names,1562,1574
Trade Names,1669,1670
Brand Mixtures,247,254
Type,8,3550
SMILES,1672,1681
InChI,1672,1681
Dosing Guideline,2,3550
External Vocabulary,3005,3055
Clinical Annotation Count,138,3550
Variant Annotation Count,273,3550


In [10]:
# REMOVE DUPLICATES
drugs2_df.drop_duplicates(subset=["PharmGKB Accession Id"], inplace=True)
drugs2_df.drop_duplicates(subset=["Name"], inplace=True)

drugs2_df.head()

Unnamed: 0,PharmGKB Accession Id,Name,Generic Names,Trade Names,Brand Mixtures,Type,Cross-references,SMILES,InChI,Dosing Guideline,...,VIP Count,Dosing Guideline Sources,Top Clinical Annotation Level,Top FDA Label Testing Level,Top Any Drug Label Testing Level,Label Has Dosing Info,Has Rx Annotation,RxNorm Identifiers,ATC Identifiers,PubChem Compound Identifiers
0,PA166238901,17-alpha-dihydroequilenin sulfate,,,,Drug,PubChem Compound:23671798,,,No,...,0,,,,,,,,,23671798
1,PA166238881,17-alpha-dihydroequilin,,,,Drug,PubChem Compound:9547222,,,No,...,0,,,,,,,,,9547222
2,PA166238883,17-alpha-dihydroequilin sulfate,,,,Drug,PubChem Compound:23671797,,,No,...,0,,,,,,,,,23671797
3,PA166238882,17-alpha-estradiol sulfate,,,,Drug,PubChem Compound:23671799,,,No,...,0,,,,,,,,,23671799
4,PA166238921,17-beta-dihydroequilenin sulfate,,,,Drug,PubChem Compound:71316159,,,No,...,0,,,,,,,,,71316159


### Merge Datasets

In [11]:
def format_semicolon_as_separator(names):
    if pd.isna(names): return
    old_str = '", ' if names.startswith('"') and names.endswith('"') else ', ' 
    return names.replace(old_str, " ; ").replace('"', '')

#### 1<sup>st</sup> Strategy

In [12]:
all_drugs_df = pd.merge(
    drugs1_df, drugs2_df, on="Name", how="outer", indicator=True, suffixes=["_x", ""]
)

all_drugs_df["Alternate_Names"] = (
    all_drugs_df["Generic Names"]
    .apply(format_semicolon_as_separator)
    .fillna(all_drugs_df["Generic Names_x"])
    .add(" ; ")
    .fillna("")
    .add(all_drugs_df.Name)
    .str.strip()
    .str.replace('"', '')
    .str.lower()
)

all_drugs_df.drop(
    columns=all_drugs_df.columns.difference(
        ["DrugBank ID", "PharmGKB Accession Id", "Name", "Alternate_Names", "_merge"]
    ),
    inplace=True,
)

all_drugs_df._merge.value_counts()

_merge
right_only    2413
both          1137
left_only      403
Name: count, dtype: int64

#### 2<sup>nd</sup> Strategy

In [13]:
all_drugs_df_exploded_not_both: dict[str, pd.DataFrame] = {
    group_name: group_data
    for group_name, group_data in all_drugs_df.groupby("_merge")
    if group_name != "both"
}

columns = {
    "left_only": ["Name", "DrugBank ID", "Alternate_Names"],
    "right_only": ["Name", "PharmGKB Accession Id", "Alternate_Names"],
}



for group_name, group_data in all_drugs_df_exploded_not_both.items():
    group_data.Alternate_Names = group_data.Alternate_Names.str.split(" ; ")
    group_data = group_data.explode("Alternate_Names")

    group_data.Alternate_Names = group_data.Alternate_Names.str.strip()

    group_data.drop_duplicates(subset=["Alternate_Names"], inplace=True)
    group_data.drop(
        columns=all_drugs_df.columns.difference(columns[group_name]), inplace=True
    )
    
    all_drugs_df_exploded_not_both[group_name] = group_data

all_drugs_df_new_both = pd.merge(
    all_drugs_df_exploded_not_both["left_only"],
    all_drugs_df_exploded_not_both["right_only"],
    on="Alternate_Names",
    how="inner",
    suffixes=["_x", ""],
)

all_drugs_df_new_both["_merge"] = "both"

all_drugs_df_new_both = (
    all_drugs_df_new_both.groupby(["Name", "Name_x"])
    .agg(
        {
            "PharmGKB Accession Id": lambda x: list(x)[0],
            "DrugBank ID": lambda x: list(x)[0],
            "Alternate_Names": lambda x: " ; ".join(str(name) for name in set(x)),
            "_merge": lambda x: list(x)[0],
        }
    )
    .reset_index()
)

all_drugs_df_new_both.drop_duplicates("Name", keep=False, inplace=True)
all_drugs_df_new_both.drop_duplicates("Name_x", keep=False, inplace=True)

all_drugs_df_new_both.nunique()

Name                     103
Name_x                   103
PharmGKB Accession Id    103
DrugBank ID              103
Alternate_Names          103
_merge                     1
dtype: int64

In [14]:
all_drugs_df["Name_x"] = all_drugs_df.Name

all_drugs_df = pd.concat([all_drugs_df, all_drugs_df_new_both])
all_drugs_df.sort_values("_merge", inplace=True)

all_drugs_df.drop_duplicates(subset=["Name"], inplace=True)
all_drugs_df.drop_duplicates(subset=["Name_x"], inplace=True)

all_drugs_df._merge.value_counts()

_merge
right_only    2310
both          1240
left_only      300
Name: count, dtype: int64

In [15]:
all_drugs_df.drop(columns=all_drugs_df.columns.difference(["DrugBank ID", "PharmGKB Accession Id", "Name", "Alternate_Names"]), inplace=True)
all_drugs_df["Id"] = all_drugs_df["PharmGKB Accession Id"].fillna(all_drugs_df["DrugBank ID"])
all_drugs_df.head()

Unnamed: 0,DrugBank ID,Name,PharmGKB Accession Id,Alternate_Names,Id
0,1,lepirudin,PA450195,hirudin variant-1 ; lepirudin,PA450195
1031,1237,bromodiphenhydramine,PA164760854,ambodryl hydrochloride ; amodryl ; bromanautin...,PA164760854
1030,1607,ticarcillin,PA451684,ticarcilina [inn-spanish] ; ticarcillin supple...,PA451684
1029,6707,levonordefrin,PA165958380,(-)-cobefrin ; alpha-methylnoradrenaline ; alp...,PA165958380
1027,351,megestrol,PA450351,mga ; megestrol acetate ; megestrolo [dcit] ; ...,PA450351


## Diseases Datasets

### Load Datasets

 #### Wikidata Dataset

In [16]:
diseases1_df = pd.read_csv("Wikidata/diseases.tsv", delimiter="\t")

# FORMAT
diseases1_df.Name = diseases1_df.Name.str.lower().str.strip()

diseases1_df.drop_duplicates(subset=["Disease Ontology ID", "Name"], inplace=True)

diseases1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 989 entries, 0 to 988
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Disease Ontology ID  989 non-null    object
 1   Name                 989 non-null    object
 2   Wikidata URL         989 non-null    object
dtypes: object(3)
memory usage: 23.3+ KB


In [17]:
diseases1_df.nunique().compare(diseases1_df.count(), keep_equal=True, result_names=("nunique", "count"))

Unnamed: 0,nunique,count
Disease Ontology ID,988,989
Name,983,989
Wikidata URL,983,989


In [18]:
diseases1_df[diseases1_df["Disease Ontology ID"].duplicated(keep=False)].groupby("Disease Ontology ID").apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,Disease Ontology ID,Name,Wikidata URL
Disease Ontology ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DOID:4029,281,DOID:4029,gastritis,http://www.wikidata.org/entity/Q183130
DOID:4029,984,DOID:4029,chronic gastritis,http://www.wikidata.org/entity/Q16958168


In [19]:
diseases1_df[diseases1_df.Name.duplicated(keep=False)].groupby("Name").apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,Disease Ontology ID,Name,Wikidata URL
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cocaine dependence,537,DOID:809,cocaine dependence,http://www.wikidata.org/entity/Q3743188
cocaine dependence,538,DOID:9975,cocaine dependence,http://www.wikidata.org/entity/Q3743188
cryptococcal meningitis,726,DOID:0080159,cryptococcal meningitis,http://www.wikidata.org/entity/Q18967011
cryptococcal meningitis,727,DOID:12052,cryptococcal meningitis,http://www.wikidata.org/entity/Q18967011
dermatomycosis,269,DOID:0050134,dermatomycosis,http://www.wikidata.org/entity/Q3705876
dermatomycosis,270,DOID:1563,dermatomycosis,http://www.wikidata.org/entity/Q3705876
peripheral neuropathy,366,DOID:574,peripheral neuropathy,http://www.wikidata.org/entity/Q945238
peripheral neuropathy,367,DOID:870,peripheral neuropathy,http://www.wikidata.org/entity/Q945238
urinary tract infection,32,DOID:0080784,urinary tract infection,http://www.wikidata.org/entity/Q221668
urinary tract infection,33,DOID:13148,urinary tract infection,http://www.wikidata.org/entity/Q221668


In [20]:
diseases1_df_duplicated_ID = diseases1_df[diseases1_df["Disease Ontology ID"].duplicated(keep=False)].groupby("Disease Ontology ID").agg({"Name": lambda x: list(x.astype(str))})
diseases1_df_duplicated_ID["Alternate Names"] = diseases1_df_duplicated_ID.Name.apply(lambda x :" ; ".join(x))
diseases1_df_duplicated_ID = diseases1_df_duplicated_ID.explode("Name")

mapping_dict = diseases1_df_duplicated_ID.set_index("Name")["Alternate Names"].to_dict()
diseases1_df["Alternate Names"] = diseases1_df.Name.map(mapping_dict)
diseases1_df[diseases1_df["Disease Ontology ID"].duplicated(keep=False)].groupby("Disease Ontology ID").apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,Disease Ontology ID,Name,Wikidata URL,Alternate Names
Disease Ontology ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DOID:4029,281,DOID:4029,gastritis,http://www.wikidata.org/entity/Q183130,gastritis ; chronic gastritis
DOID:4029,984,DOID:4029,chronic gastritis,http://www.wikidata.org/entity/Q16958168,gastritis ; chronic gastritis


In [21]:
diseases1_df_duplicated_Name_first = diseases1_df[diseases1_df.Name.duplicated(keep="last")].copy()

diseases1_df_duplicated_Name_others = diseases1_df[diseases1_df.Name.duplicated(keep=False) & -diseases1_df.Name.duplicated(keep="last")].copy()

diseases1_df_duplicated_Name_others["other"] = diseases1_df_duplicated_Name_others["Disease Ontology ID"]
diseases1_df_duplicated_Name_others.drop(columns=["Disease Ontology ID"], inplace=True)

diseases1_df_duplicated_Name = pd.concat([diseases1_df_duplicated_Name_first, diseases1_df_duplicated_Name_others]).groupby("Name").agg({"Disease Ontology ID": lambda x: list(x)[0], "other": lambda x: list(x)[1:]}).explode("other")
diseases1_df_duplicated_Name

Unnamed: 0_level_0,Disease Ontology ID,other
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
cocaine dependence,DOID:809,DOID:9975
cryptococcal meningitis,DOID:0080159,DOID:12052
dermatomycosis,DOID:0050134,DOID:1563
peripheral neuropathy,DOID:574,DOID:870
urinary tract infection,DOID:0080784,DOID:13148
waldenström macroglobulinemia,DOID:0050747,DOID:0060901


In [22]:
# REMOVE DUPLICATES
diseases1_df.drop_duplicates(subset=["Disease Ontology ID"], inplace=True)
diseases1_df.drop_duplicates(subset=["Name"], inplace=True)

diseases1_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 982 entries, 0 to 988
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Disease Ontology ID  982 non-null    object
 1   Name                 982 non-null    object
 2   Wikidata URL         982 non-null    object
 3   Alternate Names      1 non-null      object
dtypes: object(4)
memory usage: 38.4+ KB


#### PharmGKB Dataset

In [23]:
diseases2_df = pd.read_csv(
    "PharmGKB/primary_data/phenotypes/phenotypes.tsv", delimiter="\t"
)

# Format
diseases2_df.Name = diseases2_df.Name.str.lower().str.strip()

diseases2_df.drop_duplicates(subset=["PharmGKB Accession Id", "Name"], inplace=True)

diseases2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3587 entries, 0 to 3586
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   PharmGKB Accession Id  3587 non-null   object
 1   Name                   3587 non-null   object
 2   Alternate Names        2472 non-null   object
 3   Cross-references       1 non-null      object
 4   External Vocabulary    3441 non-null   object
dtypes: object(5)
memory usage: 140.2+ KB


In [24]:
diseases2_df.nunique().compare(diseases2_df.count(), keep_equal=True, result_names=("nunique", "count"))

Unnamed: 0,nunique,count
Name,3586,3587
Alternate Names,2471,2472
External Vocabulary,3433,3441


In [25]:
diseases2_df[diseases2_df.Name.duplicated(keep=False)].groupby("Name").apply(lambda x: x)

Unnamed: 0_level_0,Unnamed: 1_level_0,PharmGKB Accession Id,Name,Alternate Names,Cross-references,External Vocabulary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hypersexuality state,1604,PA166232142,hypersexuality state,Compulsive sexual behaviour,,SnoMedCT:73744004(Hypersexuality state)
hypersexuality state,1605,PA166232141,hypersexuality state,Compulsive sexual behaviour,,SnoMedCT:73744004(Hypersexuality state)


In [26]:
diseases2_df_duplicated_Name_first = diseases2_df[diseases2_df.Name.duplicated(keep="last")].copy()

diseases2_df_duplicated_Name_others = diseases2_df[diseases2_df.Name.duplicated(keep=False) & -diseases2_df.Name.duplicated(keep="last")].copy()

diseases2_df_duplicated_Name_others["other"] = diseases2_df_duplicated_Name_others["PharmGKB Accession Id"]
diseases2_df_duplicated_Name_others.drop(columns=["PharmGKB Accession Id"], inplace=True)

diseases2_df_duplicated_Name = pd.concat([diseases2_df_duplicated_Name_first, diseases2_df_duplicated_Name_others]).groupby("Name").agg({"PharmGKB Accession Id": lambda x: list(x)[0], "other": lambda x: list(x)[1:]}).explode("other")
diseases2_df_duplicated_Name

Unnamed: 0_level_0,PharmGKB Accession Id,other
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
hypersexuality state,PA166232142,PA166232141


In [27]:
# REMOVE DUPLICATES
diseases2_df.drop_duplicates(subset=["PharmGKB Accession Id"], inplace=True)
diseases2_df.drop_duplicates(subset=["Name"], inplace=True)

diseases2_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3586 entries, 0 to 3586
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   PharmGKB Accession Id  3586 non-null   object
 1   Name                   3586 non-null   object
 2   Alternate Names        2471 non-null   object
 3   Cross-references       1 non-null      object
 4   External Vocabulary    3440 non-null   object
dtypes: object(5)
memory usage: 168.1+ KB


### Merge Datasets

#### 1<sup>st</sup> Strategy

In [28]:
all_diseases_df = pd.merge(diseases1_df, diseases2_df, on="Name", how="outer", indicator=True, suffixes=["_x", ""])

all_diseases_df["Alternate_Names"] = (
    all_diseases_df["Alternate Names"]
        .apply(format_semicolon_as_separator)
        .fillna(all_diseases_df["Alternate Names_x"])
        .add(' ; ')
        .fillna("")
        .add(all_diseases_df.Name)
        .str.strip()
        .str.replace('"', '')
        .str.lower()
    )
all_diseases_df.drop(columns=all_diseases_df.columns.difference(["Disease Ontology ID", "PharmGKB Accession Id", "Name", "Alternate_Names", "_merge"]), inplace=True)

all_diseases_df._merge.value_counts()

_merge
right_only    3270
left_only      666
both           316
Name: count, dtype: int64

#### 2<sup>nd</sup> Strategy

In [29]:
all_diseases_df_exploded_not_both: dict[str, pd.DataFrame] = {group_name: group_data for group_name, group_data in all_diseases_df.groupby("_merge") if group_name != "both"}

columns = {"left_only": ["Name", "Disease Ontology ID", "Alternate_Names"], "right_only": ["Name", "PharmGKB Accession Id", "Alternate_Names"]}

    
all_diseases_df_exploded_not_both["right_only"].Alternate_Names = all_diseases_df_exploded_not_both["right_only"].Alternate_Names.str.split(" ; ")
all_diseases_df_exploded_not_both["right_only"] = all_diseases_df_exploded_not_both["right_only"].explode("Alternate_Names")

for group_name, group_data in all_diseases_df_exploded_not_both.items():
    group_data.Alternate_Names = group_data.Alternate_Names.str.strip()

    group_data.drop_duplicates(subset=["Alternate_Names"], inplace=True)
    group_data.drop(columns=all_diseases_df.columns.difference(columns[group_name]), inplace=True)
    
    all_diseases_df_exploded_not_both[group_name] = group_data

all_diseases_df_new_both = pd.merge(all_diseases_df_exploded_not_both["left_only"], all_diseases_df_exploded_not_both["right_only"], on="Alternate_Names", how="inner", suffixes=["_x", ""])

all_diseases_df_new_both["_merge"] = "both"

all_diseases_df_new_both = all_diseases_df_new_both.groupby(["Name", "Name_x"]).agg({
    "PharmGKB Accession Id": lambda x: list(x)[0],
    "Disease Ontology ID": lambda x: list(x)[0],
    "Alternate_Names": lambda x: " ; ".join(str(name) for name in set(x)),
    "_merge": lambda x: list(x)[0]
}).reset_index()

all_diseases_df_new_both.drop_duplicates("Name", keep=False, inplace=True)
all_diseases_df_new_both.drop_duplicates("Name_x", keep=False, inplace=True)

all_diseases_df_new_both.nunique()

Name                     241
Name_x                   241
PharmGKB Accession Id    241
Disease Ontology ID      241
Alternate_Names          241
_merge                     1
dtype: int64

In [30]:
all_diseases_df["Name_x"] = all_diseases_df.Name

all_diseases_df = pd.concat([all_diseases_df, all_diseases_df_new_both])

all_diseases_df.sort_values("_merge", inplace=True)
all_diseases_df.drop_duplicates(subset=["Name"], keep="first", inplace=True)
all_diseases_df.drop_duplicates(subset=["Name_x"], keep="first", inplace=True)

all_diseases_df._merge.value_counts()

_merge
right_only    3029
both           557
left_only      425
Name: count, dtype: int64

In [31]:
all_diseases_df.drop(columns=all_diseases_df.columns.difference(["Disease Ontology ID", "PharmGKB Accession Id", "Name", "Alternate_Names"]), inplace=True)
all_diseases_df["Id"] = all_diseases_df["PharmGKB Accession Id"].fillna(all_diseases_df["Disease Ontology ID"])

all_diseases_df.head()

Unnamed: 0,Disease Ontology ID,Name,PharmGKB Accession Id,Alternate_Names,Id
0,DOID:552,pneumonia,PA445355,experimental lung inflammation ; experimental ...,PA445355
883,DOID:865,vasculitis,PA446023,"angiitides ; angiitis ; angiitis, nos ; vascul...",PA446023
888,DOID:9784,trichinosis,PA445920,infection by larvae of trichinella spiralis ; ...,PA445920
890,DOID:3068,glioblastoma,PA444283,"astrocytoma, grade iv ; astrocytomas, grade iv...",PA444283
891,DOID:420,hypertrichosis,PA444560,excessive hair growth ; excessive hairiness ; ...,PA444560


## Variants & Genes Datasets

### Variants Dataset

In [32]:
variants_df = pd.read_csv('PharmGKB/primary_data/variants/variants.tsv', delimiter='\t')
variants_df.head()

Unnamed: 0,Variant ID,Variant Name,Gene IDs,Gene Symbols,Location,Variant Annotation count,Clinical Annotation count,Level 1/2 Clinical Annotation count,Guideline Annotation count,Label Annotation count,Synonyms
0,PA166156302,rs1000002,PA395,ABCC5,NC_000003.12:183917980,1,0,0,0,0,"rs17623022, NG_047115.1:g.105031=, NC_000003.1..."
1,PA166156746,rs1000113,PA142671652,IRGM,NC_000005.10:150860514,1,0,0,0,0,"1000113, NC_000005.9:g.150240076=, NC_000005.9..."
2,PA166195421,rs10006452,PA361,UGT2B7,NC_000004.12:69112090,1,0,0,0,0,"10006452, NC_000004.12:g.69112090T>A, 58882597..."
3,PA166177121,rs10007051,,,NC_000004.12:129244309,1,1,0,0,0,"NC_000004.11:g.130165464=, NC_000004.12:g.1292..."
4,PA166156636,rs10008257,,,NC_000004.12:94435177,2,0,0,0,0,"10008257, NC_000004.12:g.94435177=, rs10008257..."


In [33]:
variants_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6797 entries, 0 to 6796
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Variant ID                           6797 non-null   object
 1   Variant Name                         6797 non-null   object
 2   Gene IDs                             6151 non-null   object
 3   Gene Symbols                         6151 non-null   object
 4   Location                             6788 non-null   object
 5   Variant Annotation count             6797 non-null   int64 
 6   Clinical Annotation count            6797 non-null   int64 
 7   Level 1/2 Clinical Annotation count  6797 non-null   int64 
 8   Guideline Annotation count           6797 non-null   int64 
 9   Label Annotation count               6797 non-null   int64 
 10  Synonyms                             6797 non-null   object
dtypes: int64(5), object(6)
memory usage: 584.2+

In [34]:
variants_df.nunique().compare(variants_df.count(), keep_equal=True, result_names=("nunique", "count"))

Unnamed: 0,nunique,count
Gene IDs,1963,6151
Gene Symbols,1963,6151
Location,6787,6788
Variant Annotation count,109,6797
Clinical Annotation count,47,6797
Level 1/2 Clinical Annotation count,32,6797
Guideline Annotation count,6,6797
Label Annotation count,10,6797


In [35]:
all_variants_df = pd.DataFrame({
    'Id': variants_df['Variant ID'],
    'Name': variants_df['Variant Name'],
    'Alternate_Names': variants_df['Synonyms']
})
all_variants_df.drop_duplicates(keep="first", inplace=True)
all_variants_df.head()

Unnamed: 0,Id,Name,Alternate_Names
0,PA166156302,rs1000002,"rs17623022, NG_047115.1:g.105031=, NC_000003.1..."
1,PA166156746,rs1000113,"1000113, NC_000005.9:g.150240076=, NC_000005.9..."
2,PA166195421,rs10006452,"10006452, NC_000004.12:g.69112090T>A, 58882597..."
3,PA166177121,rs10007051,"NC_000004.11:g.130165464=, NC_000004.12:g.1292..."
4,PA166156636,rs10008257,"10008257, NC_000004.12:g.94435177=, rs10008257..."


In [36]:
all_variants_df.count()

Id                 6797
Name               6797
Alternate_Names    6797
dtype: int64

### Genes Dataset

In [37]:
variants_df_exploded = variants_df.copy()
variants_df_exploded.loc[:, "Gene IDs"] = variants_df_exploded["Gene IDs"].str.split(",")
variants_df_exploded.loc[:, "Gene Symbols"] = variants_df_exploded["Gene Symbols"].str.split(",")
variants_df_exploded = variants_df_exploded.explode(["Gene IDs", "Gene Symbols"])

all_genes_df = pd.DataFrame({
    'Id': variants_df_exploded['Gene IDs'],
    'Name': variants_df_exploded['Gene Symbols'],
    'Alternate_Names': variants_df_exploded['Gene Symbols']
})

all_genes_df.drop_duplicates(keep="first", inplace=True)
all_genes_df.dropna(inplace=True)

all_genes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2046 entries, 0 to 6759
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               2046 non-null   object
 1   Name             2046 non-null   object
 2   Alternate_Names  2046 non-null   object
dtypes: object(3)
memory usage: 63.9+ KB


In [38]:
all_genes_df.nunique().compare(all_genes_df.count(), keep_equal=True, result_names=("nunique", "count"))

Unnamed: 0,nunique,count


## Nodes & Edges

### Nodes

In [39]:
all_diseases_df["Category"] = "Disease"
all_drugs_df["Category"] = "Chemical"
all_variants_df["Category"] = "Variant"
all_genes_df["Category"] = "Gene"

nodes = pd.concat([all_drugs_df, all_diseases_df, all_variants_df, all_genes_df])

nodes = nodes[["Id", "Name", "Alternate_Names", "Category"]]

nodes.rename(columns={"Name": "Label"}, inplace=True)

nodes.head()

Unnamed: 0,Id,Label,Alternate_Names,Category
0,PA450195,lepirudin,hirudin variant-1 ; lepirudin,Chemical
1031,PA164760854,bromodiphenhydramine,ambodryl hydrochloride ; amodryl ; bromanautin...,Chemical
1030,PA451684,ticarcillin,ticarcilina [inn-spanish] ; ticarcillin supple...,Chemical
1029,PA165958380,levonordefrin,(-)-cobefrin ; alpha-methylnoradrenaline ; alp...,Chemical
1027,PA450351,megestrol,mga ; megestrol acetate ; megestrolo [dcit] ; ...,Chemical


In [40]:
nodes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16704 entries, 0 to 6759
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               16704 non-null  object
 1   Label            16704 non-null  object
 2   Alternate_Names  16704 non-null  object
 3   Category         16704 non-null  object
dtypes: object(4)
memory usage: 652.5+ KB


In [41]:
nodes.Category.value_counts()

Category
Variant     6797
Disease     4011
Chemical    3850
Gene        2046
Name: count, dtype: int64

### Edges

#### Wikidata

In [42]:
relationships = pd.read_csv("Wikidata/relationships.tsv", delimiter="\t").dropna(subset=["Entity1_id", "Entity2_id"])
relationships.drop_duplicates(keep="first", inplace=True)
relationships.dropna(subset=["Entity1_id"], inplace=True)
relationships.dropna(subset=["Entity2_id"], inplace=True)
relationships.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6294 entries, 0 to 7625
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Entity1_id    6294 non-null   object
 1   Entity1_name  6294 non-null   object
 2   Entity1_type  6294 non-null   object
 3   Entity2_id    6294 non-null   object
 4   Entity2_name  6294 non-null   object
 5   Entity2_type  6294 non-null   object
 6   Category      6294 non-null   object
dtypes: object(7)
memory usage: 393.4+ KB


In [43]:
dbid_to_id = all_drugs_df[["DrugBank ID", "Id"]].set_index("DrugBank ID").Id.to_dict()
doid_to_id = (
    all_diseases_df[["Disease Ontology ID", "Id"]]
    .set_index("Disease Ontology ID")
    .Id.to_dict()
)

diseases1_df_duplicated_Name["Id"] = diseases1_df_duplicated_Name["Disease Ontology ID"].map(doid_to_id)
diseases1_df_duplicated_Name["Disease Ontology ID"] = diseases1_df_duplicated_Name.other

doid_to_id.update(diseases1_df_duplicated_Name[["Disease Ontology ID", "Id"]].set_index("Disease Ontology ID").Id.to_dict())

relationships["Source"] = relationships.Entity1_id.map(dbid_to_id)
relationships["Target"] = pd.concat(
    [
        relationships[relationships.Entity2_type == "Chemical"].Entity2_id.map(
            dbid_to_id
        ),
        relationships[relationships.Entity2_type == "Disease"].Entity2_id.map(
            doid_to_id
        ),
    ]
)


relationships["Type"] = "Dirigé"
relationships = relationships[["Source", "Target", "Type", "Category"]]

relationships.drop_duplicates(keep="first", inplace=True)

relationships.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6200 entries, 0 to 7625
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    6200 non-null   object
 1   Target    6200 non-null   object
 2   Type      6200 non-null   object
 3   Category  6200 non-null   object
dtypes: object(4)
memory usage: 242.2+ KB


In [44]:
relationships.Category.value_counts()

Category
treats            4804
interacts with    1396
Name: count, dtype: int64

#### PharmGKB

In [45]:
variant_gene_edges =  pd.DataFrame(
        {
            "Source": variants_df_exploded["Variant ID"],
            "Target": variants_df_exploded["Gene IDs"],
            "Type": "Dirigé",
            "Category": "Association",
        }
    )
variant_gene_edges.drop_duplicates(keep="first", inplace=True)
variant_gene_edges.dropna(subset=["Source"], inplace=True)
variant_gene_edges.dropna(subset=["Target"], inplace=True)
variant_gene_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7107 entries, 0 to 6796
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    7107 non-null   object
 1   Target    7107 non-null   object
 2   Type      7107 non-null   object
 3   Category  7107 non-null   object
dtypes: object(4)
memory usage: 277.6+ KB


In [46]:
clinical_annotations_variants = pd.read_csv('PharmGKB/annotations/clinicalVariants/clinicalVariants.tsv', delimiter="\t")
clinical_annotations_variants.drop_duplicates(keep="first", inplace=True)

clinical_annotations_variants = clinical_annotations_variants[clinical_annotations_variants["level of evidence"].isin({"1A", "1B", "2A"})]
clinical_annotations_variants.drop(columns=["level of evidence"], inplace=True)

clinical_annotations_variants.info()

<class 'pandas.core.frame.DataFrame'>
Index: 347 entries, 0 to 348
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   variant     347 non-null    object
 1   gene        346 non-null    object
 2   type        347 non-null    object
 3   chemicals   347 non-null    object
 4   phenotypes  270 non-null    object
dtypes: object(5)
memory usage: 16.3+ KB


In [47]:
clinical_annotations_variants_exploded = clinical_annotations_variants.copy()

clinical_annotations_variants_exploded.chemicals = clinical_annotations_variants_exploded.chemicals.str.lower().str.split(',')
clinical_annotations_variants_exploded.phenotypes = clinical_annotations_variants_exploded.phenotypes.str.lower().str.split(',')

clinical_annotations_variants_exploded = clinical_annotations_variants_exploded.explode(column="chemicals").explode(column="phenotypes")
clinical_annotations_variants_exploded.rename(columns={"chemicals" : "chemical", "phenotypes" : "phenotype"}, inplace=True)

clinical_annotations_variants_exploded.drop_duplicates(keep="first", inplace=True)

clinical_annotations_variants_exploded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 776 entries, 0 to 348
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   variant    776 non-null    object
 1   gene       775 non-null    object
 2   type       776 non-null    object
 3   chemical   776 non-null    object
 4   phenotype  695 non-null    object
dtypes: object(5)
memory usage: 36.4+ KB


In [48]:
variant_label_to_id = variants_df_exploded[["Variant ID", "Variant Name"]].set_index("Variant Name")["Variant ID"].to_dict()

gene_label_to_id = variants_df_exploded[["Gene IDs", "Gene Symbols"]].dropna()

gene_label_to_id = gene_label_to_id[["Gene IDs", "Gene Symbols"]].set_index("Gene Symbols")["Gene IDs"].to_dict()

drug_label_to_id = drugs2_df[["PharmGKB Accession Id", "Name"]].set_index("Name")["PharmGKB Accession Id"].to_dict()

phenotype_label_to_id = diseases2_df[["PharmGKB Accession Id", "Name"]].set_index("Name")["PharmGKB Accession Id"].to_dict()

clinical_annotations_variants_exploded["variant Id"] = clinical_annotations_variants_exploded.variant.map(variant_label_to_id).fillna(clinical_annotations_variants_exploded.gene.map(gene_label_to_id))

clinical_annotations_variants_exploded["chemical Id"] = clinical_annotations_variants_exploded.chemical.map(drug_label_to_id)

clinical_annotations_variants_exploded["phenotype Id"] = clinical_annotations_variants_exploded.phenotype.map(phenotype_label_to_id)

clinical_annotations_variants_exploded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 776 entries, 0 to 348
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   variant       776 non-null    object
 1   gene          775 non-null    object
 2   type          776 non-null    object
 3   chemical      776 non-null    object
 4   phenotype     695 non-null    object
 5   variant Id    776 non-null    object
 6   chemical Id   771 non-null    object
 7   phenotype Id  633 non-null    object
dtypes: object(8)
memory usage: 54.6+ KB


In [49]:
clinical_annotations_variants_with_phenotype = clinical_annotations_variants_exploded[clinical_annotations_variants_exploded.phenotype.notna()].copy()
clinical_annotations_variants_with_phenotype["variant+chemical"] = clinical_annotations_variants_with_phenotype.variant.astype(str).add("+").add(clinical_annotations_variants_with_phenotype.chemical)
clinical_annotations_variants_with_phenotype["variant+chemical Id"] = clinical_annotations_variants_with_phenotype["variant Id"].astype(str).add("+").add(clinical_annotations_variants_with_phenotype["chemical Id"])
clinical_annotations_variants_with_phenotype.info()

<class 'pandas.core.frame.DataFrame'>
Index: 695 entries, 3 to 348
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   variant              695 non-null    object
 1   gene                 695 non-null    object
 2   type                 695 non-null    object
 3   chemical             695 non-null    object
 4   phenotype            695 non-null    object
 5   variant Id           695 non-null    object
 6   chemical Id          695 non-null    object
 7   phenotype Id         633 non-null    object
 8   variant+chemical     695 non-null    object
 9   variant+chemical Id  695 non-null    object
dtypes: object(10)
memory usage: 59.7+ KB


In [50]:
intermediate_nodes = pd.DataFrame({
    "Id": clinical_annotations_variants_with_phenotype["variant+chemical Id"],
    "Label": clinical_annotations_variants_with_phenotype["variant+chemical"],
    "Alternate_Names": clinical_annotations_variants_with_phenotype["variant+chemical"], 
    "Category": "Variant+Chemical",})

intermediate_nodes.drop_duplicates(subset=["Id"], inplace=True)
intermediate_nodes.drop_duplicates(subset=["Label"], inplace=True)
intermediate_nodes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 3 to 348
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               494 non-null    object
 1   Label            494 non-null    object
 2   Alternate_Names  494 non-null    object
 3   Category         494 non-null    object
dtypes: object(4)
memory usage: 19.3+ KB


In [51]:
nodes = pd.concat([nodes, intermediate_nodes], ignore_index=True)
nodes.Category.value_counts()

Category
Variant             6797
Disease             4011
Chemical            3850
Gene                2046
Variant+Chemical     494
Name: count, dtype: int64

In [52]:
variant_to_intermediate_edges = pd.DataFrame({
    "Source": clinical_annotations_variants_with_phenotype["variant Id"],
    "Target": clinical_annotations_variants_with_phenotype["variant+chemical Id"], 
    "Type": "Dirigé", 
    "Category": "+"
})
variant_to_intermediate_edges.drop_duplicates(subset=["Target"], keep="first", inplace=True)
variant_to_intermediate_edges.dropna(subset=["Source"], inplace=True)
variant_to_intermediate_edges.dropna(subset=["Target"], inplace=True)
variant_to_intermediate_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 3 to 348
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    494 non-null    object
 1   Target    494 non-null    object
 2   Type      494 non-null    object
 3   Category  494 non-null    object
dtypes: object(4)
memory usage: 19.3+ KB


In [53]:
chemical_to_intermediate_edges = pd.DataFrame({
    "Source": clinical_annotations_variants_with_phenotype["chemical Id"],
    "Target": clinical_annotations_variants_with_phenotype["variant+chemical Id"], 
    "Type": "Dirigé", 
    "Category": "+"
})
chemical_to_intermediate_edges.drop_duplicates(subset=["Target"], keep="first", inplace=True)
chemical_to_intermediate_edges.dropna(subset=["Source"], inplace=True)
chemical_to_intermediate_edges.dropna(subset=["Target"], inplace=True)
chemical_to_intermediate_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 494 entries, 3 to 348
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    494 non-null    object
 1   Target    494 non-null    object
 2   Type      494 non-null    object
 3   Category  494 non-null    object
dtypes: object(4)
memory usage: 19.3+ KB


In [54]:
intermediate_to_phenotype_edges = pd.DataFrame({
    "Source": clinical_annotations_variants_with_phenotype["variant+chemical Id"],
    "Target": clinical_annotations_variants_with_phenotype["phenotype Id"], 
    "Type": "Dirigé", 
    "Category": clinical_annotations_variants_with_phenotype["type"]
})
intermediate_to_phenotype_edges.drop_duplicates(keep="first", inplace=True)
intermediate_to_phenotype_edges.dropna(subset=["Source"], inplace=True)
intermediate_to_phenotype_edges.dropna(subset=["Target"], inplace=True)
intermediate_to_phenotype_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 627 entries, 3 to 348
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    627 non-null    object
 1   Target    627 non-null    object
 2   Type      627 non-null    object
 3   Category  627 non-null    object
dtypes: object(4)
memory usage: 24.5+ KB


In [55]:
clinical_annotations_variants = clinical_annotations_variants_exploded[clinical_annotations_variants_exploded.phenotype.isna()].copy()
clinical_annotations_variants.drop(columns="phenotype", inplace=True)
clinical_annotations_variants.count()

variant         81
gene            80
type            81
chemical        81
variant Id      81
chemical Id     76
phenotype Id     0
dtype: int64

In [56]:
variant_to_chemical_edges = pd.DataFrame({
    "Source": clinical_annotations_variants["variant Id"],
    "Target": clinical_annotations_variants["chemical Id"], 
    "Type": "Dirigé", 
    "Category": clinical_annotations_variants["type"]
})
variant_to_chemical_edges.drop_duplicates(keep="first", inplace=True)
variant_to_chemical_edges.dropna(subset=["Source"], inplace=True)
variant_to_chemical_edges.dropna(subset=["Target"], inplace=True)
variant_to_chemical_edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76 entries, 0 to 338
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    76 non-null     object
 1   Target    76 non-null     object
 2   Type      76 non-null     object
 3   Category  76 non-null     object
dtypes: object(4)
memory usage: 3.0+ KB


In [57]:
edges = pd.concat([relationships, variant_gene_edges, variant_to_chemical_edges, variant_to_intermediate_edges, chemical_to_intermediate_edges, intermediate_to_phenotype_edges])
edges.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14998 entries, 0 to 348
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Source    14998 non-null  object
 1   Target    14998 non-null  object
 2   Type      14998 non-null  object
 3   Category  14998 non-null  object
dtypes: object(4)
memory usage: 585.9+ KB


In [58]:
edges.Category.value_counts()

Category
Association          7107
treats               4804
interacts with       1396
+                     988
Toxicity              473
Metabolism/PK          99
Efficacy               80
Dosage                 33
Other                  15
Efficacy,Toxicity       3
Name: count, dtype: int64

In [59]:
# Save nodes to CSV
nodes.to_csv('nodes.csv', index=False)

# Save edges to CSV
edges.to_csv('edges.csv', index=False)
