# Map drug names from the original DrugBank dataset onto our drugs, using the DrugBank ID as the join key

In [None]:
import pandas as pd

# Mount Google Drive at /content/drive; the cells below read their input CSVs
# from, and write their result CSVs to, paths under /content/drive/MyDrive.
# NOTE(review): `google.colab` is presumably only available inside a Colab
# runtime — this cell will need replacing to run the notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build the "Drug ID | Name | SMILES" table by looking up each of our drugs
# (DrugBank_ID + SMILES) in the DrugBank vocabulary (DrugBank ID + Common name).
drugbank_vocab = pd.read_csv('/content/drive/MyDrive/MLHygnn/DB/drugbank_vocabulary.csv')
unique_drugs_smiles = pd.read_csv('/content/drive/MyDrive/MLHygnn/DB/Drugs_With_SMILES.csv')

# Display basic info about the datasets
print("DrugBank Vocabulary shape:", drugbank_vocab.shape)
print("Unique drugs with SMILES shape:", unique_drugs_smiles.shape)

# Display column names to verify (shown as read, i.e. before whitespace stripping)
print("\nDrugBank Vocabulary columns:", drugbank_vocab.columns.tolist())
print("Unique drugs SMILES columns:", unique_drugs_smiles.columns.tolist())

# Clean column names (remove any extra spaces) so the merge keys below match
drugbank_vocab.columns = drugbank_vocab.columns.str.strip()
unique_drugs_smiles.columns = unique_drugs_smiles.columns.str.strip()

# Left-merge on the DrugBank ID: every drug in our list is kept, and drugs
# without a vocabulary entry end up with a missing (NaN) name.
final_table = pd.merge(
    unique_drugs_smiles,
    drugbank_vocab,
    left_on='DrugBank_ID',
    right_on='DrugBank ID',
    how='left'
)

# A left merge emits one output row per matching vocabulary row, so a repeated
# ID in the vocabulary would silently duplicate drugs. Fail loudly instead of
# saving a table with more rows than the input drug list.
if len(final_table) != len(unique_drugs_smiles):
    raise ValueError(
        f"Merge changed the row count from {len(unique_drugs_smiles)} to "
        f"{len(final_table)}; check drugbank_vocab for duplicate 'DrugBank ID' values."
    )

# Select and rename columns to get: Drug ID | Name | SMILES.
# An explicit mapping is used so the rename does not depend on column order.
final_table = final_table[['DrugBank_ID', 'Common name', 'SMILES']].rename(
    columns={'DrugBank_ID': 'Drug ID', 'Common name': 'Name'}
)

# Display the result
print(f"\nFinal table shape: {final_table.shape}")
print("\nFirst few rows:")
print(final_table.head(10))

# Check for any missing names (drugs with no match in the vocabulary)
missing_names = final_table['Name'].isna().sum()
print(f"\nNumber of drugs with missing names: {missing_names}")

# Save the final table
final_table.to_csv('/content/drive/MyDrive/final_drug_table.csv', index=False)
print("\nFinal table saved as 'final_drug_table.csv' in your Google Drive!")

# Optional: Display some statistics
print(f"\nTotal drugs in final table: {len(final_table)}")
print(f"Drugs with complete data: {final_table.dropna().shape[0]}")

DrugBank Vocabulary shape: (16575, 2)
Unique drugs with SMILES shape: (1709, 2)

DrugBank Vocabulary columns: ['DrugBank ID', 'Common name']
Unique drugs SMILES columns: ['DrugBank_ID', 'SMILES']

Final table shape: (1709, 3)

First few rows:
   Drug ID            Name                                             SMILES
0  DB00006     Bivalirudin  CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...
1  DB00014       Goserelin  CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
2  DB00027    Gramicidin D  CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
3  DB00035    Desmopressin  NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
4  DB00080      Daptomycin  CCCCCCCCCC(=O)N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)...
5  DB00091    Cyclosporine  CC[C@@H]1NC(=O)[C@H]([C@H](O)[C@H](C)C\C=C\C)N...
6  DB00104      Octreotide  [H][C@]1(NC(=O)[C@H](CCCCN)NC(=O)[C@@H](CC2=CN...
7  DB00115  Cyanocobalamin  C[C@H](CNC(=O)CC[C@]1(C)[C@@H](CC(N)=O)[C@H]2N...
8  DB00120   Phenylalanine                        N[C@@

In [None]:
# Collect the rows whose 'Name' is null and print them under a heading.
missing_drugs = final_table.loc[final_table['Name'].isnull()]
print("Drugs with missing names:", missing_drugs, sep="\n")

Drugs with missing names:
Empty DataFrame
Columns: [Drug ID, Name, SMILES]
Index: []


In [None]:
# Manual name overrides, keyed by DrugBank ID, for drugs whose name was not
# supplied by the vocabulary merge.
# NOTE(review): the recorded output of the preceding cell shows no missing
# names, so these overrides may now replace names that are already present —
# confirm they are still wanted.
manual_drug_names = {
    'DB09162': 'Ferric Citrate',
    'DB09323': 'Benzylpenicillin benzathine hydrate',
    'DB09396': 'Dextropropoxyphene napsylate',
    'DB11106': 'Pamabrom',
    'DB13450': 'Cisatracurium besylate'
}

# An override whose ID is not in the table would otherwise be ignored
# silently (e.g. a typo in the ID), so report any that did not match.
known_ids = set(final_table['Drug ID'])
unmatched_ids = [drug_id for drug_id in manual_drug_names if drug_id not in known_ids]
if unmatched_ids:
    print(f"\nWarning: manual names for IDs not present in the table: {unmatched_ids}")

# Apply all overrides in one vectorised step: rows whose ID has a manual name
# take it, every other row keeps its current name. `assign` returns a new
# frame rather than writing into the existing one.
override_names = final_table['Drug ID'].map(manual_drug_names)
final_table = final_table.assign(Name=override_names.fillna(final_table['Name']))

# Verify the updates
print("\nUpdated drugs:")
updated_drugs = final_table[final_table['Drug ID'].isin(list(manual_drug_names))]
print(updated_drugs[['Drug ID', 'Name']])

# Check for any remaining missing names
missing_count = final_table['Name'].isna().sum()
print(f"\n Drugs with missing names: {missing_count}")

# Save the fully updated table
final_table.to_csv('/content/drive/MyDrive/final_drug_table_complete.csv', index=False)
# Only claim completeness when every drug actually has a name.
if missing_count == 0:
    print(f"\n Successfully saved complete table with all {len(final_table)} drug names!")
else:
    print(f"\n Saved table with {len(final_table)} drugs, but {missing_count} still have no name.")