In [1]:
#!pip install pubchempy
import pandas as pd
import pubchempy as pcp

# pandas handles the tabular data (reading, filtering, and writing the CSV).
# PubChemPy is used to retrieve chemical data from the PubChem database.

# Load the drug list into a DataFrame. The 'PubCHEM' column holds the PubChem
# ID(s) used for the compound lookups further down.
pub_df = pd.read_csv('Drug_listTue_Aug10_2021.csv')

# Keep only rows with a usable PubChem ID: drop missing values and the
# placeholder entries 'none' and 'several', which are not compound IDs and
# must not be passed on to the PubChem lookup.
pub_df = pub_df.dropna(subset=['PubCHEM'])
pub_df = pub_df[~pub_df['PubCHEM'].isin(['none', 'several'])]

# Collect one SMILES string and one InChI string per row, in row order, so the
# lists line up with pub_df when they are attached as columns afterwards.
smile_list = []
inchi_list = []

# A 'PubCHEM' cell may list several comma-separated IDs; only the first one is
# looked up.
for cid_field in pub_df['PubCHEM']:
    first_cid = cid_field.partition(',')[0]

    # Fetch the compound from PubChem through PubChemPy.
    record = pcp.Compound.from_cid(first_cid)

    # Store both structure representations for this row.
    smile_list.append(record.isomeric_smiles)
    inchi_list.append(record.inchi)

# Attach the collected structure strings as two new columns
# ('smiles' first, then 'inchi').
pub_df = pub_df.assign(smiles=smile_list, inchi=inchi_list)

# Write the enriched table to a new CSV, leaving out the index column.
output_csv_file = 'SMILEinchi.csv'
pub_df.to_csv(output_csv_file, index=False)

print(f"Data processing completed. Results saved to '{output_csv_file}'.")

Data processing completed. Results saved to 'SMILEinchi.csv'.
