In [28]:
import os
import pandas as pd

def _read_assay_table(path, assay_id):
    """Load one PubChem assay data table as a CID-indexed frame.

    Only the columns at positions 2, 3 and 4 are parsed; they are renamed to
    ``CID``, ``<assay_id>_OUTCOME`` and ``<assay_id>_SCORE``. Lines 1-5 of the
    file (the rows directly below the header, holding column descriptions)
    are skipped.

    Rows sharing a CID are collapsed to the first non-null value per column,
    and rows without a CID are dropped (``groupby`` ignores NaN keys).
    """
    table = pd.read_csv(path, skiprows=list(range(1, 6)), usecols=[2, 3, 4])
    table.columns = ['CID', assay_id + '_OUTCOME', assay_id + '_SCORE']
    return table.groupby('CID', sort=False).first()


def build_assay_table(directory=None):
    """Merge every ``AID*`` file in ``directory`` into one wide table.

    Parameters
    ----------
    directory : str, optional
        Folder to scan; defaults to the current working directory.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The merged table (one row per CID, an OUTCOME/SCORE column pair per
        assay, missing values replaced by the string ``'NA'``) and the assay
        id taken from each file that was read, in processing order.

    Notes
    -----
    * Files are processed in sorted name order so the row and column order is
      reproducible (``os.listdir`` returns entries in arbitrary order).
    * Each file is reduced on its own and all tables are aligned once at the
      end, instead of re-grouping the whole accumulated frame per file.
    * ``'NA'`` is filled in only once, after merging, so the placeholder can
      never be mistaken for a real value while tables are being combined.
    """
    if directory is None:
        directory = os.getcwd()

    per_assay = {}  # assay id -> CID-indexed table, in first-seen order
    assay_ids = []
    for filename in sorted(os.listdir(directory)):
        if not filename.startswith('AID'):
            continue
        # The assay id is the second '_'-separated token; the extension is
        # stripped first so that 'AID_123.csv' yields '123'.
        assay_id = os.path.splitext(filename)[0].split('_')[1]
        table = _read_assay_table(os.path.join(directory, filename), assay_id)
        if assay_id in per_assay:
            # Several files for one assay: keep the first non-null value per CID.
            table = (pd.concat([per_assay[assay_id], table])
                     .groupby(level=0, sort=False).first())
        per_assay[assay_id] = table
        assay_ids.append(assay_id)

    if not per_assay:
        return pd.DataFrame(), assay_ids

    tables = list(per_assay.values())
    # Row order: CIDs in order of first appearance across the tables.
    cid_order = tables[0].index
    for table in tables[1:]:
        cid_order = cid_order.append(table.index)
    cid_order = cid_order.unique()

    merged = pd.concat([table.reindex(cid_order) for table in tables], axis=1)
    merged.index.name = 'CID'
    return merged.reset_index().fillna('NA'), assay_ids


PubChemDF, PCBA_ID = build_assay_table()

  interactivity=interactivity, compiler=compiler, result=result)


In [29]:
PubChemDF.head()

Unnamed: 0,CID,1159524_OUTCOME,1159524_SCORE,588856_OUTCOME,588856_SCORE,602332_OUTCOME,602332_SCORE,624170_OUTCOME,624170_SCORE,624173_OUTCOME,...,624263_OUTCOME,624263_SCORE,652048_OUTCOME,652048_SCORE,686978_OUTCOME,686978_SCORE,686979_OUTCOME,686979_SCORE,743266_OUTCOME,743266_SCORE
0,6603008.0,Inactive,0.0,Inactive,0,Inactive,0,Inactive,0,Inactive,...,Inactive,0.0,,,Inactive,0,Inactive,0,Inactive,0
1,6602571.0,,,Inactive,0,Inactive,0,Inactive,0,Inactive,...,Inactive,0.0,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0
2,6602616.0,,,Inactive,0,Inactive,0,Inactive,0,Inactive,...,,,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0
3,644371.0,,,Inactive,0,Inactive,0,Inactive,0,Inactive,...,Inactive,0.0,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0
4,6603132.0,Inactive,0.0,Inactive,0,Inactive,0,Inactive,0,Inactive,...,Inactive,0.0,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0


In [30]:
PubChemDF.shape

(424059, 21)

In [31]:
import pubchempy as pcp
from rdkit import Chem
import molvs as mv

def get_parent_smile(cid):
    """Return the SMILES of the charge parent for a PubChem compound id.

    The compound is fetched with PubChemPy, parsed with RDKit, screened
    against an element filter and reduced to its MolVS charge parent.

    Parameters
    ----------
    cid : any value accepted by ``int()``
        PubChem compound identifier.

    Returns
    -------
    str
        SMILES of the charge parent, or the sentinel string ``'NaN'`` when
        the lookup or parsing fails, or when the molecule is rejected by the
        element filter. ``clean_data`` drops rows carrying this sentinel.
    """
    try:
        cpd = pcp.Compound.from_cid(int(cid))
        mol = Chem.MolFromSmiles(cpd.isomeric_smiles)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return 'NaN'
        # Matches any bonded atom that is not H, B, C, N, O, F, Si, P, S, Cl,
        # Se, Br or I (atomic numbers 1, 5-9, 14-17, 34, 35, 53).
        smts = Chem.MolFromSmarts("[!#1&!#5&!#6&!#7&!#8&!#9&!#14&!#15&!#16&!#17&!#34&!#35&!#53]~*")
        if mol.HasSubstructMatch(smts):
            # Return the sentinel explicitly; previously this path fell
            # through and returned None.
            return 'NaN'
        parent = mv.Standardizer().charge_parent(mol)
        return Chem.MolToSmiles(parent)
    except Exception:
        # Best effort per compound. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt stop a long batch of network lookups.
        return 'NaN'

#Generate InchiKey
def inchikey_gen(smile):
    """Return the InChIKey for a SMILES string.

    Parameters
    ----------
    smile : str
        SMILES string to convert.

    Returns
    -------
    str
        The InChIKey, or the sentinel string ``'NaN'`` when the SMILES cannot
        be parsed, no InChI/InChIKey is produced, or RDKit raises.
    """
    try:
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # RDKit signals an unparsable SMILES by returning None.
            return 'NaN'
        inchi = Chem.MolToInchi(mol)
        # Guard against an empty InChI / key so callers only ever see a real
        # key or the sentinel (never '' or None).
        key = Chem.InchiToInchiKey(inchi) if inchi else None
        return key if key else 'NaN'
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return 'NaN'
    
#Clean and standardize the data
def clean_data(data):
    """Standardize structures and drop compounds that cannot be processed.

    Adds a ``smiles_parent`` column (``get_parent_smile`` applied to ``CID``)
    and an ``inchi`` column (``inchikey_gen`` applied to ``smiles_parent``),
    removes every row where either value is missing or equals the ``'NaN'``
    sentinel, and finally drops the ``CID`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``CID`` column. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``CID`` and with ``smiles_parent`` and ``inchi``
        appended; the original row index labels are kept.
    """
    # Work on a copy so the caller's frame is not mutated.
    cleaned = data.copy()

    # Standardize and get parent with molvs
    cleaned["smiles_parent"] = cleaned['CID'].apply(get_parent_smile)
    has_parent = cleaned['smiles_parent'].notnull() & (cleaned['smiles_parent'] != "NaN")
    # Explicit copy: assigning a column to a filtered slice is what triggers
    # pandas' SettingWithCopyWarning.
    cleaned = cleaned[has_parent].copy()

    cleaned["inchi"] = cleaned['smiles_parent'].apply(inchikey_gen)
    has_key = cleaned['inchi'].notnull() & (cleaned['inchi'] != "NaN")
    return cleaned[has_key].drop('CID', axis=1)

In [32]:
# Replace the merged assay table with its cleaned version: clean_data adds the
# 'smiles_parent' and 'inchi' columns, drops rows flagged with the 'NaN'
# sentinel and removes the 'CID' column.
# NOTE(review): not idempotent -- running this cell a second time fails because
# clean_data reads the 'CID' column that the first run already dropped.
PubChemDF = clean_data(PubChemDF)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
PubChemDF.head()

In [None]:
#PubChemDF.to_csv('/ihome/gidakwo/ml_files/SMILES/PubChem/PubChemDF.csv', index=False)

In [33]:
PubChemDF.head()

Unnamed: 0,1159524_OUTCOME,1159524_SCORE,588856_OUTCOME,588856_SCORE,602332_OUTCOME,602332_SCORE,624170_OUTCOME,624170_SCORE,624173_OUTCOME,624173_SCORE,...,652048_OUTCOME,652048_SCORE,686978_OUTCOME,686978_SCORE,686979_OUTCOME,686979_SCORE,743266_OUTCOME,743266_SCORE,smiles_parent,inchi
0,Inactive,0.0,Inactive,0,Inactive,0,Inactive,0,Inactive,0,...,,,Inactive,0,Inactive,0,Inactive,0,CCOCCCNCC(=O)Nc1ccc(OC(F)(F)F)cc1,JDJXVIDQGFMBLJ-UHFFFAOYSA-N
1,,,Inactive,0,Inactive,0,Inactive,0,Inactive,0,...,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0,COCCn1nnnc1CN1CCC(Cc2ccccc2)CC1,YUFJVPXLDSAWIC-UHFFFAOYSA-N
2,,,Inactive,0,Inactive,0,Inactive,0,Inactive,0,...,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0,COCCn1nnnc1CN1CCC(O)(c2cccc(C(F)(F)F)c2)CC1,OOGABMQFAKYYBW-UHFFFAOYSA-N
3,,,Inactive,0,Inactive,0,Inactive,0,Inactive,0,...,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0,O=C(CN1CCCCCC1)NCCc1ccc(F)cc1,JHEKGAIZRVGERM-UHFFFAOYSA-N
4,Inactive,0.0,Inactive,0,Inactive,0,Inactive,0,Inactive,0,...,Inactive,10.0,Inactive,0,Inactive,0,Inactive,0,COc1ccc(C(=O)C(c2ccccc2)N2CCOCC2)cc1,QMFVJCSHYOSQRY-UHFFFAOYSA-N


In [34]:
PubChemDF.shape

(420901, 22)

In [35]:
# Write the cleaned table to 'PubChemDF.csv' (a relative path).
# NOTE(review): index=False is not passed here, unlike the commented-out export
# earlier in the notebook, so the row index is written as an extra leading
# column -- confirm that downstream readers expect it.
PubChemDF.to_csv('PubChemDF.csv')