In [16]:
from rdkit import Chem
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch_geometric.data import Data
import pickle

# Run on CUDA

In [2]:
def format_pytorch_version(version):
    """Return the part of a PyTorch version string before the first '+'.

    A string without a '+' is returned unchanged.
    """
    release, _sep, _build = version.partition('+')
    return release

# Installed PyTorch version string, and the same string with any '+...' suffix removed.
TORCH_version = torch.__version__
TORCH = format_pytorch_version(TORCH_version)

def format_cuda_version(version):
    """Turn a CUDA version string such as '11.3' into a tag such as 'cu113'.

    Parameters
    ----------
    version : str or None
        CUDA version string, e.g. the value of ``torch.version.cuda``.
        ``None`` is accepted because CPU-only PyTorch builds report no
        CUDA version.

    Returns
    -------
    str
        'cu' followed by ``version`` with the dots removed, or 'cpu' when
        ``version`` is ``None``.
    """
    # Without this guard, None.replace(...) raises AttributeError and the
    # whole setup cell fails on machines with a CPU-only PyTorch install.
    if version is None:
        return 'cpu'
    return 'cu' + version.replace('.', '')

# CUDA version reported by this PyTorch build, and its 'cu...' tag.
CUDA_version = torch.version.cuda
CUDA = format_cuda_version(CUDA_version)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare expression so the notebook displays the selected device.
device

device(type='cuda')

# Load Data

In [12]:
# Read the MOF property table and the table that holds the SMILES strings.
df = pd.read_csv('data/test/test.csv')
smiles = pd.read_csv('data/test/smiles_test.csv')

# Attach the SMILES columns by MOFname, drop rows that have no SMILES,
# and renumber the remaining rows from 0.
data = (
    df.join(smiles.set_index('MOFname'), on='MOFname')
    .dropna(subset=['Smiles'])
    .reset_index(drop=True)
)

# Missing values per column, followed by (rows, columns).
print(data.isnull().sum())
print(data.shape)

MOFname                                          0
volume [A^3]                                     0
weight [u]                                       0
surface_area [m^2/g]                             0
void_fraction                                    0
void_volume [cm^3/g]                             0
functional_groups                                0
metal_linker                                     0
organic_linker1                                  0
organic_linker2                                  0
topology                                         0
CO2/N2_selectivity                               0
heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]    0
Smiles                                           0
dtype: int64
(17000, 14)


In [13]:
# Allowed values for each categorical atom feature. The featurisation loop
# encodes an atom by the position of its value inside the matching list.
x_map = dict(
    atomic_num=list(range(119)),
    chirality=[
        'CHI_UNSPECIFIED',
        'CHI_TETRAHEDRAL_CW',
        'CHI_TETRAHEDRAL_CCW',
        'CHI_OTHER',
    ],
    degree=list(range(11)),
    formal_charge=list(range(-5, 7)),
    num_hs=list(range(9)),
    num_radical_electrons=list(range(5)),
    hybridization=['UNSPECIFIED', 'S', 'SP', 'SP2', 'SP3', 'SP3D', 'SP3D2', 'OTHER'],
    is_aromatic=[False, True],
    is_in_ring=[False, True],
)

# Allowed values for each categorical bond feature. The featurisation loop
# encodes a bond by the position of its value inside the matching list.
e_map = dict(
    bond_type=['misc', 'SINGLE', 'DOUBLE', 'TRIPLE', 'AROMATIC'],
    stereo=[
        'STEREONONE',
        'STEREOZ',
        'STEREOE',
        'STEREOCIS',
        'STEREOTRANS',
        'STEREOANY',
    ],
    is_conjugated=[False, True],
)

In [14]:
def _atom_features(atom):
    """Encode one RDKit atom as a list of 9 category indices looked up in ``x_map``."""
    return [
        x_map['atomic_num'].index(atom.GetAtomicNum()),
        x_map['chirality'].index(str(atom.GetChiralTag())),
        x_map['degree'].index(atom.GetTotalDegree()),
        x_map['formal_charge'].index(atom.GetFormalCharge()),
        x_map['num_hs'].index(atom.GetTotalNumHs()),
        x_map['num_radical_electrons'].index(atom.GetNumRadicalElectrons()),
        x_map['hybridization'].index(str(atom.GetHybridization())),
        x_map['is_aromatic'].index(atom.GetIsAromatic()),
        x_map['is_in_ring'].index(atom.IsInRing()),
    ]


def _bond_features(bond):
    """Encode one RDKit bond as a list of 3 category indices looked up in ``e_map``."""
    bond_type = str(bond.GetBondType())
    # Bond types that are not listed (e.g. dative bonds) fall back to the
    # 'misc' slot instead of raising ValueError and aborting the whole run.
    if bond_type not in e_map['bond_type']:
        bond_type = 'misc'
    return [
        e_map['bond_type'].index(bond_type),
        e_map['stereo'].index(str(bond.GetStereo())),
        e_map['is_conjugated'].index(bond.GetIsConjugated()),
    ]


def _mol_to_graph(mol, smi):
    """Build a torch_geometric ``Data`` graph from a parsed RDKit molecule.

    Parameters
    ----------
    mol : RDKit molecule returned by ``Chem.MolFromSmiles``.
    smi : the SMILES string ``mol`` was parsed from; stored on the graph as ``smiles``.

    Returns
    -------
    Data with ``x`` (float, one row of 9 features per atom), ``edge_index``
    (long, 2 rows), ``edge_attr`` (long, 3 features per directed edge),
    ``smiles`` and ``num_nodes``.
    """
    # Node features.
    xs = [_atom_features(atom) for atom in mol.GetAtoms()]
    x = torch.tensor(xs, dtype=torch.float).view(-1, 9)

    # Edge features: every bond is stored once per direction with the same attributes.
    edge_indices, edge_attrs = [], []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        e = _bond_features(bond)
        edge_indices += [[i, j], [j, i]]
        edge_attrs += [e, e]

    # The .view calls keep the (2, 0) / (0, 3) shapes for molecules without bonds.
    edge_index = torch.tensor(edge_indices).t().to(torch.long).view(2, -1)
    edge_attr = torch.tensor(edge_attrs, dtype=torch.long).view(-1, 3)

    # Sort edges by (source atom, target atom).
    if edge_index.numel() > 0:
        perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
        edge_index, edge_attr = edge_index[:, perm], edge_attr[perm]

    graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, smiles=smi)
    graph.num_nodes = mol.GetNumAtoms()
    return graph


data_list = []  # one graph per row whose SMILES could be parsed
data_dict = []  # MOFname of each graph in data_list, in the same order
nan_idx = []    # index labels of rows whose SMILES RDKit could not parse
c = 1           # 1-based count of graphs built so far, used for progress output

# Iterate over just the two needed columns instead of building a Series per row.
for row_idx, mof_name, smi in zip(data.index, data['MOFname'], data['Smiles']):
    mol = Chem.MolFromSmiles(smi)

    # MolFromSmiles returns None when the SMILES cannot be parsed.
    if mol is None:
        nan_idx.append(row_idx)
        continue

    data_list.append(_mol_to_graph(mol, smi))
    data_dict.append(mof_name)

    if c % 1000 == 0:
        print('done:', c)
    c += 1

done: 1000
done: 2000
done: 3000
done: 4000
done: 5000
done: 6000
done: 7000
done: 8000
done: 9000
done: 10000
done: 11000
done: 12000
done: 13000
done: 14000
done: 15000
done: 16000


In [24]:
# Set to True to write the MOF names collected in data_dict to disk.
save_data_list = False
if save_data_list:
    # The with-block closes the file, so the pickle is flushed to disk
    # and the handle is released even if dump raises.
    with open('data/test/graph_test_MOFname.pkl', 'wb') as fh:
        pickle.dump(data_dict, fh)

In [55]:
import pickle

# NOTE(review): this rebinds data_list, so any graphs previously held under
# that name are replaced by the contents of the training pickle.
# pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/train/graph_train_2_loss.pkl', 'rb') as fh:
    data_list = pickle.load(fh)

In [56]:
# Show the shape of the `y` attribute on the first loaded graph
# (presumably the per-graph targets; confirm against the code that wrote the pickle).
data_list[0].y.shape

torch.Size([1, 2])