In [3]:
import pandas as pd
df = pd.read_csv('HW21.csv')
df.head()

Unnamed: 0,formula,H,He,Li,Be,B,C,N,O,F,...,Mt,Ds,Rg,Cn,Nh,Fl,Mc,Lv,Ts,Og
0,Si1 C1,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Sc3 Sn1 B1,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Na1 Ga1 Te2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Nb1 In1 S2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,La1 N1,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Data Loader

In [7]:
import pandas as pd
import torch
from time import time
from torch.utils.data import Dataset, DataLoader

class Data(Dataset):
  def __init__(self):
    df = pd.read_csv('HW21.csv')
    self.Xs = torch.Tensor(df.iloc[:,1:].to_numpy())

  def __len__(self):
    return self.Xs.shape[0]

  def __getitem__(self, idx):
    x = self.Xs[idx,:]
    return idx, x

# Loading data
print('loading data...',end=''); t = time()
data = Data()
dataloader = DataLoader(data, batch_size=64, shuffle=True)
print('completed', time()-t,'sec')

loading data...completed 0.22204065322875977 sec


### Model

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# adaopted from
# https://github.com/Zeleni9/pytorch-wgan/blob/master/models/wgan_gradient_penalty.py

class Generator(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.main_module = nn.Sequential(
            # Z latent vector 128
            nn.Linear(128,128),
            nn.BatchNorm1d(num_features=128),
            nn.SELU(),
            nn.Linear(128,128),
            nn.BatchNorm1d(num_features=128),
            nn.SELU(),
            nn.Linear(128,118),
            nn.Softmax())
    def forward(self, x):
        return  self.main_module(x)

class Discriminator(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.main_module = nn.Sequential(
            nn.Linear(118,128),
            nn.BatchNorm1d(num_features=128),
            nn.SELU(),
            nn.Linear(128,128),
            nn.BatchNorm1d(num_features=128),
            nn.SELU(),
            nn.Linear(128,1))

    def forward(self, x):
        return self.main_module(x)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
#build model
D = Discriminator().to(device)
G = Generator().to(device)


# Using Model

In [21]:
from time import time
import random
import numpy as np
import torch.optim as optim
from torch import autograd

def calculate_gradient_penalty(D, real_xs, fake_xs, lambdaa=10):

    eta = torch.FloatTensor(real_xs.shape[0],118).uniform_(0,1)
    eta = eta.to(device='cuda')

    interpolated = eta * real_xs + ((1 - eta) * fake_xs)

    # calculate probability of interpolated examples
    prob_interpolated = D(interpolated)

    # calculate gradients of probabilities with respect to examples
    gradients = autograd.grad(outputs=prob_interpolated, inputs=interpolated,
                            grad_outputs=torch.ones(prob_interpolated.size()).cuda(),
                            create_graph=True, retain_graph=True)[0]
    grad_penalty = ((gradients.norm(2, dim=1) - 1) ** 2).mean() * lambdaa
    return grad_penalty

def use_model(data_loader, D,G, criterion, Doptimizer, Goptimizer, i_iter,  name = None):
  #switch to model mode
  G.train()
  D.train()
  D_losses = []
  G_losses = []
  WasDs = []
  one = torch.tensor(1, dtype=torch.float)
  mone = one * -1
  if next(D.parameters()).is_cuda:
    one = one.to(device='cuda')
    mone = mone.to(device='cuda')
  for idxs, xs in data_loader: # loop for each batch
    ### training discriminator
    for p in D.parameters():
        p.requires_grad = True
    z = torch.randn(xs.shape[0], 128)
    # move input to cuda
    if next(D.parameters()).is_cuda:
      xs = xs.to(device='cuda')
      z = z.to(device='cuda')
    #compute output
    # real data loss
    Dreal_loss = D(xs)
    fake_xs = G(z)
    Dfake_loss = D(fake_xs)
    # real data back propagation
    Doptimizer.zero_grad()
    Dreal_loss = Dreal_loss.mean()
    Dreal_loss.backward(mone, retain_graph=True)
    Dfake_loss = Dfake_loss.mean()
    Dfake_loss.backward(one, retain_graph=True)
    gradient_penalty = calculate_gradient_penalty(D, xs, fake_xs)
    gradient_penalty.backward(retain_graph=True)
    Doptimizer.step()
    # recording loss
    d_loss = Dreal_loss - Dfake_loss + gradient_penalty
    WasD = Dreal_loss - Dfake_loss
    D_losses.append(float(d_loss))
    WasDs.append(float(WasD))
    
    ### training generator
    for p in D.parameters():
        p.requires_grad = False
    z = torch.randn(xs.shape[0], 128)
    # move input to cuda
    if next(D.parameters()).is_cuda:
      z = z.to(device='cuda')

    fake_xs = G(z)
    Dfake_loss = D(fake_xs)
    Goptimizer.zero_grad()
    Dfake_loss = Dfake_loss.mean()
    Dfake_loss.backward(mone)
    Goptimizer.step()
    g_loss = Dfake_loss
    G_losses.append(float(g_loss))

  return np.mean(D_losses),np.mean(G_losses),np.mean(WasDs)

# Model Training and Validation

In [10]:
from torch.utils.data import DataLoader


################################ Input ####################################
# data
TrainValTeSplitst = [1.0, 0.0, 0.0]

# Training
batch_size = 64
lr = 0.001
nepochs = 1000
cuda = True
seed = 1234
###########################################################################


## Training
Doptimizer = optim.Adam(D.parameters(),lr,weight_decay= 0) # 0 means no penalty
Goptimizer = optim.Adam(G.parameters(),lr,weight_decay= 0) # 0 means no penalty

best_train_loss = float('inf')
for i_iter in range(nepochs): # epochs
  Dloss, Gloss, WD = use_model(dataloader_train,D,G,None,Doptimizer,Goptimizer,i_iter) # training model
  print('Train loss [%03d]: %10.2e %10.2e %10.2e'%(i_iter, Dloss, Gloss, WD))
  torch.save(D.state_dict(),'DW.pth.tar') # we save the data
  torch.save(G.state_dict(),'GW.pth.tar') # we save the data
  
G.load_state_dict(torch.load('GW.pth.tar'))
G.eval()
z = torch.randn(64, 128)
# move input to cuda
if next(G.parameters()).is_cuda:
  z = z.to(device='cuda')
with torch.no_grad(): # it does not compute the gradient. so it's faster
  fake_xs = G(z)


NameError: name 'data_train' is not defined

In [23]:
z = torch.randn(64, 128)
# move input to cuda
if next(G.parameters()).is_cuda:
  z = z.to(device='cuda')
with torch.no_grad(): # it does not compute the gradient. so it's faster
  fake_xs = G(z)

In [30]:
import numpy as np
chemical_symbols = [
    # 0
    'X',
    # 1
    'H', 'He',
    # 2
    'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    # 3
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
    # 4
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    # 5
    'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
    'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    # 6
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy',
    'Ho', 'Er', 'Tm', 'Yb', 'Lu',
    'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi',
    'Po', 'At', 'Rn',
    # 7
    'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk',
    'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
    'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc',
    'Lv', 'Ts', 'Og']
xs = np.array(fake_xs.tolist())
xs[xs <0.01] = 0
for j in range(xs.shape[0]):
  nonzero = np.where(xs[j,:])[0]
  s = ''
  for i in nonzero:
    s += chemical_symbols[i-1] + '%.2f'%xs[j,i] +' '
  print(s)


Re1.00 
C0.02 Re0.98 
Re1.00 
Re0.99 
C0.07 Re0.93 
Re1.00 
Re1.00 
Re1.00 
C0.05 Re0.95 
Re1.00 
Re1.00 
C0.07 Re0.93 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
C0.02 Re0.98 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re0.99 
Re0.99 
Re1.00 
C0.03 Re0.97 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
C0.01 Re0.99 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
C0.01 Re0.99 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
Re1.00 
C0.09 Re0.91 
Re1.00 
Re1.00 
Re1.00 
Re1.00 


# Making Data
For your information, the code below was used to make the data

In [None]:
from pymatgen.ext.matproj import MPRester
import numpy as np
from ase.data import chemical_symbols
from pymatgen.core import Composition
import csv

with MPRester("gv5swR0lIWsfLsGk") as m:
    docs = m.query({},{'pretty_formula':True,'icsd_ids':True})
    raw_data = [(doc['pretty_formula'],doc['icsd_ids']) for doc in docs]

new_data = {}
for f,v in raw_data:
    if len(v) != 0:
        c = Composition(f)
        new_data[str(c)] = dict(c.fractional_composition)

sym_map = {}
for i,s in enumerate(chemical_symbols):
    sym_map[s] = i -1

data = np.zeros((len(new_data),len(chemical_symbols)-1))
names = []
for i,(name,d) in enumerate(new_data.items()):
    names.append(name)
    for k,v in d.items():
        data[i,sym_map[str(k)]] = v

data_to_copy = [['formula']+chemical_symbols[1:]]
for n, r in zip(names,data):
    data_to_copy.append([n]+r.tolist())

with open('data.csv','w',newline='') as f:
    writer = csv.writer(f)
    for l in data_to_copy:
        writer.writerow(l)