In [79]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import itertools

import os
import pickle

import random

from sklearn.preprocessing import MinMaxScaler

import time
import joblib

import argparse
import textwrap

import util.plot as pt

if tf.config.list_physical_devices('GPU'):
    device_name = tf.test.gpu_device_name()
else:
    device_name = '\CPU:0'
    
print(f'device name {device_name}')

device name /device:GPU:0


In [80]:
name = 'data/Fits_4H_dm_phi.npz'

In [81]:
#prepare transformed Helix Fit Parameters for GAN from FitTransform

rr = np.load(f'{name}', allow_pickle=True)
X_train_hfp, y_train_hfp, featnames_hfp = [rr[f] for f in rr.files]

#remove phi1
X_train_hfp = X_train_hfp[:,:-4]

In [82]:
#just confirming that the y_trains indices are still equal
# for i,c in enumerate(y_train_hfp):
#     if c != y_train[i]:
#         print(c)  

In [83]:
clusterLoad = "Clustering/data/refData.npz"
with np.load(f'{clusterLoad}', allow_pickle=True) as rr:
    y_, y_train, X_train, featNames = [rr[f] for f in rr.files]
    
cluster_labels = np.unique(y_)
n_clusters = cluster_labels.shape[0]

# nameList = []

# for x in clusNums:
#     self.nameList.extend(self.y_train[np.where(self.y_==x)])
for x in cluster_labels:
    print(x, y_[y_==x].shape[0])

0 2345
1 472
2 536
3 2105
4 955
5 1625
6 1454
7 1190
8 4160
9 321
10 1495
11 700
12 708
13 187
14 1024
15 531
16 373
17 1162
18 761
19 2155
20 992
21 503
22 696
23 663
24 289
25 475


In [84]:
X_train = X_train_hfp[y_!=7]
y_train = y_train_hfp[y_!=7]

mm = MinMaxScaler(feature_range=(-1,1))
X_train = mm.fit_transform(X_train)

In [85]:
def make_generator_network(num_hidden_layers = 3,num_hidden_units = 64,num_output_units =28):
    
    model = tf.keras.Sequential()
    for i in range(num_hidden_layers):
        model.add(tf.keras.layers.Dense(units=num_hidden_units, use_bias=False))
        model.add(tf.keras.layers.LeakyReLU())
        
    model.add(tf.keras.layers.Dense(units=num_output_units,activation='tanh'))
    
    return model

def make_discriminator_network(num_hidden_layers=3, num_hidden_units=64, num_output_units =1,drop=0.1):
    
    model = tf.keras.Sequential()
    for i in range(num_hidden_layers):
        model.add(tf.keras.layers.Dense(units=num_hidden_units))
        model.add(tf.keras.layers.LeakyReLU())
        model.add(tf.keras.layers.Dropout(rate=drop))
        
    model.add(tf.keras.layers.Dense(units=num_output_units,activation=None))
    
    return model

def saveNetwork_Scaler(genMod,disMod,scaler,name,direc='data/'):
    
    joblib.dump(scaler, f'{direc}{name}_mm.gz')
    genMod.save(f'{direc}{name}.h5',overwrite=True,include_optimizer=True, save_format='h5')
    disMod.save(f'{direc}{name}disc.h5',overwrite=True,include_optimizer=True, save_format='h5')
    
def saveTrainLoss(loss_in,dvals_in,name,direc='data/'):
    
    data = [loss_in,dvals_in]

    with open(f'{direc}{name}.pkl','wb') as f:
        pickle.dump(data, f)   
        
def loadTrainLoss(name,direc='data/'):
 
    with open(f'{direc}{name}.pkl','rb') as f:
        data = pickle.load(f)
    
    all_losses = data[0]
    dval_losses = data[1]
    
    return all_losses, dval_losses

def init_model(X_train, y_train, batch=64 ,zIn=12, g_layers=3, g_size=64, d_layers =3, d_size=64,
               zmode='uniform',discDrop=0.1, normalization=None):

    z_size = zIn
    mode_z = zmode
    gen_hidden_layers= g_layers
    gen_hidden_size = g_size
    disc_hidden_layers = d_layers
    disc_hidden_size = d_size
    batch_size = batch

    h4_feat =X_train.shape[1]

    ds_train = tf.data.Dataset.from_tensor_slices((tf.cast(X_train, tf.float32), y_train))
    ds_train = ds_train.shuffle(X_train.shape[0]) 
    ds_train = ds_train.batch(batch_size, drop_remainder=True)

    with tf.device(device_name):
        gen_model = make_generator_network(num_hidden_layers=gen_hidden_layers,
                                   num_hidden_units = gen_hidden_size,
                                   num_output_units = h4_feat)
        gen_model.build(input_shape=(None,z_size))
        disc_model = make_discriminator_network(num_hidden_layers=disc_hidden_layers,
                                       num_hidden_units =disc_hidden_size,drop=discDrop)
        disc_model.build(input_shape=(None, h4_feat))

    return  gen_model, disc_model, ds_train
    



In [88]:
trainSettings = {'batch':4,
            'input_size':12,
            'input_dist':'uniform',
            'gen_layers':3,
            'gen_size':64,
            'disc_layers':3,
            'disc_size':64,
            'disc_dropOut':0.1,
            'Adam_gen_rate':.0001,
            'Adam_disc_rate':.0001,
            'CheckPointDirec':'checkpoints',
            'CheckPointsKeep':10,
            'epochs':300}

saveLoss=True
nameOut= 'ClusterRemoval_Test'
print(f'X_train shape: {X_train.shape}')
    
nameRun = nameOut
#base_folder = time.strftime('log/%y%b%d_%I%M%p', time.localtime())
base_folder = 'log/GAN'
base_folder = f'{base_folder}{nameRun}/'
if not os.path.exists(base_folder):
    os.makedirs(base_folder)
subfolders = ['checkpoints','loss']
for subfolder in subfolders:
    if not os.path.exists(base_folder + subfolder):
        os.makedirs(base_folder + subfolder)

settings = trainSettings
    
@tf.function   
def train_step(input_z,input_real):
    ##Compute generator's loss
    with tf.GradientTape() as g_tape:
        g_output = gen_model(input_z)
        d_logits_fake = disc_model(g_output, training=True)
        labels_real = tf.ones_like(d_logits_fake)
        g_loss = loss_fn(y_true=labels_real, y_pred = d_logits_fake)

    g_grads = g_tape.gradient(g_loss, gen_model.trainable_variables)

    ##Optimization: Apply the gradients    
    g_optimizer.apply_gradients(grads_and_vars=zip(g_grads,gen_model.trainable_variables))

    ##Compute the discriminators loss
    with tf.GradientTape() as d_tape:
        d_logits_real = disc_model(input_real, training=True)
        d_labels_real = tf.ones_like(d_logits_real)

        d_loss_real = loss_fn(y_true=d_labels_real, y_pred = d_logits_real)

        d_logits_fake = disc_model(g_output, training=True)
        d_labels_fake = tf.zeros_like(d_logits_fake)

        d_loss_fake = loss_fn(y_true=d_labels_fake, y_pred=d_logits_fake)
        d_loss = d_loss_real + d_loss_fake

    ##compute the gradients of d_loss
    d_grads = d_tape.gradient(d_loss, disc_model.trainable_variables)

    ## Optimization : Apply the gradients
    d_optimizer.apply_gradients(grads_and_vars=zip(d_grads,disc_model.trainable_variables))
    d_probs_real = tf.reduce_mean(tf.sigmoid(d_logits_real))
    d_probs_fake = tf.reduce_mean(tf.sigmoid(d_logits_fake))

    return g_loss, d_loss, d_loss_real, d_loss_fake, d_probs_real, d_probs_fake



def train(ds_train, epochs, manager, batch_size, z_size):

    checkpoint.restore(manager.latest_checkpoint)
    if manager.latest_checkpoint:
        print("Restored from {}".format(manager.latest_checkpoint))
    else:
        print("Initializing from scratch.")

    start_time = time.time()
    for epoch in range(epochs):

        epoch_losses, epoch_d_vals = [],[]

        for i, (input_real,name) in enumerate(ds_train):
            input_z = tf.random.uniform(shape=(batch_size, z_size), minval=-1, maxval=1)
            g_loss, d_loss, d_loss_real, d_loss_fake, d_probs_real, d_probs_fake = train_step(input_z,input_real)

            epoch_losses.append(( g_loss.numpy(), d_loss.numpy(), d_loss_real.numpy(), d_loss_fake.numpy()))
            epoch_d_vals.append((d_probs_real.numpy(), d_probs_fake.numpy()))


        checkpoint.step.assign_add(1)
        if (epoch + 1) % 25 == 0:
            save_path = manager.save()
            print("Saved checkpoint for step {}: {}".format(int(checkpoint.step), save_path))


        all_losses.append(epoch_losses)
        all_d_vals.append(epoch_d_vals)

        track = f'Epoch {epoch:03d} |  ET {(time.time()-start_time)/60:.2f} min AvgLosses >> G/D '
        track = f'{track}{(np.mean(all_losses[-1][0],axis=0)):.3f}/{(np.mean(all_losses[-1][1],axis=0)):.3f}'
        track = f'{track} D Real :{(np.mean(all_losses[-1][2],axis=0)):.3f}'
        track = f'{track} D Fake :{(np.mean(all_losses[-1][3],axis=0)):.3f}'

        print(track)




X_train shape: (26687, 28)


In [89]:
gen_model, disc_model, ds_train = init_model(X_train ,y_train, batch=settings['batch'], zIn=settings['input_size'], 
                                             g_layers = settings['gen_layers'], g_size=settings['gen_size'], 
                                             d_layers = settings['disc_layers'],d_size=settings['disc_size'],
                                             zmode=settings['input_dist'], discDrop=settings['disc_dropOut'])

loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
g_optimizer = tf.keras.optimizers.Adam(settings['Adam_gen_rate'])
d_optimizer = tf.keras.optimizers.Adam(settings['Adam_disc_rate'])

checkpoint = tf.train.Checkpoint(generator_optimizer=g_optimizer,discriminator_optimizer=d_optimizer,
                             generator=gen_model,discriminator=disc_model,step=tf.Variable(1))

cpD = settings['CheckPointDirec']
checkpointDirec = f'{base_folder}{cpD}'

manager = tf.train.CheckpointManager(checkpoint,checkpointDirec, max_to_keep=settings['CheckPointsKeep'])
epoch_losses = []
all_losses = []
all_d_vals = []
train(ds_train,settings['epochs'],manager,settings['batch'],settings['input_size'])

outName = f'{nameRun[1:]}'
saveNetwork_Scaler(gen_model, disc_model, mm, outName, direc=f'{base_folder}')

if saveLoss:
    name = f'loss_{nameRun[1:]}'
    saveTrainLoss(all_losses,all_d_vals,name,direc=f'{base_folder}/loss/')

Initializing from scratch.
Epoch 000 |  ET 0.23 min AvgLosses >> G/D 0.815/0.831 D Real :0.808 D Fake :0.813
Epoch 001 |  ET 0.44 min AvgLosses >> G/D 0.850/0.854 D Real :0.818 D Fake :0.816
Epoch 002 |  ET 0.66 min AvgLosses >> G/D 0.929/0.907 D Real :0.867 D Fake :0.901
Epoch 003 |  ET 0.88 min AvgLosses >> G/D 0.919/0.945 D Real :0.869 D Fake :0.831
Epoch 004 |  ET 1.10 min AvgLosses >> G/D 0.888/0.898 D Real :0.798 D Fake :0.867
Epoch 005 |  ET 1.32 min AvgLosses >> G/D 0.833/0.859 D Real :0.883 D Fake :0.761
Epoch 006 |  ET 1.54 min AvgLosses >> G/D 0.897/0.810 D Real :0.933 D Fake :0.882
Epoch 007 |  ET 1.75 min AvgLosses >> G/D 0.802/0.870 D Real :0.937 D Fake :0.899
Epoch 008 |  ET 1.97 min AvgLosses >> G/D 0.808/0.763 D Real :0.714 D Fake :0.890
Epoch 009 |  ET 2.18 min AvgLosses >> G/D 0.960/0.849 D Real :1.032 D Fake :0.798
Epoch 010 |  ET 2.40 min AvgLosses >> G/D 0.889/0.800 D Real :0.796 D Fake :0.885
Epoch 011 |  ET 2.61 min AvgLosses >> G/D 0.810/0.866 D Real :0.794 D F

Epoch 097 |  ET 22.31 min AvgLosses >> G/D 0.747/0.911 D Real :0.822 D Fake :0.753
Epoch 098 |  ET 22.53 min AvgLosses >> G/D 0.856/0.638 D Real :1.013 D Fake :0.951
Saved checkpoint for step 101: log/GANClusterRemoval_Test/checkpoints\ckpt-4
Epoch 099 |  ET 22.75 min AvgLosses >> G/D 0.801/0.830 D Real :0.718 D Fake :0.904
Epoch 100 |  ET 22.97 min AvgLosses >> G/D 0.718/0.821 D Real :0.722 D Fake :0.839
Epoch 101 |  ET 23.19 min AvgLosses >> G/D 0.866/0.978 D Real :0.782 D Fake :1.306
Epoch 102 |  ET 23.41 min AvgLosses >> G/D 0.830/0.821 D Real :0.917 D Fake :0.799
Epoch 103 |  ET 23.63 min AvgLosses >> G/D 0.907/0.980 D Real :1.062 D Fake :0.904
Epoch 104 |  ET 23.85 min AvgLosses >> G/D 0.628/0.748 D Real :0.916 D Fake :0.890
Epoch 105 |  ET 24.07 min AvgLosses >> G/D 0.731/0.729 D Real :1.003 D Fake :0.828
Epoch 106 |  ET 24.30 min AvgLosses >> G/D 0.900/1.252 D Real :0.825 D Fake :0.680
Epoch 107 |  ET 24.51 min AvgLosses >> G/D 1.072/0.726 D Real :0.690 D Fake :1.004
Epoch 108 

Epoch 192 |  ET 43.69 min AvgLosses >> G/D 0.751/0.762 D Real :0.842 D Fake :0.807
Epoch 193 |  ET 43.92 min AvgLosses >> G/D 1.091/0.959 D Real :0.715 D Fake :1.071
Epoch 194 |  ET 44.14 min AvgLosses >> G/D 0.725/0.980 D Real :1.014 D Fake :0.655
Epoch 195 |  ET 44.37 min AvgLosses >> G/D 0.847/0.970 D Real :0.884 D Fake :0.848
Epoch 196 |  ET 44.59 min AvgLosses >> G/D 0.688/0.694 D Real :0.620 D Fake :0.750
Epoch 197 |  ET 44.82 min AvgLosses >> G/D 0.844/0.787 D Real :0.939 D Fake :0.754
Epoch 198 |  ET 45.05 min AvgLosses >> G/D 0.943/0.772 D Real :0.846 D Fake :1.001
Saved checkpoint for step 201: log/GANClusterRemoval_Test/checkpoints\ckpt-8
Epoch 199 |  ET 45.28 min AvgLosses >> G/D 0.771/1.252 D Real :0.832 D Fake :0.657
Epoch 200 |  ET 45.51 min AvgLosses >> G/D 0.973/0.812 D Real :0.870 D Fake :0.861
Epoch 201 |  ET 45.74 min AvgLosses >> G/D 0.694/0.737 D Real :0.804 D Fake :0.887
Epoch 202 |  ET 45.97 min AvgLosses >> G/D 0.683/0.949 D Real :0.763 D Fake :0.840
Epoch 203 

Epoch 287 |  ET 65.03 min AvgLosses >> G/D 0.961/0.939 D Real :1.152 D Fake :0.842
Epoch 288 |  ET 65.25 min AvgLosses >> G/D 0.716/0.764 D Real :1.011 D Fake :0.643
Epoch 289 |  ET 65.48 min AvgLosses >> G/D 0.817/0.631 D Real :0.895 D Fake :0.931
Epoch 290 |  ET 65.70 min AvgLosses >> G/D 0.761/0.818 D Real :1.289 D Fake :0.719
Epoch 291 |  ET 65.92 min AvgLosses >> G/D 0.796/0.797 D Real :0.827 D Fake :0.671
Epoch 292 |  ET 66.14 min AvgLosses >> G/D 1.043/1.115 D Real :1.036 D Fake :0.669
Epoch 293 |  ET 66.36 min AvgLosses >> G/D 0.766/0.674 D Real :0.770 D Fake :0.729
Epoch 294 |  ET 66.58 min AvgLosses >> G/D 1.083/0.785 D Real :0.890 D Fake :0.734
Epoch 295 |  ET 66.81 min AvgLosses >> G/D 0.735/0.967 D Real :0.942 D Fake :0.848
Epoch 296 |  ET 67.03 min AvgLosses >> G/D 0.732/0.760 D Real :0.934 D Fake :0.850
Epoch 297 |  ET 67.27 min AvgLosses >> G/D 0.941/0.883 D Real :0.838 D Fake :0.915
Epoch 298 |  ET 67.50 min AvgLosses >> G/D 0.789/1.150 D Real :0.791 D Fake :0.803
Save

In [33]:
import LoopEndpoints as le
import GenerateEndpoints as ge

In [90]:
gen_name = 'log/GANClusterRemoval_Test/lusterRemoval_Test'

In [92]:
le.bb_analyze('log/GANClusterRemoval_Test/lusterRemoval_Test',batch=32,z=12,analysisOnly=True,
               outDirec='output/', print_output=True)

Structures Generation Attempts: 32
MSE for recon is 0.10 Angstroms
Elapsed time: 0.76
0.02s per structure
No Clash Structures: 8
Two Atoms or less Clash Structures: 23
Clashed Atoms Mean: 1.81 +/- 1.26
Percent Core: 0.13 +/- 0.06
 
 
cycle: 0
first helix: #1
first loop: #9
second helix: #0
fail
first helix: #1
first loop: #24
second helix: #1
fail2
first helix: #1
first loop: #10
second helix: #0
fail
first helix: #1
first loop: #17
second helix: #0
fail
first helix: #1
first loop: #20
second helix: #1
fail2
first helix: #1
first loop: #24
second helix: #0
fail
first helix: #1
first loop: #25
second helix: #0
fail
first helix: #1
first loop: #26
second helix: #0
fail
first helix: #1
first loop: #19
second helix: #0
fail
first helix: #1
first loop: #25
second helix: #3
3 helix: #11
4 helix: #35
Num_Structs9x:    21
first helix: #1
first loop: #18
second helix: #0
fail
first helix: #1
first loop: #23
second helix: #12
3 helix: #9
4 helix: reduced to #20
4 helix: #20
Num_Structs11x:    15

In [38]:
br = ge.BatchRecon(name=gen_name)
br.generate(12,batch_size=32)
endpoint_list = br.MDS_reconstruct_()

In [76]:
bf = ge.BatchRecon(name='data/BestGenerator')

In [78]:
bf.generate(12)

array([[16.316128 , 16.554947 , 16.334831 , 18.457684 , 16.125488 ,
        19.339027 , 21.109901 ,  9.20863  , 26.600903 , 27.636366 ,
        12.6324415, 13.341254 , 23.701595 , 21.998808 , 21.299803 ,
        14.397279 ,  8.573537 , 15.080368 , 12.149821 , 27.943943 ,
        26.60883  , 18.371593 , 24.672335 , 21.73369  , 11.703793 ,
        10.7223015, 22.248615 , 15.062401 ],
       [18.086597 , 19.779205 , 15.041035 ,  9.039525 , 24.438576 ,
        22.060396 , 11.169869 ,  6.098048 , 26.44907  , 22.686506 ,
        17.181595 , 11.940178 , 18.924473 , 24.58757  , 22.94601  ,
        14.365549 , 10.581507 , 21.758982 , 10.608971 , 26.232204 ,
        27.723244 , 22.20776  , 23.262676 , 22.763758 , 11.990326 ,
         8.342196 , 20.643728 , 17.093775 ],
       [20.920176 , 18.241219 , 14.887065 , 18.552011 , 22.265125 ,
        15.801541 ,  9.991272 , 10.577459 , 29.845781 , 26.438082 ,
        16.89434  , 13.016779 , 25.473948 , 22.265509 , 17.461037 ,
        19.032284 , 14.674

In [69]:
p1 = endpoint_list[0][3]
p2 =endpoint_list[0][4]

In [70]:
np.linalg.norm(p2-p1)

0.09320673341737556

In [74]:
br.to_npose()

ZeroDivisionError: division by zero

In [72]:
br.g_output

array([[-0.45388815, -0.39153424, -0.5654259 , -0.29602832,  0.10598585,
        -0.34222355, -0.49578336,  0.2950252 , -0.03136964,  0.27067   ,
        -0.14823505, -0.40482584,  0.21652366, -0.06708521, -0.1749794 ,
        -0.61948335, -0.43025467,  0.1311989 ,  0.3221391 ,  0.5648815 ,
         0.3677734 , -0.10408167,  0.37132198,  0.35714194, -0.3694754 ,
         0.05001154, -0.05818842, -0.08190887],
       [ 0.5215131 ,  0.533925  , -0.67488575, -0.05551006, -0.00966968,
         0.559437  ,  0.8130951 , -0.45586276, -0.44811082, -0.20689699,
        -0.22735685, -0.42638406, -0.23950315, -0.25444758, -0.2390385 ,
         0.09110764, -0.17467953, -0.33338562,  0.05443709, -0.3207543 ,
         0.07227753,  0.19207914, -0.14749111, -0.17264353, -0.59601164,
        -0.00590914,  0.3115075 ,  0.02066375],
       [ 0.11670186, -0.23603064, -0.31616   , -0.20768537, -0.36390665,
        -0.11516646,  0.42604345,  0.0313697 ,  0.24065383,  0.4387496 ,
        -0.06719489, -0.2592