In [1]:
import pyanitools as pyt
import numpy as np
import os

import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE

In [2]:
# Input directory holding the source HDF5 files and the path of the combined
# output file. Both can be overridden with environment variables so the
# notebook is not tied to one machine; the defaults are the original locations.
# os.path.join(..., '') guarantees the trailing separator that plain
# `h5dir + f` concatenation relies on.
h5dir = os.path.join(os.environ.get('QM9_H5DIR') or '/home/jsmith48/scratch/qm9/h5files/', '')
h5out = os.environ.get('QM9_H5OUT') or '/home/jsmith48/scratch/qm9/qm9_comb.h5'

In [3]:
# Collect the HDF5 files to combine. Match on the extension (a substring test
# would also accept names such as 'x.h5.bak') and sort the result, because
# os.listdir returns entries in arbitrary order.
files = sorted(f for f in os.listdir(h5dir) if f.endswith('.h5'))

In [4]:
# Show the HDF5 file names that were found in h5dir.
files

['qm9_data.h5']

In [5]:
def formula_key(species):
    """Build an isomer key such as 'C01H04' from a sequence of element symbols.

    Elements appear in sorted order (np.unique returns its values sorted), each
    followed by its occurrence count zero-padded to two digits.
    """
    unique, counts = np.unique(species, return_counts=True)
    return ''.join(str(u) + str(c).zfill(2) for u, c in zip(unique, counts))


# Group every entry of every input file by its chemical formula.
#   isomers      : formula key -> list of entries sharing that formula
#   total_groups : number of entries read in total
isomers = dict()
total_groups = 0
for i, f in enumerate(files):
    print('Working on file:', i, f)
    adl = pyt.anidataloader(os.path.join(h5dir, f))
    try:
        for data in adl:
            # Report entries that carry neither of the two energy arrays.
            if 'energies' not in data and 'extrapE' not in data:
                print(f, data.keys())

            total_groups += 1
            isomers.setdefault(formula_key(data['species']), []).append(data)
    finally:
        # Release the file handle even if reading fails.
        # NOTE(review): assumes the yielded entries are in-memory copies that
        # stay valid after the loader is closed -- confirm against pyanitools.
        adl.cleanup()

Working on file: 0 qm9_data.h5


In [6]:
# (Removed a stray debugging expression, `test`, which raised NameError and
# stopped "Restart & Run All" at this cell.)

In [8]:
# Summarise the grouping: entries read vs. distinct isomer groups, then the
# number of groups that hold fewer than five energies.
print(total_groups,'->',len(isomers))
iso_len = []
for key, entries in isomers.items():
    n_energies = 0
    for entry in entries:
        if 'energies' in entry:
            n_energies += entry['energies'].size
            continue
        # No 'energies' array: fall back to 'extrapE' when present and report
        # the entry's keys either way.
        if 'extrapE' in entry:
            n_energies += entry['extrapE'].size
        print(key,entry.keys())
    iso_len.append(n_energies)
print(int((np.array(iso_len) < 5).sum()))

353667 -> 13413
6535


In [7]:
# Largest group size that is stored without clustering. Kept at the original
# value of 500000.
# NOTE(review): the original comment and log message said "less than 5 data",
# which does not match this threshold -- confirm which one is intended.
NO_CLUSTER_MAX = 500000
# Candidate cluster counts tried by the silhouette search.
CLUSTER_RANGE = range(2, 100)
# Fixed seed so the KMeans labelling is reproducible between runs.
KMEANS_SEED = 0


def merge_isomer_group(entries):
    """Stack all usable entries of one isomer group into a single record.

    The atoms of every entry are reordered so that its species are sorted.
    Entries with empty coordinates are skipped silently; entries containing an
    empty species label are skipped and counted.

    Returns:
        (merged, tossed): `merged` is a dict with 'species', 'coordinates',
        'energies' and 'forces', or None when no entry was usable; `tossed` is
        the number of entries rejected for an empty species label.
    """
    coords, energies, forces = [], [], []
    species = None
    tossed = 0
    for entry in entries:
        if entry['coordinates'].size == 0:
            continue
        raw_species = entry['species']
        if '' in raw_species:
            tossed += 1
            print(len(raw_species), entry['coordinates'].shape[1],
                  len(raw_species) == entry['coordinates'].shape[1])
            continue

        order = np.argsort(raw_species)
        # Take the species only from an accepted entry, so a tossed or empty
        # entry at the end of the group cannot leak into the stored record.
        species = np.array(raw_species)[order]
        coords.append(entry['coordinates'][:, order, :])
        energies.append(entry['energies'] if 'energies' in entry else entry['extrapE'])
        # Forces are not read from the input; zeros with the shape of the
        # coordinates are stored as a placeholder.
        forces.append(0.0 * entry['coordinates'][:, order, :])

    if not energies:
        return None, tossed
    merged = {'species': species,
              'coordinates': np.vstack(coords),
              'energies': np.concatenate(energies),
              'forces': np.vstack(forces)}
    return merged, tossed


def interatomic_distances(coordinates):
    """Return one row per structure holding the distance of every atom pair i < j."""
    n_atoms = coordinates.shape[1]
    pairs = [(i, j) for i in range(n_atoms) for j in range(i + 1, n_atoms)]
    d = np.zeros((coordinates.shape[0], len(pairs)), dtype=np.float64)
    for col, (i, j) in enumerate(pairs):
        d[:, col] = np.linalg.norm(coordinates[:, i, :] - coordinates[:, j, :], axis=1)
    return d


def best_kmeans_labels(d):
    """Increase the cluster count until the silhouette score stops improving.

    Returns:
        (n_clusters, labels) of the best-scoring clustering. Tracking the best
        count explicitly keeps the result right even when every candidate in
        CLUSTER_RANGE improves the score and the loop ends without a break.
    """
    best_score = -1.0
    best_n = None
    best_labels = None
    for n_clusters in CLUSTER_RANGE:
        clusterer = KMeans(n_clusters=n_clusters, algorithm='elkan',
                           random_state=KMEANS_SEED)
        labels = clusterer.fit_predict(d)
        score = silhouette_score(d, labels, metric="manhattan")
        print("  For n_clusters =", n_clusters,
              "The average silhouette_score is :", score)
        if score <= best_score:
            break
        best_score, best_n, best_labels = score, n_clusters, labels
    if best_labels is None:
        # Not even the first candidate beat the initial score: one cluster.
        best_n, best_labels = 1, np.zeros(d.shape[0], dtype=int)
    return best_n, best_labels


# Merge each isomer group, optionally cluster it, and write it to h5out.
tdata_count = 0
cluster_count = 0
toss_count = 0
dpack = pyt.datapacker(h5out)
try:
    for cnt, (k, entries) in enumerate(isomers.items()):
        print('count:', cnt)
        merged, tossed = merge_isomer_group(entries)
        toss_count += tossed
        if merged is None:
            continue

        n_data = merged['energies'].size
        print('Working on isomers:', k, 'Total data:', n_data)

        if n_data <= NO_CLUSTER_MAX:
            print('  -Storing '+k+', no clustering (<= '+str(NO_CLUSTER_MAX)+' data). DATA:', n_data)
            # `int` replaces the removed `np.int` alias.
            cluster_mask = np.zeros(n_data, dtype=int)
            store_path = '/'+k+'/clust-'+str(0).zfill(3)
        else:
            d = interatomic_distances(merged['coordinates'])
            n_best, cluster_mask = best_kmeans_labels(d)
            print('  Converged to', n_best, 'clusters.')
            cluster_count += n_best
            # NOTE(review): this branch stores at '/<key>' while the branch
            # above uses '/<key>/clust-000'; kept as in the original -- confirm.
            store_path = '/'+k

        tdata_count += n_data
        dpack.store_data(store_path,
                         species=list(merged['species']),
                         coordinates=merged['coordinates'],
                         energies=merged['energies'],
                         forces=merged['forces'],
                         cluster_mask=cluster_mask)
finally:
    # Always close the output file, even if a group fails part-way through.
    dpack.cleanup()
print(toss_count, tdata_count)

count: 0
Working on isomers: C01H04 Total data: 1
  -Storing C01H04, no clustering, less than 5 data. DATA: 1
count: 1
Working on isomers: H03N01 Total data: 1
  -Storing H03N01, no clustering, less than 5 data. DATA: 1
count: 2
Working on isomers: H02O01 Total data: 1
  -Storing H02O01, no clustering, less than 5 data. DATA: 1
count: 3
Working on isomers: C02H02 Total data: 1
  -Storing C02H02, no clustering, less than 5 data. DATA: 1
count: 4
Working on isomers: C01H01N01 Total data: 1
  -Storing C01H01N01, no clustering, less than 5 data. DATA: 1
count: 5
Working on isomers: C01H02O01 Total data: 1
  -Storing C01H02O01, no clustering, less than 5 data. DATA: 1
count: 6
Working on isomers: C02H06 Total data: 1
  -Storing C02H06, no clustering, less than 5 data. DATA: 1
count: 7
Working on isomers: C01H04O01 Total data: 1
  -Storing C01H04O01, no clustering, less than 5 data. DATA: 1
count: 8
Working on isomers: C03H04 Total data: 1
  -Storing C03H04, no clustering, less than 5 data. 

count: 158
Working on isomers: C07H08 Total data: 28
  -Storing C07H08, no clustering, less than 5 data. DATA: 28
count: 159
Working on isomers: C06H07N01 Total data: 40
  -Storing C06H07N01, no clustering, less than 5 data. DATA: 40
count: 160
Working on isomers: C05H06N02 Total data: 34
  -Storing C05H06N02, no clustering, less than 5 data. DATA: 34
count: 161
Working on isomers: C04H05N03 Total data: 20
  -Storing C04H05N03, no clustering, less than 5 data. DATA: 20
count: 162
Working on isomers: C06H06O01 Total data: 37
  -Storing C06H06O01, no clustering, less than 5 data. DATA: 37
count: 163
Working on isomers: C06H05N01 Total data: 8
  -Storing C06H05N01, no clustering, less than 5 data. DATA: 8
count: 164
Working on isomers: C06H04O01 Total data: 12
  -Storing C06H04O01, no clustering, less than 5 data. DATA: 12
count: 165
Working on isomers: C05H04N02 Total data: 13
  -Storing C05H04N02, no clustering, less than 5 data. DATA: 13
count: 166
Working on isomers: C04H03N03 Total d

Working on isomers: C05H07N01O02 Total data: 878
  -Storing C05H07N01O02, no clustering, less than 5 data. DATA: 878
count: 226
Working on isomers: C05H10N02O01 Total data: 178
  -Storing C05H10N02O01, no clustering, less than 5 data. DATA: 178
count: 227
Working on isomers: C04H10N02O02 Total data: 12
  -Storing C04H10N02O02, no clustering, less than 5 data. DATA: 12
count: 228
Working on isomers: C04H09N01O03 Total data: 39
  -Storing C04H09N01O03, no clustering, less than 5 data. DATA: 39
count: 229
Working on isomers: C05H06N02O02 Total data: 1956
  -Storing C05H06N02O02, no clustering, less than 5 data. DATA: 1956
count: 230
Working on isomers: C03H04N04O01 Total data: 103
  -Storing C03H04N04O01, no clustering, less than 5 data. DATA: 103
count: 231
Working on isomers: C03H05N03O02 Total data: 74
  -Storing C03H05N03O02, no clustering, less than 5 data. DATA: 74
count: 232
Working on isomers: C03H07N03O02 Total data: 13
  -Storing C03H07N03O02, no clustering, less than 5 data. DA

count: 293
Working on isomers: C07H14O01 Total data: 334
  -Storing C07H14O01, no clustering, less than 5 data. DATA: 334
count: 294
Working on isomers: C06H12O02 Total data: 490
  -Storing C06H12O02, no clustering, less than 5 data. DATA: 490
count: 295
Working on isomers: C05H12N02O01 Total data: 13
  -Storing C05H12N02O01, no clustering, less than 5 data. DATA: 13
count: 296
Working on isomers: C08H18 Total data: 18
  -Storing C08H18, no clustering, less than 5 data. DATA: 18
count: 297
Working on isomers: C07H16O01 Total data: 72
  -Storing C07H16O01, no clustering, less than 5 data. DATA: 72
count: 298
Working on isomers: C06H14O02 Total data: 93
  -Storing C06H14O02, no clustering, less than 5 data. DATA: 93
count: 299
Working on isomers: C08H16 Total data: 73
  -Storing C08H16, no clustering, less than 5 data. DATA: 73
count: 300
Working on isomers: C08H14 Total data: 221
  -Storing C08H14, no clustering, less than 5 data. DATA: 221
count: 301
Working on isomers: C07H12O01 Total

Working on isomers: C07H11N01O01 Total data: 5859
  -Storing C07H11N01O01, no clustering, less than 5 data. DATA: 5859
count: 367
Working on isomers: C06H09N01O02 Total data: 5794
  -Storing C06H09N01O02, no clustering, less than 5 data. DATA: 5794
count: 368
Working on isomers: C06H10N02O01 Total data: 2782
  -Storing C06H10N02O01, no clustering, less than 5 data. DATA: 2782
count: 369
Working on isomers: C06H11N01O02 Total data: 2951
  -Storing C06H11N01O02, no clustering, less than 5 data. DATA: 2951
count: 370
Working on isomers: C05H11N01O03 Total data: 235
  -Storing C05H11N01O03, no clustering, less than 5 data. DATA: 235
count: 371
Working on isomers: C04H09N03O02 Total data: 87
  -Storing C04H09N03O02, no clustering, less than 5 data. DATA: 87
count: 372
Working on isomers: C05H09N01O03 Total data: 921
  -Storing C05H09N01O03, no clustering, less than 5 data. DATA: 921
count: 373
Working on isomers: C04H08N02O03 Total data: 95
  -Storing C04H08N02O03, no clustering, less than 

count: 446
Working on isomers: C07H08N02 Total data: 1192
  -Storing C07H08N02, no clustering, less than 5 data. DATA: 1192
count: 447
Working on isomers: C07H06O02 Total data: 589
  -Storing C07H06O02, no clustering, less than 5 data. DATA: 589
count: 448
Working on isomers: C06H07N03 Total data: 819
  -Storing C06H07N03, no clustering, less than 5 data. DATA: 819
count: 449
Working on isomers: C08H11N01 Total data: 1319
  -Storing C08H11N01, no clustering, less than 5 data. DATA: 1319
count: 450
Working on isomers: C08H10O01 Total data: 3182
  -Storing C08H10O01, no clustering, less than 5 data. DATA: 3182
count: 451
Working on isomers: C07H10N02 Total data: 1186
  -Storing C07H10N02, no clustering, less than 5 data. DATA: 1186
count: 452
Working on isomers: C06H09N03 Total data: 777
  -Storing C06H09N03, no clustering, less than 5 data. DATA: 777
count: 453
Working on isomers: C07H08O02 Total data: 3023
  -Storing C07H08O02, no clustering, less than 5 data. DATA: 3023
count: 454
Wor

Working on isomers: C06F01H06N01O01 Total data: 108
  -Storing C06F01H06N01O01, no clustering, less than 5 data. DATA: 108
count: 551
Working on isomers: C05F01H06N03 Total data: 128
  -Storing C05F01H06N03, no clustering, less than 5 data. DATA: 128
count: 552
Working on isomers: C06F01H05O02 Total data: 24
  -Storing C06F01H05O02, no clustering, less than 5 data. DATA: 24
count: 553
Working on isomers: C04F01H05N04 Total data: 84
  -Storing C04F01H05N04, no clustering, less than 5 data. DATA: 84
count: 554
Working on isomers: C04F01H03N02O02 Total data: 85
  -Storing C04F01H03N02O02, no clustering, less than 5 data. DATA: 85
count: 555
Working on isomers: C05F02H04N02 Total data: 37
  -Storing C05F02H04N02, no clustering, less than 5 data. DATA: 37
count: 556
Working on isomers: C05F02H03N01O01 Total data: 26
  -Storing C05F02H03N01O01, no clustering, less than 5 data. DATA: 26
count: 557
Working on isomers: C04F02H02N02O01 Total data: 29
  -Storing C04F02H02N02O01, no clustering, le

  -Storing C05F03H09O01, no clustering, less than 5 data. DATA: 21
count: 617
Working on isomers: C04F03H07N02 Total data: 5
  -Storing C04F03H07N02, no clustering, less than 5 data. DATA: 5
count: 618
Working on isomers: C05F03H10N01 Total data: 10
  -Storing C05F03H10N01, no clustering, less than 5 data. DATA: 10
count: 619
Working on isomers: C04F03H08N01O01 Total data: 6
  -Storing C04F03H08N01O01, no clustering, less than 5 data. DATA: 6
count: 620
Working on isomers: C04F03H07O02 Total data: 8
  -Storing C04F03H07O02, no clustering, less than 5 data. DATA: 8
0 133885


In [8]:
# Histogram of the number of data points per isomer group, with log-scaled
# counts. `nonpositive` is the current name of the former `nonposy` keyword.
fig, ax = plt.subplots()
ax.hist(iso_len, bins=100)
ax.set_yscale('log', nonpositive='clip')
ax.set_xlabel('data points per isomer group')
ax.set_ylabel('number of isomer groups')
plt.show()

NameError: name 'iso_len' is not defined

In [14]:
# Show the cluster total accumulated by the clustering branch of the storing loop.
cluster_count

0

In [9]:
# Read the combined file back and tally its energies and clusters as a check
# on what was written.
adl = pyt.anidataloader(h5out)
data_count = 0
true_cluster_count = 0
try:
    for data in adl:
        data_count += data['energies'].size
        # Largest label + 1 gives the number of clusters when labels start at 0.
        true_cluster_count += data['cluster_mask'].max()+1
finally:
    # Release the file handle even if an entry is malformed.
    adl.cleanup()

In [10]:
# Report the cluster total and the energy total read back from the combined file.
print(true_cluster_count,data_count)

621 133885


In [11]:
# Count the energies in the raw input files, for comparison with the total
# read back from the combined file.
files = sorted(f for f in os.listdir(h5dir) if f.endswith('.h5'))
total_data = 0
for i, f in enumerate(files):
    print('Working on file:', i, f)
    adl = pyt.anidataloader(os.path.join(h5dir, f))
    try:
        for data in adl:
            # Accept either energy array, as the grouping and storing cells do.
            if 'energies' in data:
                total_data += data['energies'].size
            elif 'extrapE' in data:
                total_data += data['extrapE'].size
    finally:
        adl.cleanup()

Working on file: 0 qm9_data.h5


In [12]:
# Show the number of energies counted in the raw input files.
total_data

133885