In [48]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from matminer.featurizers.structure import XRDPowderPattern
import plotly.express as px

In [11]:

def tsne_algo(X, perplexity, labels_true):
    
    from sklearn.manifold import TSNE
    n_components = 3
    Xtsne = TSNE(n_components, perplexity=perplexity).fit_transform(X)
    dftsne = pd.DataFrame(Xtsne)
    dftsne['labels'] = labels_true
    return dftsne


In [47]:
def kmeans_nmf(df):
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4)
    labels = kmeans.fit_predict(df)
    df['cluster'] = labels
    df['labels'] = labels_true
    return df

def kmeans_tsne(dftsne):
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4)
    tsne_labels = kmeans.fit_predict(dftsne[[0, 1, 2]])
    dftsne['cluster'] = tsne_labels
    dftsne.columns = ['x1','x2','x3','labels', 'cluster']
    return dftsne

In [23]:
def sort_clusterlabels(dftsne_kmeans, label_count):
    dftsne_times = dftsne_kmeans.groupby(["labels", "cluster"]).size().reset_index(name="Time")
    
    m_df = dftsne_times[dftsne_times['labels'].str.match('m')]
    po_df = dftsne_times[dftsne_times['labels'].str.match('p-o')]
    o_df = dftsne_times[dftsne_times['labels'].str.match('o')]
    t_df = dftsne_times[dftsne_times['labels'].str.match('t')]
    
    if(label_count == 5):
        unknown_df = dftsne_times[dftsne_times['labels'].str.match('unknown')]
        unk_newlabel = unknown_df.loc[unknown_df['Time'] == unknown_df.Time.max(), 'cluster'].values[0]
        dftsne_kmeans = dftsne_kmeans.replace({'unknown': unk_newlabel})
        
    m_newlabel = m_df.loc[m_df['Time'] == m_df.Time.max(), 'cluster'].values[0]
    po_newlabel = po_df.loc[po_df['Time'] == po_df.Time.max(),'cluster'].values[0] 
    o_newlabel = o_df.loc[o_df['Time'] == o_df.Time.max(), 'cluster'].values[0]
    t_newlabel = t_df.loc[t_df['Time'] == t_df.Time.max(), 'cluster'].values[0]
    
    dftsne_kmeans = dftsne_kmeans.replace({'m': m_newlabel,
                                           'p-o': po_newlabel,
                                           'o': o_newlabel,
                                           't': t_newlabel})
    return dftsne_kmeans


In [32]:
df = pd.read_pickle(r"C:\Python\Projects\crystal-phase-prediction"
                    r"\pkl_files\structure_df_hfo2_La_del.pkl")

df_labels = pd.read_pickle(r"C:\Python\Projects\crystal-phase-prediction"
                    r"\pkl_files\structure_labels_La_defects.pkl")

dotants = pd.read_pickle(r'C:\Python\Projects\crystal-phase-prediction'
                         r'\pkl_files\probe_data.pkl')
dot = dotants['dopant']
dot = np.unique(dot)
dot = dot.tolist()
dot.remove('Hf')
dot.remove('La')

labels_true = df_labels['labels_0_3']
#df = df.drop(columns=['labels_0_3'])


In [26]:
xrd = XRDPowderPattern(two_theta_range=(5, 65))
df = xrd.fit_featurize_dataframe(df, 'structure')

X = df.iloc[:, 2:]

HBox(children=(FloatProgress(value=0.0, description='XRDPowderPattern', max=339.0, style=ProgressStyle(descrip…




In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

array([[1.92805301e-004, 9.92476004e-003, 1.40308253e-001, ...,
        2.89870765e-001, 4.77042794e-002, 2.82679706e-004],
       [6.38533523e-006, 9.54339216e-004, 3.32179780e-002, ...,
        2.96798441e-001, 4.31046644e-002, 1.91156822e-004],
       [3.79782334e-038, 4.34227870e-030, 1.25795400e-022, ...,
        2.87201363e-001, 9.43116272e-002, 3.76286020e-002],
       ...,
       [2.79202068e-118, 1.43197608e-102, 2.39357677e-087, ...,
        4.02743422e-001, 2.60293383e-001, 8.90383557e-002],
       [2.04495696e-006, 2.17311198e-004, 5.13481154e-003, ...,
        3.58219299e-001, 2.04765852e-001, 1.57080522e-001],
       [8.58684073e-171, 2.04901444e-149, 8.26668576e-129, ...,
        2.75470424e-001, 4.12376443e-001, 3.16147283e-001]])

In [39]:
perplexity = list(range(1, 80, 1))  # perplexity values to test
labels_count = 4  # select number of lables

performance = []
for perplex in perplexity:
    dftsne = tsne_algo(X, perplex, labels_true)
    dftsne_kmeans = kmeans_tsne(dftsne)
    dftsne_kmeans = sort_clusterlabels(dftsne_kmeans, labels_count)
    acc = accuracy_score(dftsne_kmeans['labels'], dftsne_kmeans['cluster'])
    performance.append(acc)
    
perf_dic = dict(zip(perplexity, performance))
print('Best value of performance: '
      + str(max(perf_dic.values()))
      + ' Perplexity = '
      + str(max(perf_dic, key=perf_dic.get)))

print(perf_dic)

Best value of performance: 0.9911504424778761 Perplexity = 5
{1: 0.31268436578171094, 2: 0.30973451327433627, 3: 0.44837758112094395, 4: 0.6194690265486725, 5: 0.9911504424778761, 6: 0.5752212389380531, 7: 0.5073746312684366, 8: 0.5899705014749262, 9: 0.6755162241887905, 10: 0.56047197640118, 11: 0.6637168141592921, 12: 0.49557522123893805, 13: 0.6991150442477876, 14: 0.5929203539823009, 15: 0.6194690265486725, 16: 0.7256637168141593, 17: 0.7050147492625368, 18: 0.6371681415929203, 19: 0.7463126843657817, 20: 0.640117994100295, 21: 0.7463126843657817, 22: 0.6519174041297935, 23: 0.6814159292035398, 24: 0.6519174041297935, 25: 0.5545722713864307, 26: 0.7522123893805309, 27: 0.7168141592920354, 28: 0.6666666666666666, 29: 0.6342182890855457, 30: 0.7581120943952803, 31: 0.6755162241887905, 32: 0.6489675516224189, 33: 0.7109144542772862, 34: 0.6902654867256637, 35: 0.6991150442477876, 36: 0.6194690265486725, 37: 0.672566371681416, 38: 0.6017699115044248, 39: 0.584070796460177, 40: 0.702064

In [43]:
fig = px.scatter_3d(dftsne, x='x1', y='x2', z='x3', color=dftsne['labels'], 
                    title='TSNE 3D Ground Truth')
fig.show()

In [44]:
fig = px.scatter_3d(dftsne, x='x1', y='x2', z='x3', color=dftsne['cluster'], 
                    title='TSNE 3D Ground Truth')
fig.show()


In [45]:
# Hyperparameter Testing
def nmf_algo(X, comp):
    from sklearn.decomposition import NMF

    model = NMF(n_components=comp, init='random', random_state=0, max_iter = 8000)
    nmf_features_W = model.fit_transform(X)
    nmf_componentes_H = model.components_
    nmf_df = pd.DataFrame(nmf_componentes_H.T)
    W_df = pd.DataFrame(nmf_features_W) # weights represent abundence of phase at a given nominal composition
    return W_df

In [46]:
from sklearn.metrics import accuracy_score
components = list(range(3, 50 , 1))
performance = []
i = 1
for component in components: 
    W_df = nmf_algo(X, component)
    W_df_kmeans = kmeans_nmf(W_df)
    W_df_kmeans = sort_clusterlabels(W_df_kmeans, 4)
    acc = accuracy_score(W_df_kmeans['labels'], W_df_kmeans['cluster'])
    performance.append(acc)
    print("Round: " + str(i))
    i = i + 1
       
perf_dic = dict(zip(components, performance))
print('Best value of performance: ' + str(max(perf_dic.values())) + ' Components = ' + str(max(perf_dic, key=perf_dic.get)))

Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Round: 40
Round: 41
Round: 42
Round: 43
Round: 44
Round: 45
Round: 46
Round: 47
Best value of performance: 0.9587020648967551 Components = 6


In [75]:
components = 5
W_df = nmf_algo(X, components)
W_df_kmeans = kmeans_algo(W_df)

names = []
for i in range(1, components +1): 
    names.append( 'x' + str(i))

signs = ['cluster', 'labels'] 
W_df_kmeans.columns = names + signs
W_df_kmeans

Unnamed: 0,x1,x2,x3,x4,x5,cluster,labels
0,0.000000,0.810441,0.081738,0.334968,0.000000,3,t
1,0.000000,0.822409,0.092329,0.213510,0.000000,3,t
2,0.000000,0.861121,0.120833,0.215632,0.000000,3,t
3,0.047541,0.720460,0.217542,0.640203,0.000000,3,t
4,0.000000,0.608734,0.349591,0.830881,0.000000,3,t
...,...,...,...,...,...,...,...
334,0.000000,0.790675,0.118885,0.000000,0.000000,3,t
335,0.978925,0.000000,0.140243,0.111413,0.051271,1,m
336,0.106494,0.153028,0.918500,0.000000,0.000000,0,o
337,0.654386,0.185501,0.174821,0.081362,0.000000,1,p-o


In [76]:
fig = px.scatter_3d(
    W_df_kmeans, x='x1', y='x2', z='x3', color=W_df_kmeans['cluster'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='NMF 3D Prediction'
)
fig.show()

In [77]:
import plotly.express as px
fig = px.scatter_3d(
    W_df_kmeans, x='x1', y='x2', z='x3', color=W_df_kmeans['labels'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='NMF 3D Ground Truth'
)
fig.show()

In [78]:
W_df_kmeans = sort_clusterlabels(W_df_kmeans, 4)
acc = accuracy_score(W_df_kmeans['labels'], W_df_kmeans['cluster'])
print('Acc : ' + str(acc))

Acc : 0.9587020648967551
