In [42]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from matminer.featurizers.structure import XRDPowderPattern
import plotly.express as px

In [97]:

def tsne_algo(X, perplexity, labels_true, random_state=None):
    """Embed ``X`` in three dimensions with t-SNE and attach the true labels.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``TSNE.fit_transform``.
    perplexity : float
        t-SNE perplexity.
    labels_true : sequence
        Ground-truth label of every row of ``X``, in row order.
    random_state : int or None, optional
        Seed forwarded to ``TSNE``. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer to make the embedding repeatable.

    Returns
    -------
    pandas.DataFrame
        Columns ``0``, ``1`` and ``2`` hold the embedding and ``'labels'``
        holds ``labels_true``.
    """
    from sklearn.manifold import TSNE

    n_components = 3
    embedding = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        random_state=random_state,
    ).fit_transform(X)
    dftsne = pd.DataFrame(embedding)
    # Assign by position: the embedding frame has a fresh 0..n-1 index, so an
    # index-aligned assignment of a Series with another index would give NaN.
    dftsne['labels'] = np.asarray(labels_true)
    return dftsne


In [110]:
def kmeans_nmf(df, n_clusters, labels=None, random_state=None):
    """Cluster NMF weights with k-means and attach cluster ids and true labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to cluster (one row per sample). It is not modified.
    n_clusters : int
        Number of k-means clusters.
    labels : sequence, optional
        Ground-truth label of every row of ``df``, in row order. When omitted
        the notebook-level ``labels_true`` variable is used, as before.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``'cluster'`` column followed by a ``'labels'``
        column appended.
    """
    from sklearn.cluster import KMeans

    if labels is None:
        # Backward-compatible fallback for existing two-argument calls.
        labels = labels_true

    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    # Work on a copy so the caller's frame keeps only its feature columns and
    # the cell calling this function can be re-run safely.
    clustered = df.copy()
    clustered['cluster'] = model.fit_predict(df)
    clustered['labels'] = np.asarray(labels)
    return clustered

def kmeans_tsne(dftsne, n_clusters, random_state=None):
    """Cluster a 3-D t-SNE embedding with k-means.

    Parameters
    ----------
    dftsne : pandas.DataFrame
        Frame with embedding columns ``0``, ``1``, ``2`` and a ``'labels'``
        column, as returned by ``tsne_algo``. It is not modified.
    n_clusters : int
        Number of k-means clusters.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. The default ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dftsne`` with the embedding columns renamed to ``'x1'``,
        ``'x2'``, ``'x3'`` and a ``'cluster'`` column appended.
    """
    from sklearn.cluster import KMeans

    embedding_cols = [0, 1, 2]
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    # Copy first: renaming the caller's frame in place made a second call on
    # the same frame fail because columns 0, 1, 2 no longer existed.
    clustered = dftsne.copy()
    clustered['cluster'] = model.fit_predict(dftsne[embedding_cols])
    # Rename by mapping instead of overwriting all column names by position.
    return clustered.rename(columns={0: 'x1', 1: 'x2', 2: 'x3'})

In [45]:
# Hyperparameter Testing
def nmf_algo(X, comp):
    """Factorize ``X`` with NMF and return the sample weights.

    Parameters
    ----------
    X : array-like
        Non-negative feature matrix handed to ``NMF.fit_transform``.
    comp : int
        Number of NMF components.

    Returns
    -------
    pandas.DataFrame
        The matrix returned by ``fit_transform``, one column per component.
    """
    from sklearn.decomposition import NMF

    model = NMF(n_components=comp, init='random', random_state=0, max_iter=8000)
    # Weights represent abundance of phase at a given nominal composition.
    return pd.DataFrame(model.fit_transform(X))

In [105]:
def sort_clusterlabels(dftsne_kmeans, label_count):
    """Replace every true label by the id of the cluster it falls in most.

    For each distinct value of the ``'labels'`` column, the cluster holding
    most rows with that label is looked up (the lowest cluster id wins a tie)
    and the label is replaced by that cluster id. The labels are taken from
    the data, so any label set works. Several labels may map to the same
    cluster; the mapping is not forced to be one-to-one.

    Parameters
    ----------
    dftsne_kmeans : pandas.DataFrame
        Frame with a ``'labels'`` and a ``'cluster'`` column. It is not
        modified.
    label_count : int
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of the input whose ``'labels'`` column holds cluster ids. All
        other columns are left untouched.
    """
    counts = (
        dftsne_kmeans.groupby(["labels", "cluster"])
        .size()
        .reset_index(name="Time")
    )
    relabelled = dftsne_kmeans.copy()
    if counts.empty:
        return relabelled

    # idxmax returns the first row holding the maximum, and the rows are
    # sorted by (labels, cluster), so ties resolve to the lowest cluster id.
    winners = counts.loc[counts.groupby("labels")["Time"].idxmax()]
    label_to_cluster = dict(zip(winners["labels"], winners["cluster"]))

    # Map only the label column; a frame-wide replace would also rewrite
    # matching strings in any other column.
    relabelled["labels"] = relabelled["labels"].map(label_to_cluster)
    return relabelled


In [231]:
# Load the labelled structures from a pickle file. The path is absolute and
# machine-specific, so it must be adapted before running elsewhere.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(r'C:\Python\Projects\crystal-phase-prediction\pkl_files\structure_labels_La_del_defects_m.pkl')
# Display the loaded frame.
df

Unnamed: 0,structure,name,labels
0,"[[ 1.43364347 -5.05458306 1.7612222 ] Hf, [ 1...",Hf14La2O32_sc122_No14_1.cif,2La
1,"[[-3.63321709 0.12925241 -9.6815268 ] La, [-1...",Hf14La2O32_sc122_No14_10.cif,2La
2,"[[-3.73360452 0.13446417 -9.63545356] La, [-1...",Hf14La2O32_sc122_No14_11.cif,2La
3,"[[-3.64748426 0.22497602 -4.26243513] Hf, [-3...",Hf14La2O32_sc122_No14_2.cif,2La
4,"[[1.41818387 0.19806804 7.03567622] Hf, [1.419...",Hf14La2O32_sc122_No14_3.cif,2La
5,"[[ -3.75517322 -10.38359532 0.91453015] La, ...",Hf14La2O32_sc122_No14_4.cif,2La
6,"[[ -3.68506745 -10.33083145 0.71110961] La, ...",Hf14La2O32_sc122_No14_5.cif,2La
7,"[[ 1.31045349 10.37168009 1.65995274] La, [3....",Hf14La2O32_sc122_No14_6.cif,2La
8,"[[ -3.74627537 -10.31057474 0.93832042] La, ...",Hf14La2O32_sc122_No14_7.cif,2La
9,"[[-3.73828673 0.08740746 -9.76297697] La, [-1...",Hf14La2O32_sc122_No14_8.cif,2La


In [232]:
# Featurize the 'structure' column with matminer's XRDPowderPattern over the
# two-theta range (5, 180). The returned frame replaces df.
xrd = XRDPowderPattern(two_theta_range=(5, 180))
df = xrd.fit_featurize_dataframe(df, 'structure')

HBox(children=(FloatProgress(value=0.0, description='XRDPowderPattern', max=109.0, style=ProgressStyle(descrip…




In [205]:
# Feature matrix: every column from position 3 onward. Presumably the first
# three columns are structure, name and labels, as in the displayed frame.
X = df.iloc[:, 3:]

In [206]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the features with a default MinMaxScaler. X is replaced by the
# array returned from fit_transform, so this cell expects a freshly sliced X.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [207]:
# Ground-truth labels. kmeans_nmf reads this name as a notebook-level global,
# so it has to be defined before that function is called.
labels_true = df['labels']

In [208]:
# Sweep the t-SNE perplexity and score how well k-means on the embedding
# recovers the true labels.
perplexity = list(range(1, 80))  # perplexity values to test
labels_count = 4  # number of labels, also used as the k-means cluster count

performance = []
for perplex in perplexity:
    dftsne = tsne_algo(X, perplex, labels_true)
    # Use labels_count instead of a second hard-coded 4 so the cluster count
    # follows the setting above.
    dftsne_kmeans = kmeans_tsne(dftsne, labels_count)
    dftsne_kmeans = sort_clusterlabels(dftsne_kmeans, labels_count)
    acc = accuracy_score(dftsne_kmeans['labels'], dftsne_kmeans['cluster'])
    performance.append(acc)

perf_dic = dict(zip(perplexity, performance))
# First perplexity reaching the highest accuracy.
best_perplexity = max(perf_dic, key=perf_dic.get)
print('Best value of performance: '
      + str(perf_dic[best_perplexity])
      + ' Perplexity = '
      + str(best_perplexity))

print(perf_dic)

Best value of performance: 0.9724770642201835 Perplexity = 3
{1: 0.5963302752293578, 2: 0.9357798165137615, 3: 0.9724770642201835, 4: 0.9174311926605505, 5: 0.4954128440366973, 6: 0.5412844036697247, 7: 0.926605504587156, 8: 0.5321100917431193, 9: 0.5137614678899083, 10: 0.9357798165137615, 11: 0.5229357798165137, 12: 0.42201834862385323, 13: 0.5596330275229358, 14: 0.5504587155963303, 15: 0.46788990825688076, 16: 0.44036697247706424, 17: 0.48623853211009177, 18: 0.4954128440366973, 19: 0.48623853211009177, 20: 0.41284403669724773, 21: 0.5412844036697247, 22: 0.47706422018348627, 23: 0.47706422018348627, 24: 0.47706422018348627, 25: 0.47706422018348627, 26: 0.43119266055045874, 27: 0.4954128440366973, 28: 0.4954128440366973, 29: 0.47706422018348627, 30: 0.43119266055045874, 31: 0.46788990825688076, 32: 0.5137614678899083, 33: 0.5045871559633027, 34: 0.46788990825688076, 35: 0.41284403669724773, 36: 0.44036697247706424, 37: 0.44036697247706424, 38: 0.5137614678899083, 39: 0.440366972477

In [209]:
# Recompute the embedding at perplexity 3, the best value printed by the
# sweep, and cluster it into 4 groups.
perplex = 3
dftsne = tsne_algo(X, perplex, labels_true)
dftsne_kmeans = kmeans_tsne(dftsne, 4)

In [210]:
# 3D scatter of the t-SNE embedding, coloured by the true labels.
fig = px.scatter_3d(dftsne_kmeans, x='x1', y='x2', z='x3', color=dftsne_kmeans['labels'], 
                    title='TSNE 3D Ground Truth')
fig.show()

In [211]:
# 3D scatter of the t-SNE embedding, coloured by the k-means cluster. The
# title names the prediction so it is not confused with the ground-truth plot.
fig = px.scatter_3d(dftsne_kmeans, x='x1', y='x2', z='x3', color=dftsne_kmeans['cluster'],
                    title='TSNE 3D Prediction Kmeans')
fig.show()


In [212]:
from sklearn.metrics import accuracy_score

# Score k-means on the NMF weights for every component count from 3 to 49.
components = list(range(3, 50))
performance = []
i = 1
for component in components:
    W_df = nmf_algo(X, component)
    W_df_kmeans = sort_clusterlabels(kmeans_nmf(W_df, 4), 4)
    acc = accuracy_score(W_df_kmeans['labels'], W_df_kmeans['cluster'])
    performance.append(acc)
    print(f"Round: {i}")
    i += 1

perf_dic = dict(zip(components, performance))
# First component count reaching the highest accuracy.
best_components = max(perf_dic, key=perf_dic.get)
print(f'Best value of performance: {max(perf_dic.values())} Components = {best_components}')

Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Round: 40
Round: 41
Round: 42
Round: 43
Round: 44
Round: 45
Round: 46
Round: 47
Best value of performance: 0.7889908256880734 Components = 8


In [213]:
# NMF with 8 components, followed by k-means into 4 clusters.
components = 8
W_df = nmf_algo(X, components)
W_df_kmeans = kmeans_nmf(W_df, 4)

# Name the weight columns x1..xN; the last two columns are the ones appended
# by kmeans_nmf.
names = ['x' + str(idx) for idx in range(1, components + 1)]
signs = ['cluster']
labels = ['labels']
W_df_kmeans.columns = names + signs + labels
W_df_kmeans

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,cluster,labels
0,0.112139,0.0,0.0,0.290932,0.790982,0.0,0.511665,0.439341,1,2La
1,0.292618,0.0,0.320682,0.241507,0.689472,0.0,0.311171,0.372162,1,2La
2,0.332021,0.0,0.133367,0.322447,0.752083,0.0,0.251649,0.353208,1,2La
3,0.12316,0.230713,0.0,0.572436,0.517846,0.089142,0.292823,0.284681,1,2La
4,0.218736,0.186588,0.179975,0.57365,0.726364,0.099663,0.017251,0.097061,1,2La
5,0.093375,0.0,0.126259,0.611408,0.539627,0.013477,0.387526,0.307441,1,2La
6,0.206099,0.0,0.200192,0.258141,0.791867,0.03602,0.184055,0.483717,1,2La
7,0.394947,0.069888,0.279714,0.651559,0.384785,0.097441,0.0,0.228496,1,2La
8,0.402574,0.0,0.073289,0.585785,0.633579,0.114335,0.0,0.245619,1,2La
9,0.325835,0.685025,0.544804,0.0,0.772205,0.022111,0.0,0.090403,1,2La


In [214]:
# 3D scatter of the first three NMF weight columns, coloured by k-means cluster.
# NOTE(review): the axes are labelled 'PC n' although they are NMF components; confirm the wording.
fig = px.scatter_3d(
    W_df_kmeans, x='x1', y='x2', z='x3', color=W_df_kmeans['cluster'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='NMF 3D Prediction'
)
fig.show()

In [215]:
# plotly.express is already imported in the first cell; this import is redundant.
import plotly.express as px
# 3D scatter of the first three NMF weight columns, coloured by the true labels.
fig = px.scatter_3d(
    W_df_kmeans, x='x1', y='x2', z='x3', color=W_df_kmeans['labels'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='NMF 3D Ground Truth'
)
fig.show()

In [None]:
# Translate the true labels into cluster ids, then report the share of rows
# whose translated label equals the assigned cluster.
W_df_kmeans = sort_clusterlabels(W_df_kmeans, 4)
acc = accuracy_score(W_df_kmeans['labels'], W_df_kmeans['cluster'])
print(f'Acc : {acc}')

In [56]:
# Copy the NMF k-means cluster ids onto df (pandas aligns the assignment on
# the index), then display the frame.
df['cluster'] = W_df_kmeans['cluster']
df

Unnamed: 0,structure,name,cluster
0,"[[ 1.43364347 -5.05458306 1.7612222 ] Hf, [ 1...",Hf14La2O32_sc122_No14_1.cif,1
1,"[[-3.63321709 0.12925241 -9.6815268 ] La, [-1...",Hf14La2O32_sc122_No14_10.cif,1
2,"[[-3.73360452 0.13446417 -9.63545356] La, [-1...",Hf14La2O32_sc122_No14_11.cif,1
3,"[[-3.64748426 0.22497602 -4.26243513] Hf, [-3...",Hf14La2O32_sc122_No14_2.cif,1
4,"[[1.41818387 0.19806804 7.03567622] Hf, [1.419...",Hf14La2O32_sc122_No14_3.cif,1
5,"[[ -3.75517322 -10.38359532 0.91453015] La, ...",Hf14La2O32_sc122_No14_4.cif,1
6,"[[ -3.68506745 -10.33083145 0.71110961] La, ...",Hf14La2O32_sc122_No14_5.cif,1
7,"[[ 1.31045349 10.37168009 1.65995274] La, [3....",Hf14La2O32_sc122_No14_6.cif,1
8,"[[ -3.74627537 -10.31057474 0.93832042] La, ...",Hf14La2O32_sc122_No14_7.cif,1
9,"[[-3.73828673 0.08740746 -9.76297697] La, [-1...",Hf14La2O32_sc122_No14_8.cif,1


In [216]:
# Reload the labelled structures for the PRDF run; this overwrites df.
# NOTE(review): the file name ('..._La_odel_...') differs from the one loaded
# for the XRD run ('..._La_del_...'); confirm this is intended.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(r"C:/Python/Projects/crystal-phase-prediction/pkl_files/structure_labels_La_odel_defects_m.pkl")
# Display the loaded frame.
df

Unnamed: 0,structure,name,labels
0,"[[ 1.43364347 -5.05458306 1.7612222 ] Hf, [ 1...",Hf14La2O32_sc122_No14_1.cif,2La
1,"[[-3.63321709 0.12925241 -9.6815268 ] La, [-1...",Hf14La2O32_sc122_No14_10.cif,2La
2,"[[-3.73360452 0.13446417 -9.63545356] La, [-1...",Hf14La2O32_sc122_No14_11.cif,2La
3,"[[-3.64748426 0.22497602 -4.26243513] Hf, [-3...",Hf14La2O32_sc122_No14_2.cif,2La
4,"[[1.41818387 0.19806804 7.03567622] Hf, [1.419...",Hf14La2O32_sc122_No14_3.cif,2La
5,"[[ -3.75517322 -10.38359532 0.91453015] La, ...",Hf14La2O32_sc122_No14_4.cif,2La
6,"[[ -3.68506745 -10.33083145 0.71110961] La, ...",Hf14La2O32_sc122_No14_5.cif,2La
7,"[[ 1.31045349 10.37168009 1.65995274] La, [3....",Hf14La2O32_sc122_No14_6.cif,2La
8,"[[ -3.74627537 -10.31057474 0.93832042] La, ...",Hf14La2O32_sc122_No14_7.cif,2La
9,"[[-3.73828673 0.08740746 -9.76297697] La, [-1...",Hf14La2O32_sc122_No14_8.cif,2La


In [217]:
# PRDF Test
from matminer.featurizers.structure import PartialRadialDistributionFunction

prdf = PartialRadialDistributionFunction(cutoff = 15, bin_size = 0.5, include_elems=['O', 'La'], exclude_elems=['Hf'])
df = prdf.fit_featurize_dataframe(df, 'structure')
df

HBox(children=(FloatProgress(value=0.0, description='PartialRadialDistributionFunction', max=109.0, style=Prog…




Unnamed: 0,structure,name,labels,La-La PRDF r=0.00-0.50,La-La PRDF r=0.50-1.00,La-La PRDF r=1.00-1.50,La-La PRDF r=1.50-2.00,La-La PRDF r=2.00-2.50,La-La PRDF r=2.50-3.00,La-La PRDF r=3.00-3.50,...,O-O PRDF r=10.00-10.50,O-O PRDF r=10.50-11.00,O-O PRDF r=11.00-11.50,O-O PRDF r=11.50-12.00,O-O PRDF r=12.00-12.50,O-O PRDF r=12.50-13.00,O-O PRDF r=13.00-13.50,O-O PRDF r=13.50-14.00,O-O PRDF r=14.00-14.50,O-O PRDF r=14.50-15.00
0,"[[ 1.43364347 -5.05458306 1.7612222 ] Hf, [ 1...",Hf14La2O32_sc122_No14_1.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.067398,0.054218,0.05485,0.057342,0.058457,0.056899,0.061751,0.056606,0.053585,0.059157
1,"[[-3.63321709 0.12925241 -9.6815268 ] La, [-1...",Hf14La2O32_sc122_No14_10.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.015038,...,0.061624,0.05921,0.055322,0.056333,0.056402,0.057634,0.062261,0.058236,0.052556,0.057923
2,"[[-3.73360452 0.13446417 -9.63545356] La, [-1...",Hf14La2O32_sc122_No14_11.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.062381,0.05723,0.058465,0.055541,0.055408,0.057389,0.062034,0.057658,0.053438,0.057329
3,"[[-3.64748426 0.22497602 -4.26243513] Hf, [-3...",Hf14La2O32_sc122_No14_2.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.062854,0.056112,0.058151,0.054316,0.059252,0.057511,0.062884,0.055869,0.053781,0.058517
4,"[[1.41818387 0.19806804 7.03567622] Hf, [1.419...",Hf14La2O32_sc122_No14_3.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06115,0.057489,0.058622,0.054893,0.056071,0.057022,0.062204,0.056921,0.055054,0.05678
5,"[[ -3.75517322 -10.38359532 0.91453015] La, ...",Hf14La2O32_sc122_No14_4.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.066262,0.055767,0.055479,0.055181,0.059252,0.057634,0.060448,0.057605,0.054173,0.058014
6,"[[ -3.68506745 -10.33083145 0.71110961] La, ...",Hf14La2O32_sc122_No14_5.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06134,0.058521,0.057679,0.055541,0.056535,0.057266,0.060391,0.057132,0.055642,0.056872
7,"[[ 1.31045349 10.37168009 1.65995274] La, [3....",Hf14La2O32_sc122_No14_6.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.060488,0.055853,0.060351,0.055325,0.057264,0.055859,0.063111,0.056921,0.054614,0.056323
8,"[[ -3.74627537 -10.31057474 0.93832042] La, ...",Hf14La2O32_sc122_No14_7.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.061718,0.055767,0.058779,0.057342,0.054944,0.056777,0.062714,0.057237,0.054614,0.056231
9,"[[-3.73828673 0.08740746 -9.76297697] La, [-1...",Hf14La2O32_sc122_No14_8.cif,2La,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.059541,0.054993,0.064752,0.051939,0.050636,0.061304,0.059881,0.055869,0.055103,0.056231


In [218]:
from sklearn.preprocessing import MinMaxScaler

# Feature matrix for the PRDF run: every column from position 3 onward.
X = df.iloc[:, 3:]
# Refresh the ground truth from the frame that was just reloaded, so labels
# and features come from the same rows instead of from the earlier XRD frame.
labels_true = df['labels']

# Rescale the features with a default MinMaxScaler.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [227]:
# t-SNE on the PRDF features at perplexity 5. labels_true must already be
# defined by an earlier cell.
perplex = 5
dftsne = tsne_algo(X, perplex, labels_true)
# This run uses 3 clusters, whereas the XRD t-SNE run used 4.
dftsne_kmeans = kmeans_tsne(dftsne, 3)

In [228]:
import plotly.express as px
dftsne_kmeans3d = dftsne_kmeans[['x1', 'x2', 'x3' ,'cluster']]
fig = px.scatter_3d(
    dftsne_kmeans3d, x='x1', y='x2', z='x3', color=dftsne_kmeans3d['cluster'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='TSNE 3D Prediction Kmeans'
)
fig.show()

In [229]:
import plotly.express as px
dftsne_kmeans3d = dftsne_kmeans[['x1', 'x2', 'x3' ,'labels']]
fig = px.scatter_3d(
    dftsne_kmeans3d, x='x1', y='x2', z='x3', color=dftsne_kmeans3d['labels'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='TSNE 3D Ground Truth'
)
fig.show()

In [224]:
# NMF with 12 components, followed by k-means into 4 clusters.
components = 12
W_df = nmf_algo(X, components)
W_df_kmeans = kmeans_nmf(W_df, 4)

# Name the weight columns x1..xN; the last two columns are the ones appended
# by kmeans_nmf.
names = ['x' + str(idx) for idx in range(1, components + 1)]
signs = ['cluster']
labels = ['labels']
W_df_kmeans.columns = names + signs + labels
W_df_kmeans

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,cluster,labels
0,0.194903,0.050811,0.210274,0.0,0.0,0.0,0.578452,0.593877,0.010058,0.0,0.169446,1.129948,3,2La
1,0.477806,0.361143,0.129147,0.0,0.0,0.49326,0.280041,0.102561,0.06097,0.067725,0.441869,0.188973,3,2La
2,0.392736,0.0,0.115871,0.039719,0.176398,0.307254,0.00415,0.083382,0.06779,0.0,0.438221,0.577714,3,2La
3,0.231157,0.581233,0.005665,0.0,0.0,0.018389,0.365113,0.0,0.114935,0.002439,0.0,1.211896,3,2La
4,0.403356,0.0,0.156813,0.056244,0.111738,0.0,0.0616,0.086124,0.0,0.097803,0.0,1.25387,3,2La
5,0.187923,0.0,0.443962,0.009457,0.100217,0.360197,0.626302,0.7288,0.066589,0.0,0.0,0.104704,1,2La
6,0.29863,0.592529,0.14887,0.041512,0.0,0.344505,0.178911,0.963403,0.081129,0.092714,0.0,0.229613,1,2La
7,0.45954,0.020978,0.0,0.008368,0.0,0.712349,0.271908,0.0,0.108336,0.145772,0.157567,0.234921,3,2La
8,0.293883,0.0,0.106777,0.151117,0.0,0.0,0.439967,0.282243,0.001361,0.134173,0.0,0.858617,3,2La
9,0.084846,0.0,0.014374,0.354654,0.044183,0.081874,0.343285,0.107073,0.099434,0.182041,0.021263,0.0,3,2La


In [225]:
import plotly.express as px
fig = px.scatter_3d(
    W_df_kmeans, x='x1', y='x2', z='x3', color=W_df_kmeans['cluster'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='NMF 3D Prediction'
)
fig.show()

In [226]:

# 3D scatter of the first three NMF weight columns, coloured by the true labels.
fig = px.scatter_3d(
    W_df_kmeans, x='x1', y='x2', z='x3', color=W_df_kmeans['labels'],
    labels={'x1': 'PC 1', 'x2': 'PC 2', 'x3': 'PC 3'}, title='NMF 3D Ground Truth'
)
fig.show()

In [201]:
from sklearn.metrics import accuracy_score

# Score k-means on the NMF weights for every component count from 3 to 49.
components = list(range(3, 50))
performance = []
i = 1
for component in components:
    W_df = nmf_algo(X, component)
    W_df_kmeans = sort_clusterlabels(kmeans_nmf(W_df, 4), 4)
    acc = accuracy_score(W_df_kmeans['labels'], W_df_kmeans['cluster'])
    performance.append(acc)
    print(f"Round: {i}")
    i += 1

perf_dic = dict(zip(components, performance))
# First component count reaching the highest accuracy.
best_components = max(perf_dic, key=perf_dic.get)
print(f'Best value of performance: {max(perf_dic.values())} Components = {best_components}')

Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Round: 40
Round: 41
Round: 42
Round: 43
Round: 44
Round: 45
Round: 46
Round: 47
Best value of performance: 0.7798165137614679 Components = 17
