In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from matminer.featurizers.structure import XRDPowderPattern
import plotly.express as px

In [1]:

def tsne_algo(X, perplexity, labels_true):
    
    from sklearn.manifold import TSNE
    n_components = 3
    Xtsne = TSNE(n_components, perplexity=perplexity).fit_transform(X)
    dftsne = pd.DataFrame(Xtsne)
    dftsne['labels'] = labels_true
    return dftsne


In [36]:
def kmeans_algo(dftsne):
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4)
    tsne_labels = kmeans.fit_predict(dftsne[[0, 1, 2]])
    dftsne['cluster'] = tsne_labels
    dftsne.columns = ['x1', 'x2', 'x3', 'labels', 'cluster']
    return dftsne


In [3]:
def sort_clusterlabels(dftsne_kmeans, label_count):
    dftsne_times = dftsne_kmeans.groupby(["labels", "cluster"]).size().reset_index(name="Time")
    
    m_df = dftsne_times[dftsne_times['labels'].str.match('m')]
    po_df = dftsne_times[dftsne_times['labels'].str.match('p-o')]
    o_df = dftsne_times[dftsne_times['labels'].str.match('o')]
    t_df = dftsne_times[dftsne_times['labels'].str.match('t')]
    
    if(label_count == 5):
        unknown_df = dftsne_times[dftsne_times['labels'].str.match('unknown')]
        unk_newlabel = unknown_df.loc[unknown_df['Time'] == unknown_df.Time.max(), 'cluster'].values[0]
        dftsne_kmeans = dftsne_kmeans.replace({'unknown': unk_newlabel})
        
    m_newlabel = m_df.loc[m_df['Time'] == m_df.Time.max(), 'cluster'].values[0]
    po_newlabel = po_df.loc[po_df['Time'] == po_df.Time.max(),'cluster'].values[0] 
    o_newlabel = o_df.loc[o_df['Time'] == o_df.Time.max(), 'cluster'].values[0]
    t_newlabel = t_df.loc[t_df['Time'] == t_df.Time.max(), 'cluster'].values[0]
    
    dftsne_kmeans = dftsne_kmeans.replace({'m': m_newlabel,
                                           'p-o': po_newlabel,
                                           'o': o_newlabel,
                                           't': t_newlabel})
    return dftsne_kmeans


In [30]:
df = pd.read_pickle(r"C:\Python\Projects\crystal-phase-prediction"
                    r"\pkl_files\structure_df_hfo2_La_del_ang.pkl")

df_labels = pd.read_pickle(r"C:\Python\Projects\crystal-phase-prediction"
                    r"\pkl_files\structure_labels_La_defects.pkl")

dotants = pd.read_pickle(r'C:\Python\Projects\crystal-phase-prediction'
                         r'\pkl_files\probe_data.pkl')
dot = dotants['dopant']
dot = np.unique(dot)
dot = dot.tolist()
dot.remove('Hf')
dot.remove('La')

labels_true = df_labels['labels_0_3']
#df = df.drop(columns=['labels_0_3'])


In [31]:
xrd = XRDPowderPattern(two_theta_range=(5, 65))
df = xrd.fit_featurize_dataframe(df, 'structure')

X = df.iloc[:, 2:]

HBox(children=(FloatProgress(value=0.0, description='XRDPowderPattern', max=339.0, style=ProgressStyle(descrip…




In [32]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [37]:
perplexity = list(range(1, 80, 1))  # perplexity values to test
labels_count = 4  # select number of lables

performance = []
for perplex in perplexity:
    dftsne = tsne_algo(X, perplex, labels_true)
    dftsne_kmeans = kmeans_algo(dftsne)
    dftsne_kmeans = sort_clusterlabels(dftsne_kmeans, labels_count)
    acc = accuracy_score(dftsne_kmeans['labels'], dftsne_kmeans['cluster'])
    performance.append(acc)
    
perf_dic = dict(zip(perplexity, performance))
print('Best value of performance: '
      + str(max(perf_dic.values()))
      + ' Perplexity = '
      + str(max(perf_dic, key=perf_dic.get)))

print(perf_dic)

Best value of performance: 0.7905604719764012 Perplexity = 24
{1: 0.31563421828908556, 2: 0.32448377581120946, 3: 0.41887905604719766, 4: 0.44837758112094395, 5: 0.5073746312684366, 6: 0.504424778761062, 7: 0.6696165191740413, 8: 0.5486725663716814, 9: 0.5781710914454278, 10: 0.616519174041298, 11: 0.5486725663716814, 12: 0.6460176991150443, 13: 0.5722713864306784, 14: 0.5811209439528023, 15: 0.6519174041297935, 16: 0.6607669616519174, 17: 0.640117994100295, 18: 0.6460176991150443, 19: 0.6489675516224189, 20: 0.6873156342182891, 21: 0.6843657817109144, 22: 0.6342182890855457, 23: 0.7050147492625368, 24: 0.7905604719764012, 25: 0.7109144542772862, 26: 0.6519174041297935, 27: 0.5870206489675516, 28: 0.696165191740413, 29: 0.7138643067846607, 30: 0.6460176991150443, 31: 0.5870206489675516, 32: 0.584070796460177, 33: 0.6696165191740413, 34: 0.5132743362831859, 35: 0.6873156342182891, 36: 0.6371681415929203, 37: 0.6342182890855457, 38: 0.7227138643067846, 39: 0.6047197640117994, 40: 0.67846

In [40]:
fig = px.scatter_3d(dftsne, x='x1', y='x2', z='x3', color=dftsne['labels'], 
                    title='TSNE 3D Ground Truth')
fig.show()


In [41]:
fig = px.scatter_3d(dftsne, x='x1', y='x2', z='x3', color=dftsne['cluster'], 
                    title='TSNE 3D Ground Truth')
fig.show()
