In [None]:
#%matplotlib notebook

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [13, 8]
import pathlib
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.cluster import OPTICS
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from matminer.featurizers.structure import XRDPowderPattern

from algorithms import pca_algo, kmeans_algo, nmf_algo, tsne_algo
from val_sort import sort_clusterlabels, name_change, plot2d, plot3d, hyperparameter_testing

In [6]:
# Location of the pickled descriptor table.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
pathfile = r'C:\Python\Projects\crystal-phase-prediction\data\pkl_files\descriptors\df_xrd_hfo2_del_5_65.pkl'
# pathlib.Path selects the concrete path flavour for the running OS; instantiating
# pathlib.WindowsPath directly raises on non-Windows systems. On Windows the
# resulting object is the same WindowsPath as before.
path = pathlib.Path(pathfile)
# SECURITY: unpickling can execute arbitrary code — only load pickles from a trusted source.
df = pd.read_pickle(path)
df

Unnamed: 0,structure,name
0,"[[1.38867986 2.3864784 4.36482979] Hf, [3.587...",Ag_HfO2_cat_3.125_222_m.cif
1,"[[1.40181386 2.3944349 4.3885007 ] Hf, [3.606...",Ag_HfO2_cat_3.125_222_o.cif
2,"[[-8.57055872 -7.98390112 -6.2853237 ] Hf, [-6...",Ag_HfO2_cat_3.125_222_p-o.cif
3,"[[-8.73105937 2.58337834 3.5977214 ] Hf, [-6...",Ag_HfO2_cat_3.125_222_t.cif
4,"[[1.37820795 2.39009891 4.37039843] Hf, [3.622...",Ag_HfO2_cat_6.25_122_m.cif
...,...,...
1720,"[[ 1.26076304 -5.11876687 1.26075093] Hf, [ 1...",Zr_HfO2_inter_6.25_212_t.cif
1721,"[[ 1.2262443 -10.12402918 -3.43749088] Hf, ...",Zr_HfO2_inter_6.25_221_m.cif
1722,"[[1.39997305 0.20795941 1.85144277] Hf, [1.451...",Zr_HfO2_inter_6.25_221_o.cif
1723,"[[-8.97250742 -0.20890362 1.50328354] Hf, [-8...",Zr_HfO2_inter_6.25_221_p-o.cif


In [None]:
# Ground-truth labels: read the pickled object, take its `labels_0_4` attribute
# and keep only the underlying values for the later comparisons.
labels_true = pd.read_pickle(
    r'C:\Python\Projects\crystal-phase-prediction\data\data_labels\labels_hfo2.pkl'
).labels_0_4.values


In [None]:
# Feature matrix: every column of `df` from position 2 onwards.
# NOTE(review): the frame preview printed above lists only 'structure' and 'name';
# if those are the only columns, this slice has no columns at all — confirm that
# the descriptor columns are present in `df` at this point.
X = df.iloc[:,2:]

In [None]:
# Min-max scale the features: learn the per-feature range first, then apply it.
# `X` is rebound to the scaler's output from here on.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
X

In [None]:
# Elbow method: fit KMeans for several cluster counts and record the
# within-cluster sum of squares (WCSS) of every fit.
wcss = []

# Candidate cluster counts 1..10 (the upper bound of 10 is an assumption of this
# analysis). Defined once so the fitted values and the plotted x-axis cannot
# drift apart.
cluster_counts = range(1, 11)

for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0)
    kmeans.fit(X)
    # inertia_ is the sum of squared distances of the samples to their closest
    # cluster centre, i.e. the WCSS of this fit.
    wcss.append(kmeans.inertia_)

# The "elbow" of this curve is read off visually to choose the cluster count.
plt.plot(cluster_counts, wcss)
plt.title('The Elbow Method')
plt.xlabel('no of clusters')
plt.ylabel('wcss')
plt.show()

In [None]:
# PCA Testing

In [None]:
# Fit a 50-component PCA on the scaled features and inspect how much variance
# the leading components explain cumulatively.
mPCA = PCA(n_components=50)
PrincipleComponents = mPCA.fit_transform(X)
variance = mPCA.explained_variance_ratio_
# Accumulate first and round last: rounding every ratio before summing lets the
# per-component rounding errors pile up in the cumulative curve.
variance_ratio = np.round(np.cumsum(variance) * 100, decimals=1)
print(variance_ratio)
plt.title("PCA components VS percentage of variance explained")
plt.ylabel("Percentage (%)")
plt.xlabel("# of components")
# Plot against 1..n so the x value equals the number of components kept
# (a bare plt.plot(y) would start the axis at 0).
plt.plot(np.arange(1, len(variance_ratio) + 1), variance_ratio)
plt.show()

In [None]:
comp = 30 # number of principal components handed to pca_algo; change here

# Reduce X with the project's PCA helper, then cluster the result with the
# project's KMeans helper. The second argument 5 is presumably the number of
# clusters (the t-SNE cell passes it as `n_cluster = 5`) — confirm in algorithms.py.
dfpca = pca_algo(X, comp)
dfpca_kmeans = kmeans_algo(dfpca, 5)

In [None]:
# Silhouette score of the KMeans assignment in PCA space.
# The feature matrix is selected by dropping the bookkeeping columns by name
# rather than "everything but the last column": a positional slice gives a
# different matrix depending on whether 'cluster' / 'labels' have already been
# attached to `dfpca` (e.g. when cells are re-run in a different order).
# errors='ignore' keeps this valid when those columns are absent.
metrics.silhouette_score(
    dfpca.drop(columns=['cluster', 'labels'], errors='ignore'),
    dfpca_kmeans['cluster'],
    metric='l2',
)

In [None]:
# Store the ground-truth labels in a 'labels' column next to the predicted clusters.
# assumes labels_true is in the same row order as the frame — TODO confirm
dfpca_kmeans['labels'] = labels_true

In [None]:
# Run the frame through the project's name_change helper (val_sort); the result
# is what the plotting cells below consume. What exactly is renamed is not
# visible from this notebook.
dfpca_kmeans3 = name_change(dfpca_kmeans)

In [None]:
# 2D view via the project's plot2d helper, passing the 'cluster' column name
# (presumably the colouring column — confirm in val_sort).
plot2d(dfpca_kmeans3, "2D PCA Predicted Clusters", 'cluster')

In [None]:
# Same 2D view, this time passing the ground-truth 'labels' column name.
plot2d(dfpca_kmeans3, "2D PCA Ground Truth", 'labels')

In [None]:
# 3D view via the project's plot3d helper, passing the ground-truth 'labels' column name.
plot3d(dfpca_kmeans3, "3D PCA Ground Truth", 'labels')

In [None]:
# 3D view via the project's plot3d helper, passing the predicted 'cluster' column name.
plot3d(dfpca_kmeans3, "3D PCA Prediction", 'cluster')

In [None]:
# Pass the frame through sort_clusterlabels (project helper; presumably aligns
# the cluster ids with the ground-truth ids — confirm in val_sort), then report
# the share of rows whose 'cluster' equals 'labels'.
dfpca_kmeans = sort_clusterlabels(dfpca_kmeans)
acc = accuracy_score(
    y_true=dfpca_kmeans['labels'],
    y_pred=dfpca_kmeans['cluster'],
)
print(acc)

In [None]:
# Confusion matrix: ground-truth 'labels' against predicted 'cluster' (PCA run).
confusion_matrix(
    y_true=dfpca_kmeans['labels'],
    y_pred=dfpca_kmeans['cluster'],
)

In [None]:
# Component counts 1..39 handed to the project's hyperparameter_testing helper
# with method 'pca'. Note that `comp` is rebound from an int to a list here.
comp = list(range(1, 40))
hyperparameter_testing(X, 'pca', comp, labels_true)

In [None]:
# TSNE Testing

In [None]:
perplex = 20 # value handed to tsne_algo (presumably the t-SNE perplexity); change here

# Embed X with the project's t-SNE helper, then cluster columns 0, 1 and 2 of the
# embedding into 5 clusters with the project's KMeans helper.
# NOTE(review): no random seed is visible in this cell — confirm tsne_algo fixes
# one, otherwise the results differ between runs.
dftsne = tsne_algo(X, perplex)
dftsne_kmeans = kmeans_algo(dftsne[[0, 1, 2]], n_cluster = 5)

In [None]:
# Store the ground-truth labels in a 'labels' column next to the predicted clusters.
# assumes labels_true is in the same row order as the frame — TODO confirm
dftsne_kmeans['labels'] = labels_true

In [None]:
# Silhouette score of the KMeans assignment in t-SNE space.
# Scored on columns 0, 1 and 2 only — the same columns that were handed to
# kmeans_algo — so the score always refers to the clustered coordinates even if
# `dftsne` carries additional columns.
metrics.silhouette_score(
    dftsne[[0, 1, 2]],
    dftsne_kmeans['cluster'],
    metric='l2',
)

In [None]:
# Run the frame through the project's name_change helper (val_sort); the result
# is what the plotting cells below consume. What exactly is renamed is not
# visible from this notebook.
dftsne_kmeans3 = name_change(dftsne_kmeans)

In [None]:
# 2D view via the project's plot2d helper, passing the predicted 'cluster' column name.
plot2d(dftsne_kmeans3,"2D TSNE Predicted Clusters representation",'cluster')

In [None]:
# Same 2D view, this time passing the ground-truth 'labels' column name.
plot2d(dftsne_kmeans3,"2D TSNE Ground Truth representation",'labels')

In [None]:
# 3D view via the project's plot3d helper, passing the ground-truth 'labels' column name.
plot3d(dftsne_kmeans3, 'TSNE 3D Ground Truth', 'labels')

In [None]:
# 3D view via the project's plot3d helper, passing the predicted 'cluster' column name.
plot3d(dftsne_kmeans3, 'TSNE 3D Prediction', 'cluster')

In [None]:
# Pass the frame through sort_clusterlabels (project helper; presumably aligns
# the cluster ids with the ground-truth ids — confirm in val_sort), then report
# the share of rows whose 'cluster' equals 'labels'.
dftsne_kmeans = sort_clusterlabels(dftsne_kmeans)
acc = accuracy_score(
    y_true=dftsne_kmeans['labels'],
    y_pred=dftsne_kmeans['cluster'],
)
print(acc)

In [None]:
# Confusion matrix: ground-truth 'labels' against predicted 'cluster' (t-SNE run).
confusion_matrix(
    y_true=dftsne_kmeans['labels'],
    y_pred=dftsne_kmeans['cluster'],
)

In [None]:
#Hyperparameter Testing TSNE

In [None]:
# Values 20..24 handed to the project's hyperparameter_testing helper with
# method 'tsne'.
perplexity = list(range(20, 25))
hyperparameter_testing(X, 'tsne', perplexity, labels_true)

In [None]:
# NMF Testing 

In [None]:
# Value handed to nmf_algo (presumably the number of NMF components — confirm in algorithms.py).
component = 8
W_df = nmf_algo(X, component)
W_df_kmeans = kmeans_algo(W_df, 5) # compute the cluster labels and assign the result to W_df_kmeans

In [None]:
# Store the ground-truth labels in a 'labels' column next to the predicted clusters.
# assumes labels_true is in the same row order as the frame — TODO confirm
W_df_kmeans['labels'] = labels_true

In [None]:
# Silhouette score of the KMeans assignment in NMF space.
# The feature matrix is selected by dropping the bookkeeping columns by name
# rather than "everything but the last column". In this section 'labels' is
# attached before this cell runs, so if W_df_kmeans is the same object as W_df
# (not visible here), the positional slice would drop only 'labels' and keep
# 'cluster' as a feature. errors='ignore' keeps this valid when the columns are
# absent from `W_df`.
metrics.silhouette_score(
    W_df.drop(columns=['cluster', 'labels'], errors='ignore'),
    W_df_kmeans['cluster'],
    metric='l2',
)

In [None]:
# Change names for the plotly library (done by the project's name_change helper;
# what exactly is renamed is not visible from this notebook).
W_df_kmeans3 = name_change(W_df_kmeans)

In [None]:
# 3D view via the project's plot3d helper; clus_lab names the predicted 'cluster' column.
plot3d(W_df_kmeans3,title='NMF 3D Prediction Kmeans', clus_lab = 'cluster' )

In [None]:
# 3D view via the project's plot3d helper; clus_lab names the ground-truth 'labels' column.
plot3d(W_df_kmeans3,title='NMF 3D Ground Truth', clus_lab = 'labels' )

In [None]:
# Pass the frame through sort_clusterlabels (project helper; presumably aligns
# the cluster ids with the ground-truth ids — confirm in val_sort), then report
# the share of rows whose 'cluster' equals 'labels'.
W_df_kmeans = sort_clusterlabels(W_df_kmeans)
acc = accuracy_score(
    y_true=W_df_kmeans['labels'],
    y_pred=W_df_kmeans['cluster'],
)
print(acc)

In [None]:
# Confusion matrix: ground-truth 'labels' against predicted 'cluster' (NMF run).
confusion_matrix(
    y_true=W_df_kmeans['labels'],
    y_pred=W_df_kmeans['cluster'],
)

In [None]:
# Hyperparameter components testing

In [None]:
# Values 2, 4, 6, 8 handed to the project's hyperparameter_testing helper with
# method 'nmf'.
components = list(range(2, 10, 2))
hyperparameter_testing(X, 'nmf', components, labels_true)

In [None]:
# Testing Alternative Algorithm Methods OPTICS/DBSCAN

In [None]:
# Recompute the NMF frame (this rebinds `W_df` from the NMF section) and cluster
# it with OPTICS instead of KMeans.
component = 8
W_df = nmf_algo(X, component)
optics = OPTICS(min_samples=60)
optics.fit(W_df)
# Attach the OPTICS assignment and the ground truth as extra columns of W_df.
W_df['cluster'] = optics.labels_
W_df['labels'] = labels_true

In [None]:
# Run W_df through the project's name_change helper (rebinding W_df), then show
# it with the project's plot3d helper; clus_lab names the 'cluster' column that
# holds the OPTICS assignment.
W_df = name_change(W_df)
plot3d(W_df, title='optics 3D Prediction', clus_lab = 'cluster')

In [None]:
# Candidate values 20..29. range() takes comma-separated arguments; the previous
# slice-style `range(20:30)` was a SyntaxError, so this cell could never run.
min_samples = list(range(20, 30))
# NOTE(review): the values are named `min_samples` (an OPTICS parameter) but are
# passed with method 'nmf' — confirm whether hyperparameter_testing supports an
# OPTICS sweep and whether that was the intent here.
hyperparameter_testing(X,'nmf', min_samples, labels_true)