In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from comtrade import Comtrade
import seaborn as sns

##### Functions to be used

In [None]:
def data_frames_creation(paths):
    """Load COMTRADE recordings into one DataFrame per recording.

    Parameters
    ----------
    paths : sequence
        Each entry is a two-element sequence; its first and second elements
        are passed, in that order, to ``Comtrade.load``. The callers in this
        notebook supply ``[cfg_path, dat_path]``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry, in input order. Each frame has a ``'Time'``
        column followed by one column per analog channel, named after
        ``analog_channel_ids`` of the recording.
    """
    frames = []
    for entry in paths:
        record = Comtrade()
        record.load(entry[0], entry[1])
        # One row per analog channel, then transpose so channels become columns.
        channels = [record.analog[k] for k in range(len(record.analog))]
        signals = pd.DataFrame(channels).transpose()
        signals.columns = record.analog_channel_ids
        time_axis = pd.Series(data=record.time, name='Time')
        frames.append(pd.concat([time_axis, signals], axis=1))
    return frames

In [None]:
import os
def getPath(file_id, file_type, directory=None):
    """Return the path of the dataset file whose name contains ``file_id``.

    Parameters
    ----------
    file_id : object
        Converted with ``str()`` and matched as a substring of each file name.
    file_type : int
        ``2`` selects the CFG folder; any other value selects the DAT folder.
        Ignored when ``directory`` is given.
    directory : str, optional
        Folder to search instead of the built-in dataset folders, so the
        notebook can run on a machine with a different layout.

    Returns
    -------
    str
        Full path of the last matching regular file in ``os.listdir`` order,
        or ``''`` when nothing matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder cannot be read.
    """
    if directory is None:
        directory = "C:\\Users\\Mahnoor Saeed\\Dissertation_Notebook\\cable_failure_dataset\\DAT_Files"
        if file_type == 2:
            directory = "C:\\Users\\Mahnoor Saeed\\Dissertation_Notebook\\cable_failure_dataset\\CFG_FILES"
    needle = str(file_id)
    response = ''
    for name in os.listdir(directory):
        # Compare against the file name only: matching the full path would let
        # characters of the directory itself satisfy the search.
        if needle not in name:
            continue
        full_path = os.path.join(directory, name)
        if os.path.isfile(full_path):
            response = full_path
    # NOTE(review): substring matching means an ID such as 12 also matches a
    # file named for 123 -- confirm the file naming scheme rules this out.
    return response

In [None]:
def single_point_visula(cols,data):
    """Draw one column of ``data`` against its ``'Time'`` column.

    Parameters
    ----------
    cols : str
        Name of the column to plot; also appended to the figure title.
    data : mapping or DataFrame
        Must provide ``data['Time']`` and ``data[cols]``.
    """
    figure = plt.figure(figsize=(11,2))
    axes = figure.add_subplot(111)
    axes.set_title('Currents Amplitude Vs Time of '+cols)
    axes.plot(data['Time'], data[cols], color='#0FE279')
    axes.set_xlabel('Time', fontsize=8)
    axes.set_ylabel('Current (amp)', fontsize=7)

In [None]:
def box_plots(data,b):
    """Draw a seaborn box plot of ``data`` on a new figure.

    Parameters
    ----------
    data : object
        Passed unchanged to ``sns.boxplot(data=...)``.
    b : object
        Identifier appended (via ``str``) to the figure title.
    """
    figure = plt.figure(figsize=(11,3))
    axes = figure.add_subplot(111)
    axes.set_title('Distribution of Data Point '+ str(b))
    sns.boxplot(data=data, color='#B0FC21', ax=axes)
    # Labels are set after the plot call, as before, so they are the final ones.
    axes.set_xlabel('Phases', fontsize=9)
    axes.set_ylabel('Current (amp)', fontsize=10)

In [None]:
def pred_fal_good_Visualization(dataframe, wrong_phases,columns):
    """Plot, for every recording, the channel that matches its phase label.

    Parameters
    ----------
    dataframe : sequence
        Recordings; each item must support ``item['Time']`` and
        ``item[<channel name>]``.
    wrong_phases : sequence
        Phase label per recording (``'A'``, ``'B'``, ``'C'`` or ``'N'``).
        Recordings with any other label are skipped.
    columns : sequence of str
        Channel names in the order A, B, C, N. The selected name is used both
        for the title and to pick the plotted series, so the two always agree.

    Returns
    -------
    None
        One new figure is opened per plotted recording.
    """
    # phase label -> (position in `columns`, line colour)
    phase_styles = {
        'A': (0, '#E9967A'),
        'B': (1, '#108A0D'),
        'C': (2, '#0282B9'),
        'N': (3, '#C794F2'),
    }
    for f in range(len(dataframe)):
        style = phase_styles.get(wrong_phases[f])
        if style is None:
            continue
        position, colour = style
        channel = columns[position]
        plt.figure(figsize=(15,3))
        plt.title('Failure Sample '+str(f)+' Amplitude Vs Time of '+channel)
        plt.plot(dataframe[f]['Time'], dataframe[f][channel], color=colour)

###### `********************************************************************************************************************`

##### Data Loading

In [None]:
ten_PCA=pd.read_csv('Ten_com_clustering.csv')

In [None]:
ten_PCA

In [None]:
ten_PCA.isna().sum()

##### Shuffling and Splits

In [None]:
# Shuffle every row reproducibly. Passing random_state=35 uses the same legacy
# RandomState stream as seeding the global NumPy generator with 35, but the
# result no longer depends on which cells touched the global generator before.
ten_PCA_shuffle=ten_PCA.sample(frac=1, random_state=35)

In [None]:
ten_PCA_shuffle

In [None]:
engin_df=pd.read_csv('eng_features.csv')
engin_df.head()

In [None]:
# Later cells look up engin_df rows by the positions of clustered ten_PCA rows,
# so both tables must receive the same permutation: same seed AND same length.
assert len(engin_df) == len(ten_PCA), "engin_df and ten_PCA must have the same number of rows"
engin_df_shuffle=engin_df.sample(frac=1, random_state=35)
engin_df_shuffle

In [None]:
train_set_len=round(len(ten_PCA_shuffle)*0.5)

In [None]:
test_set_len=round(len(ten_PCA_shuffle)*0.5)

In [None]:
print(train_set_len)
print(test_set_len)

In [None]:
# First half of the shuffled rows; the split point is computed from the data
# (it was a hard-coded 686) so the cell stays correct if the CSV changes.
train_set=ten_PCA_shuffle.iloc[:round(len(ten_PCA_shuffle)*0.5),:]

In [None]:
# Second half of the shuffled rows; split point computed instead of hard-coded 686.
test_set=ten_PCA_shuffle.iloc[round(len(ten_PCA_shuffle)*0.5):,:]

In [None]:
test_set

In [None]:
# Split at the midpoint of the PCA table (previously a hard-coded 686) so these
# rows line up by position with the PCA training half.
train_set_engin_df=engin_df_shuffle.iloc[:round(len(ten_PCA_shuffle)*0.5),:]

In [None]:
# Split at the midpoint of the PCA table (previously a hard-coded 686) so these
# rows line up by position with the PCA test half.
test_set_engin_df=engin_df_shuffle.iloc[round(len(ten_PCA_shuffle)*0.5):,:]

In [None]:
train_set_engin_df

In [None]:
test_set_engin_df

#### GMM-Diagonal

In [None]:
train_set

In [None]:
np.random.seed(35)
from sklearn.mixture import GaussianMixture
# random_state pins the initialisation on the estimator itself instead of
# relying on the global np.random.seed(35) call above having just been run.
# NOTE(review): the fit uses every column of train_set -- confirm the CSV holds
# only feature columns (no saved index or ID column).
gmm_diag_train = GaussianMixture(n_components=2,covariance_type='diag',random_state=35).fit(train_set)

In [None]:
gmm_diag_train.get_params()

In [None]:
np.random.seed(35)
gmm_dia_train_labels=gmm_diag_train.predict(train_set)

In [None]:
gmm_dia_train_labels

In [None]:
len(gmm_dia_train_labels)

In [None]:
gmm_train_lab_df=pd.DataFrame(gmm_dia_train_labels,columns=['Train_labels'])

In [None]:
gmm_train_lab_df

In [None]:
train_set_reset=train_set.reset_index(drop=True)
train_set_reset

In [None]:
from sklearn.metrics import silhouette_score
# Silhouette of the training rows under the labels predicted by the fitted GMM.
print(f'Silhoutte score of GMM clustering is {silhouette_score(train_set, gmm_dia_train_labels)!s}')

In [None]:
gmm_dia_train_df = pd.concat([train_set_reset, gmm_train_lab_df], axis=1)

In [None]:
gmm_dia_train_df

In [None]:
import warnings
# Suppresses warnings of every category for all cells executed after this one.
# NOTE(review): this also hides deprecation warnings from the plotting
# libraries used below -- consider narrowing the filter to a specific category.
warnings.filterwarnings('ignore')

In [None]:
gmm_dia_train_df['Train_labels'].value_counts()

In [None]:
import seaborn as sns
# Scatter of two columns of the clustered training table (picked by position),
# coloured by GMM label. x/y are passed by keyword because seaborn >= 0.12 no
# longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=gmm_dia_train_df.iloc[:,3],
    y=gmm_dia_train_df.iloc[:,4],
    hue=gmm_dia_train_df['Train_labels'],
    palette='magma', s=70, alpha=0.8,
).set_title('Distribution of cluster Based on Gaussian Mixture Model (Train set)',fontsize=13)
plt.legend()
#plt.savefig('Cluster_pCA_1_and_3')
plt.show()

###### `********************************************************************************************************************`

## Predicted Failure Verification by GMM Diagonal (Train Set)

In [None]:
gmm_dia_train_df

In [None]:
pred_fal_gmm_dia_train=gmm_dia_train_df[gmm_dia_train_df['Train_labels']==1]

In [None]:
import seaborn as sns
# Training rows labelled 1 by the GMM only. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_fal_gmm_dia_train.iloc[:,3],
    y=pred_fal_gmm_dia_train.iloc[:,4],
    color='#08BF4B', s=70, alpha=0.8,
).set_title('Only Predicted Failure cluster by GMM on Train set',fontsize=13)
plt.legend(labels=['Predicted Failure Cluster Only (Train Set)'])
#plt.savefig('Cluster_pCA_1_and_3')
plt.show()

In [None]:
pred_fal_gmm_dia_train

In [None]:
predicted_failure_gmm_index_train=pred_fal_gmm_dia_train.index

In [None]:
predicted_failure_gmm_index_train

In [None]:
train_set_engin_df_reset=train_set_engin_df.reset_index(drop=True)
train_set_engin_df_reset

In [None]:
# Engineered-feature rows at the index labels of the predicted-failure rows.
# Both tables carry a fresh 0..n-1 index (reset_index), so one .loc selection
# replaces the row-by-row loop, keeps the column dtypes, and avoids reusing the
# name for a list first and a DataFrame afterwards.
Faulty_pred_gmm_ids_train = train_set_engin_df_reset.loc[predicted_failure_gmm_index_train]

In [None]:
Faulty_pred_gmm_ids_train

In [None]:
len(Faulty_pred_gmm_ids_train)

In [None]:
# One [cfg_path, dat_path] pair per predicted-failure row, in row order.
gmm_dia_failure_paths_train = [
    [getPath(row['Fault ID'], 2), getPath(row['Fault ID'], 1)]
    for _, row in Faulty_pred_gmm_ids_train.iterrows()
]

In [None]:
gmm_dia_failure_paths_train

In [None]:
# 'Phase' value of every predicted-failure row, in row order.
gmm_dia_faulty_phases_train = [row['Phase'] for _, row in Faulty_pred_gmm_ids_train.iterrows()]

In [None]:
print(len(gmm_dia_failure_paths_train))
print(len(gmm_dia_faulty_phases_train))

In [None]:
gmm_dia_fal_traindata=data_frames_creation(gmm_dia_failure_paths_train)

In [None]:
col_names=[ 'MPI3p1:I A', 'MPI3p1:I B', 'MPI3p1:I C', 'MPI3p1:IN']

In [None]:
pred_fal_good_Visualization(gmm_dia_fal_traindata,gmm_dia_faulty_phases_train,col_names)

In [None]:
false_failure_trainset=[5,38,67,86,136]

In [None]:
len(false_failure_trainset)

In [None]:
pred_fal_gmm_dia_train_reset=pred_fal_gmm_dia_train.reset_index(drop=True)

In [None]:
false_neg_train=pred_fal_gmm_dia_train_reset.iloc[false_failure_trainset]
false_neg_train

In [None]:
import seaborn as sns
# All predicted-failure training rows, with the manually identified false ones
# drawn on top in a second colour. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_fal_gmm_dia_train.iloc[:,3],
    y=pred_fal_gmm_dia_train.iloc[:,4],
    color='#B2FF33', s=70, alpha=0.8,
).set_title('Predicted Failure cluster based on GMM (Train set)',fontsize=13)
sns.scatterplot(
    x=false_neg_train.iloc[:,3],
    y=false_neg_train.iloc[:,4],
    color='#FF6E33', s=70, alpha=0.8,
)
#plt.legend()
plt.legend(labels=["True Failure","False Failure"])
#plt.savefig('failure_only')
plt.show()

## Predicted Good Verification (Train Set)

In [None]:
pred_good_gmm_dia_train=gmm_dia_train_df[gmm_dia_train_df['Train_labels']==0]

In [None]:
pred_good_gmm_dia_train.head()

In [None]:
len(pred_good_gmm_dia_train)

In [None]:
# Training rows labelled 0 by the GMM only. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_good_gmm_dia_train.iloc[:,3],
    y=pred_good_gmm_dia_train.iloc[:,4],
    color='#D133FF', s=70, alpha=0.8,
).set_title(' Only Predicted Good cluster based GMM (Train Set)',fontsize=13)
plt.legend(labels=['Predicted Good Cluster Only'],loc='upper left')
#plt.savefig('Cluster_pCA_1_and_3')
plt.show()

In [None]:
pred_good_gmm_dia_train_index=pred_good_gmm_dia_train.index
pred_good_gmm_dia_train_index

In [None]:
len(pred_good_gmm_dia_train_index)

In [None]:
train_set_engin_df_reset

In [None]:
# Engineered-feature rows at the index labels of the predicted-good rows.
# A single .loc selection replaces the row-by-row loop, keeps column dtypes,
# and avoids reusing the name for a list first and a DataFrame afterwards.
pred_good_gmm_dia_train_ids = train_set_engin_df_reset.loc[pred_good_gmm_dia_train_index]

In [None]:
len(pred_good_gmm_dia_train_ids)

In [None]:
# One [cfg_path, dat_path] pair per predicted-good row, in row order.
pred_good_train_paths = [
    [getPath(row['Fault ID'], 2), getPath(row['Fault ID'], 1)]
    for _, row in pred_good_gmm_dia_train_ids.iterrows()
]

In [None]:
# 'Phase' value of every predicted-good row, in row order.
pred_good_train_phases = [row['Phase'] for _, row in pred_good_gmm_dia_train_ids.iterrows()]

In [None]:
len(pred_good_train_phases)

In [None]:
pred_good_datasets=data_frames_creation(pred_good_train_paths)

In [None]:
len(pred_good_datasets)

In [None]:
pred_fal_good_Visualization(pred_good_datasets,pred_good_train_phases,col_names)

In [None]:
false_predgood_train=[328,416]

In [None]:
pred_good_gmm_dia_train_reset=pred_good_gmm_dia_train.reset_index(drop=True)

In [None]:
false_predgood_trainset_=pred_good_gmm_dia_train_reset.iloc[false_predgood_train]

In [None]:
len(false_predgood_trainset_)

In [None]:
import seaborn as sns
# All predicted-good training rows, with the manually identified false ones
# drawn on top in a second colour. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_good_gmm_dia_train.iloc[:,3],
    y=pred_good_gmm_dia_train.iloc[:,4],
    color='#B2FF33', s=70, alpha=0.8,
).set_title(' True Positive Vs False Positive Baes on GMM (Train set)',fontsize=13)
sns.scatterplot(
    x=false_predgood_trainset_.iloc[:,3],
    y=false_predgood_trainset_.iloc[:,4],
    color='#FF6E33', s=70, alpha=0.8,
)
#plt.legend()
plt.legend(labels=["True Positive","False Positive"])
#plt.savefig('good_pred')
plt.show()

#### Prediction on Test set

In [None]:
np.random.seed(35)
gmm_dia_test_labels=gmm_diag_train.predict(test_set)

In [None]:
len(gmm_dia_test_labels)

In [None]:
gmm_test_lab_df=pd.DataFrame(gmm_dia_test_labels,columns=['Test_labels'])

In [None]:
gmm_test_lab_df['Test_labels'].value_counts()

In [None]:
test_set_reset=test_set.reset_index(drop=True)
test_set_reset

In [None]:
# Silhouette of the test rows under the labels predicted by the train-fitted GMM.
print(f'Silhoutte score of GMM clustering (Test set) is {silhouette_score(test_set, gmm_dia_test_labels)!s}')

In [None]:
gmm_dia_test_df = pd.concat([test_set_reset, gmm_test_lab_df], axis=1)

In [None]:
# Scatter of two columns of the clustered test table (picked by position),
# coloured by GMM label. x/y are passed by keyword because seaborn >= 0.12 no
# longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=gmm_dia_test_df.iloc[:,3],
    y=gmm_dia_test_df.iloc[:,4],
    hue=gmm_dia_test_df['Test_labels'],
    palette='magma', s=70, alpha=0.8,
).set_title('Distribution of cluster Based on Gaussian Mixture Model (Test set)',fontsize=13)
plt.legend()
#plt.savefig('Cluster_pCA_1_and_3')
plt.show()

###### `********************************************************************************************************************`

## Predicted Failure Verification by GMM Diagonal (Test Set)

In [None]:
pred_fal_gmm_dia_test=gmm_dia_test_df[gmm_dia_test_df['Test_labels']==1]

In [None]:
pred_fal_gmm_dia_test

In [None]:
import seaborn as sns
# Test rows labelled 1 by the GMM only. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_fal_gmm_dia_test.iloc[:,3],
    y=pred_fal_gmm_dia_test.iloc[:,4],
    color='#08BF4B', s=70, alpha=0.8,
).set_title('Only Predicted Failure cluster by GMM on Test set',fontsize=13)
plt.legend(labels=['Predicted Failure Cluster Only (Test Set)'])
#plt.savefig('Cluster_pCA_1_and_3')
plt.show()

In [None]:
test_set_engin_df_reset=test_set_engin_df.reset_index(drop=True)
test_set_engin_df_reset

In [None]:
predicted_failure_gmm_index_test=pred_fal_gmm_dia_test.index

In [None]:
predicted_failure_gmm_index_test

In [None]:
# Engineered-feature rows at the index labels of the predicted-failure test
# rows. A single .loc selection replaces the row-by-row loop, keeps column
# dtypes, and avoids reusing the name for a list first and a DataFrame afterwards.
Faulty_pred_gmm_ids_test = test_set_engin_df_reset.loc[predicted_failure_gmm_index_test]

In [None]:
Faulty_pred_gmm_ids_test

In [None]:
# One [cfg_path, dat_path] pair per predicted-failure test row, in row order.
gmm_dia_failure_paths_test = [
    [getPath(row['Fault ID'], 2), getPath(row['Fault ID'], 1)]
    for _, row in Faulty_pred_gmm_ids_test.iterrows()
]

In [None]:
# 'Phase' value of every predicted-failure test row, in row order.
gmm_dia_faulty_phases_test = [row['Phase'] for _, row in Faulty_pred_gmm_ids_test.iterrows()]

In [None]:
gmm_dia_fal_testdata=data_frames_creation(gmm_dia_failure_paths_test)

In [None]:
pred_fal_good_Visualization(gmm_dia_fal_testdata,gmm_dia_faulty_phases_test,col_names)

In [None]:
false_failure_testset=[0,8,92]

In [None]:
pred_fal_gmm_dia_test

In [None]:
pred_fal_gmm_dia_test_reset=pred_fal_gmm_dia_test.reset_index(drop=True)

In [None]:
false_neg_test=pred_fal_gmm_dia_test_reset.iloc[false_failure_testset]
false_neg_test

In [None]:
import seaborn as sns
# All predicted-failure test rows, with the manually identified false ones
# drawn on top in a second colour. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_fal_gmm_dia_test.iloc[:,3],
    y=pred_fal_gmm_dia_test.iloc[:,4],
    color='#B2FF33', s=70, alpha=0.8,
).set_title('Predicted Failure cluster based on GMM (Test set)',fontsize=13)
sns.scatterplot(
    x=false_neg_test.iloc[:,3],
    y=false_neg_test.iloc[:,4],
    color='#FF6E33', s=70, alpha=0.8,
)
#plt.legend()
plt.legend(labels=["True Failure","False Failure"])
#plt.savefig('failure_only')
plt.show()

## Predicted Good Verification (Test Set)

In [None]:
pred_good_gmm_dia_test=gmm_dia_test_df[gmm_dia_test_df['Test_labels']==0]

In [None]:
# Test rows labelled 0 by the GMM only. x/y are passed by keyword because
# seaborn >= 0.12 no longer accepts them positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_good_gmm_dia_test.iloc[:,3],
    y=pred_good_gmm_dia_test.iloc[:,4],
    color='#691381', s=70, alpha=0.8,
).set_title(' Only Predicted Good cluster based GMM (Test Set)',fontsize=13)
plt.legend(labels=['Predicted Good Cluster Only'],loc='upper left')
#plt.savefig('Cluster_pCA_1_and_3')
plt.show()

In [None]:
pred_good_gmm_dia_test_index=pred_good_gmm_dia_test.index
pred_good_gmm_dia_test_index

In [None]:
# Engineered-feature rows at the index labels of the predicted-good test rows.
# A single .loc selection replaces the row-by-row loop, keeps column dtypes,
# and avoids reusing the name for a list first and a DataFrame afterwards.
pred_good_gmm_dia_test_ids = test_set_engin_df_reset.loc[pred_good_gmm_dia_test_index]

In [None]:
# One [cfg_path, dat_path] pair per predicted-good test row, in row order.
pred_good_test_paths = [
    [getPath(row['Fault ID'], 2), getPath(row['Fault ID'], 1)]
    for _, row in pred_good_gmm_dia_test_ids.iterrows()
]

In [None]:
# 'Phase' value of every predicted-good test row, in row order.
pred_good_test_phases = [row['Phase'] for _, row in pred_good_gmm_dia_test_ids.iterrows()]

In [None]:
pred_goodtest_datasets=data_frames_creation(pred_good_test_paths)

In [None]:
pred_fal_good_Visualization(pred_goodtest_datasets,pred_good_test_phases,col_names)

In [None]:
# #NOFALSE PREDGOOD ON TESTSET
# false_predgood_test=[]

In [None]:
# pred_good_gmm_dia_test_reset=pred_good_gmm_dia_test.reset_index(drop=True)
# false_predgood_testset_=pred_good_gmm_dia_test_reset.iloc[false_predgood_test]

In [None]:
# false_predgood_testset_

In [None]:
# Predicted-good test rows; the overlay of false predictions is disabled below.
# x/y are passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
plt.figure(figsize = (17,7))
sns.scatterplot(
    x=pred_good_gmm_dia_test.iloc[:,3],
    y=pred_good_gmm_dia_test.iloc[:,4],
    color='#B2FF33', s=70, alpha=0.8,
).set_title(' True Positive Vs False Positive Baesd on GMM (Test set)',fontsize=13)
# sns.scatterplot(false_predgood_testset_.iloc[:,0],false_predgood_testset_.iloc[:,1], color='#FF6E33', s=70, alpha=0.8)
#plt.legend()
plt.legend(labels=["True Positive"])
#plt.savefig('good_pred')
plt.show()

#### Evaluation Metrics on Train and test Set

###### Train Sets

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

In [None]:
# Three internal cluster-quality scores for the training rows and their GMM labels.
(
    Silhouette_score_trainset,
    calinski_harabasz_score_trainset,
    davies_bouldin_score_trainset,
) = (
    metric(train_set, gmm_dia_train_labels)
    for metric in (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
)

In [None]:
print(Silhouette_score_trainset)
print(calinski_harabasz_score_trainset)
print(davies_bouldin_score_trainset)

###### Test Sets

In [None]:
# Three internal cluster-quality scores for the test rows and their GMM labels.
(
    Silhouette_score_testset,
    calinski_harabasz_score_testset,
    davies_bouldin_score_testset,
) = (
    metric(test_set, gmm_dia_test_labels)
    for metric in (silhouette_score, calinski_harabasz_score, davies_bouldin_score)
)

In [None]:
print(Silhouette_score_testset)
print(calinski_harabasz_score_testset)
print(davies_bouldin_score_testset)

In [None]:
# Summary table: one row per data set, one column per cluster-quality score.
Evaluation_df = pd.DataFrame(
    data=[
        [Silhouette_score_trainset, davies_bouldin_score_trainset, calinski_harabasz_score_trainset],
        [Silhouette_score_testset, davies_bouldin_score_testset, calinski_harabasz_score_testset],
    ],
    columns=['Silhouette Score', 'Davis Bouldin Index', 'Calinski Harabasz Score'],
    index=['Gaussian Mixture Model (Train set)', 'Gaussian Mixture Model (Test set)'],
)

In [None]:
Evaluation_df

##### Evaluation GRAPHS

##### 50_50_SPLIT

In [None]:
random_states=[32,199,280,500,1000,2000]

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

In [None]:
train_set_len=round(len(ten_PCA_shuffle)*0.5)
test_set_len=round(len(ten_PCA_shuffle)*0.5)


In [None]:
print(train_set_len)

In [None]:
# 50/50 split; the boundary is computed from the data instead of a hard-coded 686.
split_at_50 = round(len(ten_PCA_shuffle)*0.5)
train_set_50=ten_PCA_shuffle.iloc[:split_at_50,:]
test_set_50=ten_PCA_shuffle.iloc[split_at_50:,:]

In [None]:
sil_score_test_50=[]
calilslki_score_test_50=[]
DB_score_test_50=[]

In [None]:
DB_score_test_50

In [None]:
# Start from empty lists inside this cell so that re-running it does not append
# a second batch of scores and leave the per-split lists with unequal lengths.
sil_score_test_50, calilslki_score_test_50, DB_score_test_50 = [], [], []
for seed in random_states:
    # Fit on the training part with this seed, then score the held-out part.
    gmm_diag_eval = GaussianMixture(n_components=2,covariance_type='diag',random_state=seed).fit(train_set_50)
    gmm_dia_labels_test=gmm_diag_eval.predict(test_set_50)
    sil_score_test_50.append(silhouette_score(test_set_50,gmm_dia_labels_test))
    calilslki_score_test_50.append(calinski_harabasz_score(test_set_50,gmm_dia_labels_test))
    DB_score_test_50.append(davies_bouldin_score(test_set_50,gmm_dia_labels_test))

In [None]:
df_50_test=pd.DataFrame({
    'sil_test':sil_score_test_50,
    'calin_test':calilslki_score_test_50,
    'db_test':DB_score_test_50,
})

In [None]:
df_50_test

#### SPLIT 60_40

In [None]:
train_set_len=round(len(ten_PCA_shuffle)*0.6)
test_set_len=round(len(ten_PCA_shuffle)*0.4)
print(train_set_len)

In [None]:
# 60/40 split; the boundary is computed from the data instead of a hard-coded 823.
split_at_60 = round(len(ten_PCA_shuffle)*0.6)
train_set_60=ten_PCA_shuffle.iloc[:split_at_60,:]
test_set_40=ten_PCA_shuffle.iloc[split_at_60:,:]

In [None]:
sil_score_test_40=[]
calilslki_score_test_40=[]
DB_score_test_40=[]

In [None]:
# Start from empty lists inside this cell so that re-running it does not append
# a second batch of scores and leave the per-split lists with unequal lengths.
sil_score_test_40, calilslki_score_test_40, DB_score_test_40 = [], [], []
for seed in random_states:
    # Fit on the training part with this seed, then score the held-out part.
    gmm_diag_eval = GaussianMixture(n_components=2,covariance_type='diag',random_state=seed).fit(train_set_60)
    gmm_dia_labels_test=gmm_diag_eval.predict(test_set_40)
    sil_score_test_40.append(silhouette_score(test_set_40,gmm_dia_labels_test))
    calilslki_score_test_40.append(calinski_harabasz_score(test_set_40,gmm_dia_labels_test))
    DB_score_test_40.append(davies_bouldin_score(test_set_40,gmm_dia_labels_test))

In [None]:
df_40_test=pd.DataFrame({
    'sil_test':sil_score_test_40,
    'calin_test':calilslki_score_test_40,
    'db_test':DB_score_test_40,
})

In [None]:
df_40_test

#### SPLIT 70_30

In [None]:
train_set_len=round(len(ten_PCA_shuffle)*0.7)
test_set_len=round(len(ten_PCA_shuffle)*0.3)
print(train_set_len)

In [None]:
# 70/30 split; the boundary is computed from the data instead of a hard-coded 960.
split_at_70 = round(len(ten_PCA_shuffle)*0.7)
train_set_70=ten_PCA_shuffle.iloc[:split_at_70,:]
test_set_30=ten_PCA_shuffle.iloc[split_at_70:,:]

In [None]:
sil_score_test_30=[]
calilslki_score_test_30=[]
DB_score_test_30=[]

In [None]:
# Start from empty lists inside this cell so that re-running it does not append
# a second batch of scores and leave the per-split lists with unequal lengths.
sil_score_test_30, calilslki_score_test_30, DB_score_test_30 = [], [], []
for seed in random_states:
    # Fit on the training part with this seed, then score the held-out part.
    gmm_diag_eval = GaussianMixture(n_components=2,covariance_type='diag',random_state=seed).fit(train_set_70)
    gmm_dia_labels_test=gmm_diag_eval.predict(test_set_30)
    sil_score_test_30.append(silhouette_score(test_set_30,gmm_dia_labels_test))
    calilslki_score_test_30.append(calinski_harabasz_score(test_set_30,gmm_dia_labels_test))
    DB_score_test_30.append(davies_bouldin_score(test_set_30,gmm_dia_labels_test))

In [None]:
df_30_test=pd.DataFrame({
    'sil_test':sil_score_test_30,
    'calin_test':calilslki_score_test_30,
    'db_test':DB_score_test_30,
})

In [None]:
df_30_test

##### 80_20 SPLIT

In [None]:
train_set_len=round(len(ten_PCA_shuffle)*0.8)
test_set_len=round(len(ten_PCA_shuffle)*0.2)
print(train_set_len)

In [None]:
# 80/20 split; the boundary is computed from the data instead of a hard-coded 1098.
split_at_80 = round(len(ten_PCA_shuffle)*0.8)
train_set_80=ten_PCA_shuffle.iloc[:split_at_80,:]
test_set_20=ten_PCA_shuffle.iloc[split_at_80:,:]

In [None]:
sil_score_test_20=[]
calilslki_score_test_20=[]
DB_score_test_20=[]

In [None]:
# Start from empty lists inside this cell so that re-running it does not append
# a second batch of scores and leave the per-split lists with unequal lengths.
sil_score_test_20, calilslki_score_test_20, DB_score_test_20 = [], [], []
for seed in random_states:
    # Fit on the training part with this seed, then score the held-out part.
    gmm_diag_eval = GaussianMixture(n_components=2,covariance_type='diag',random_state=seed).fit(train_set_80)
    gmm_dia_labels_test=gmm_diag_eval.predict(test_set_20)
    sil_score_test_20.append(silhouette_score(test_set_20,gmm_dia_labels_test))
    calilslki_score_test_20.append(calinski_harabasz_score(test_set_20,gmm_dia_labels_test))
    DB_score_test_20.append(davies_bouldin_score(test_set_20,gmm_dia_labels_test))

In [None]:
df_20_test=pd.DataFrame({
    'sil_test':sil_score_test_20,
    'calin_test':calilslki_score_test_20,
    'db_test':DB_score_test_20,
})

In [None]:
df_20_test

In [None]:
ss_df_with_splits=pd.DataFrame({
    '50%_test':sil_score_test_50,
    '40%_test':sil_score_test_40,
    '30%_test':sil_score_test_30,
    '20%_test':sil_score_test_20
})

In [None]:
ss_df_with_splits

In [None]:
# One box per split, built from the silhouette scores of all random seeds.
fig = plt.figure()
fig.suptitle('Silhouette Score Variation with differnt splits of data set')
# Pass the plain 2-D array so each column becomes one box regardless of how the
# installed matplotlib unpacks DataFrames.
plt.boxplot(ss_df_with_splits.values, labels = list(ss_df_with_splits.columns))
# The original `plt.show` lacked parentheses and therefore never ran.
plt.show()

In [None]:
# Per-split mean of the silhouette score across random seeds, with the
# per-split standard deviation (pandas default) drawn as error bars.
mean = ss_df_with_splits.mean(axis=0)
std = ss_df_with_splits.std(axis=0)
x = ss_df_with_splits.columns

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(x, mean, yerr=std, color="#FC8532", ec="white", ecolor='black', capsize=5)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(True)
ax.set_title('Silhouette Score Variation with Different Data Splits')
ax.set_xlabel('Data Splits')
ax.set_ylabel('Silhouette Score')
fig.savefig('SS_splits.png')
plt.show()

In [None]:
ch_df_with_splits=pd.DataFrame({
    '50%_test':calilslki_score_test_50,
    '40%_test':calilslki_score_test_40,
    '30%_test':calilslki_score_test_30,
    '20%_test':calilslki_score_test_20
})

In [None]:
ch_df_with_splits

In [None]:
# Per-split mean of the Calinski-Harabasz score across random seeds, with the
# per-split standard deviation (pandas default) drawn as error bars.
mean = ch_df_with_splits.mean(axis=0)
std = ch_df_with_splits.std(axis=0)
x = ch_df_with_splits.columns

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(x, mean, yerr=std, color="#73B3B5", ec="white", ecolor='black', capsize=5)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(True)
ax.set_title('Calinski Harabasz Index Variation with Different Data Splits')
ax.set_xlabel('Data Splits')
ax.set_ylabel('Calinski Harabasz Score')
fig.savefig('CH_splits.png')
plt.show()

In [None]:
db_df_with_splits=pd.DataFrame({
    '50%_test':DB_score_test_50,
    '40%_test':DB_score_test_40,
    '30%_test':DB_score_test_30,
    '20%_test':DB_score_test_20
})

In [None]:
# Per-split mean of the Davies-Bouldin index across random seeds, with the
# per-split standard deviation (pandas default) drawn as error bars.
mean = db_df_with_splits.mean(axis=0)
std = db_df_with_splits.std(axis=0)
x = db_df_with_splits.columns

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(x, mean, yerr=std, color="#6E9B81", ec="white", ecolor='black', capsize=5)
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(True)
ax.set_title('Davies Bouldin Index Variation with Different Data Splits')
ax.set_xlabel('Data Splits')
ax.set_ylabel('Davies Bouldin Index')
fig.savefig('DB_Splits.png')
plt.show()