# Workflow 3 - cluster the trajectory

**Input:** EnGen object featurized with reduced dimensionality (generated by Workflow2)


**Output:** Representative trajectory ensemble  
<hr>
Steps:

1. Import the featurized trajectory from Workflow2
2. Choose clustering technique: KMeans, Gaussian Mixture Models
3. Choose appropriate parameters for clustering (number of clusters)
4. Optionally keep only the clusters with the highest weight
5. Extract the ensemble

In [None]:
#required imports

from engens.core.EnGens import EnGen
from engens.core.ClustEn import *
import pickle as pk

### Step 0 - Load data from Workflow2

In [None]:
# Restore the EnGen object that was pickled to "wf2_resulting_EnGen.pickle"
# (the file name suggests it is the output of Workflow2).
# NOTE: unpickling can execute arbitrary code - only load pickle files that
# you produced yourself or otherwise trust.
engen = None
with open("wf2_resulting_EnGen.pickle", "rb") as file:
    engen = pk.load(file)

In [None]:
# --- Recap of what the loaded EnGen object contains ---

# Trajectory and reference structure recorded on the EnGen object.
traj, ref = engen.traj, engen.ref
print("Using the trajectory {} and reference pdb file {}".format(traj, ref))

# Topology stored alongside the trajectory.
topology = engen.mdtrajref
print("The topology is:", topology, sep="\n")

# Dimensionality-reduced feature matrix; its shape is reported below.
dimred_data = engen.dimred_data

# NOTE(review): this reports shape[0] of the *reduced* data as the
# featurization dimensionality, while the last print of this cell treats
# shape[1] as the feature dimension. shape[0] is presumably the number of
# frames - confirm which value is actually meant here.
feat_dims = dimred_data.shape[0]
print("The dimensionality of your featurization is {}".format(feat_dims))

# The featurizer that was selected earlier, looked up by its stored index.
feat = engen.featurizers[engen.chosen_feat_index]
print("You chose to featurize with")
print(feat.describe())

print(
    "After dimensionality reduction the dimension of your features is {}".format(
        dimred_data.shape[1]
    )
)


### Step 1 - choose the clustering method

In [None]:
# ------------ two clustering algorithms are available ------------ #
# Keep exactly one of the two `clustering = ...` lines active.
#   "KM"  -> KMeans
#   "GMM" -> Gaussian Mixture Models
# Later cells branch on this value to pick the matching parameter name.

# Option 1 - KMeans
# clustering = "KM"

# Option 2 - GMMs (currently selected)
clustering = "GMM"

# Look the method up by key in `clusterings` and instantiate it on the EnGen object.
cluster_method = clusterings[clustering](engen, n_rep=2)


### Step 2 - run the clustering with different parameters

In [None]:
# Run the clustering once for every candidate cluster count from 2 to 9.
# Only the keyword naming that count differs between the two methods:
# "n_clusters" for KMeans ("KM"), "n_components" for everything else (GMM).
params = [
    {("n_clusters" if clustering == "KM" else "n_components"): k}
    for k in range(2, 10)
]
cluster_method.cluster_multiple_params(params)

# Inspect the sweep with the elbow method.
cluster_method.analyze_elbow_method()

In [None]:
# Analyze the same parameter sweep with the silhouette method, as a second
# criterion next to the elbow analysis when deciding on the number of clusters.
cluster_method.analyze_silhouette()

In [None]:
# Pick the number of clusters to continue with.
# 3 is only an example value - adjust it based on the elbow / silhouette
# analyses. Presumably it must be one of the values scanned in the sweep
# (2-9) - TODO confirm against choose_n.
n = 3
cluster_method.choose_n(n)

### Step 3 - optionally pick a subset of clusters with the highest weight

In [None]:
# Plot the weight of each cluster; use the plot to choose the threshold
# passed to choose_clusters in the next cell.
cluster_method.plot_cluster_weight()

In [None]:
# Select a subset of clusters using a weight threshold.
# Presumably clusters whose weight falls below `thr` are dropped - TODO confirm
# the exact comparison in choose_clusters. 0.007 is an example value.
cluster_method.choose_clusters(thr=0.007)

### Step 4 - extract conformations for the ensemble

In [None]:
# Choose one of two strategies for picking the cluster representatives:
#   "center" - the representatives are the points closest to the cluster centers
#   "hub"    - the representatives are the points with the most neighbors
# Change `mode` below to switch strategy.

mode = "center"
cluster_method.choose_conformations(mode=mode)

In [None]:
# Bare expression so the notebook displays the value: the frames chosen as
# representatives (presumably populated by choose_conformations above).
cluster_method.chosen_frames

In [None]:
# Location handed to extract_conformations for the resulting ensemble.
# Presumably a directory into which the chosen conformations are written -
# TODO confirm whether it must already exist.
ensemble_location = "./res_ensemble"
cluster_method.extract_conformations(ensemble_location)


### Step 5 - save results for analysis

In [None]:
# Persist both result objects for later analysis. A negative pickle protocol
# selects the highest one available, so it is spelled out by name here.

# 1) the EnGen object
with open("wf3_resulting_EnGen.pickle", "wb") as engen_out:
    pk.dump(engen, engen_out, pk.HIGHEST_PROTOCOL)

# 2) the clustering object holding the chosen clusters / conformations
with open("wf3_resulting_Clust.pickle", "wb") as clust_out:
    pk.dump(cluster_method, clust_out, pk.HIGHEST_PROTOCOL)