# FateZ Clustering 

This notebook demonstrates how to apply clustering methods to the representations produced by FateZ.

In [1]:
import os
import torch
import numpy as np
from torch.utils.data import DataLoader
from pkg_resources import resource_filename
from sklearn import cluster
import fatez.test as test
import fatez.model as model
import scanpy as sc

# Ignoring warnings because of using LazyLinear
# NOTE(review): the filter below has no category/message/module restriction, so it
# silences every Python warning raised after this cell, not only LazyLinear's.
import warnings
warnings.filterwarnings('ignore')

### Initialize testing model first.

In [2]:
# Helper object from fatez.test; later cells also use it to build the fake dataset.
faker = test.Faker()
# test_full_model() yields two values: the first is kept as `testM` (used later via
# testM.get_encoder_output(...)), the second is discarded.
testM, _ = faker.test_full_model()
# Disabled on purpose: presumably these would save the test models to the given
# paths -- TODO confirm against fatez.model.Save before re-enabling.
# model.Save(faker.test_gat(), '../data/ignore/gat.model')
# model.Save(testM, '../data/ignore/trainer.model')

Testing Full Model.

	Pre-Trainer Green.

	Fine-Tuner Green.

	Explainer Green.



### Get the fake dataset

In [3]:
# Pull the entire fake dataset out as a single batch (batch size == dataset size),
# so one draw from the loader is enough.
dataset = faker.make_data_loader().dataset
full_loader = DataLoader(dataset, batch_size = len(dataset))
x, labels = next(iter(full_loader))

# x carries two components per batch; by their names, presumably the feature
# matrices and the adjacency matrices.
all_fea_mat, all_adj_mat = x[0], x[1]
print(f'Labels:\n{labels.tolist()}')

Labels:
[1, 1, 3, 3, 3, 0, 2, 0, 1, 2]


### Process original data

In [4]:
# Flatten each sample's feature matrix into one row (via Python lists, as before)
flat_rows = [ele.reshape(-1).tolist() for ele in all_fea_mat]
origin = np.array(flat_rows)

# PCA for dimensionality reduction.
# With return_info = True the result is indexable: [0] is taken as the projected
# data and [2] as the explained-variance ratios.
pca_analysis = sc.pp.pca(origin, n_comps = 9, return_info = True)
origin_pca, var_ratios = pca_analysis[0], pca_analysis[2]
print(f'Origin Data Var Ratios:\n{var_ratios}')


Origin Data Var Ratios:
[0.23292051 0.16507269 0.15504806 0.11590058 0.1086289  0.08842403
 0.07736817 0.04511147 0.0115256 ]


### Process data with encoder

In [5]:
# Get encoded representations made by the GAT -> BERT encoder,
# then flatten each sample's output into one row.
encoder_out = testM.get_encoder_output(all_fea_mat, all_adj_mat)
encode = np.array([ele.reshape(-1).tolist() for ele in encoder_out])

# PCA for dimensionality reduction.
# With return_info = True the result is indexable: [0] is taken as the projected
# data and [2] as the explained-variance ratios.
pca_analysis = sc.pp.pca(encode, n_comps = 9, return_info = True)
encode_pca, var_ratios = pca_analysis[0], pca_analysis[2]
print(f'Encoded Rep Var Ratios:\n{var_ratios}')

Encoded Rep Var Ratios:
[7.24497723e-01 1.99142691e-01 7.31430602e-02 2.65173665e-03
 4.34263254e-04 1.27896569e-04 2.23717537e-06 3.91730420e-07
 0.00000000e+00]


### Set clustering models and fit models with original data

In [6]:
# Hyper-parameters shared by the clustering runs in this notebook.
eps = 0.5                             # DBSCAN neighbourhood radius
n_clusters = len(np.unique(labels))   # one KMeans cluster per distinct label
min_samples = 5                       # OPTICS minimum neighbourhood size

dbscan = cluster.DBSCAN(eps = eps)
# KMeans starts from random centroids; a fixed random_state makes the labels
# reproducible on Restart & Run All and keeps the comparison with the
# encoded-representation run free of initialisation noise.
kmeans = cluster.KMeans(n_clusters = n_clusters, random_state = 0)
optics = cluster.OPTICS(min_samples = min_samples)

# Fit every model on the PCA-reduced original features.
dbscan.fit(origin_pca)
kmeans.fit(origin_pca)
optics.fit(origin_pca)

# Get labels (printed in order: DBSCAN, KMeans, OPTICS)
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
[2 1 0 0 2 3 3 1 2 3]
[0 0 0 0 0 0 0 0 0 0]


### Reset models and fit with encoded representations

In [7]:
# Fresh, unfitted models with the same hyper-parameters as the previous cell.
dbscan = cluster.DBSCAN(eps = eps)
# KMeans starts from random centroids; a fixed random_state makes the labels
# reproducible on Restart & Run All instead of changing with every execution.
kmeans = cluster.KMeans(n_clusters = n_clusters, random_state = 0)
optics = cluster.OPTICS(min_samples = min_samples)

# Fit every model on the PCA-reduced encoded representations.
dbscan.fit(encode_pca)
kmeans.fit(encode_pca)
optics.fit(encode_pca)

# Get labels (printed in order: DBSCAN, KMeans, OPTICS)
print(dbscan.labels_.astype(int))
print(kmeans.labels_.astype(int))
print(optics.labels_.astype(int))

[-1 -1  0  0  0  0  0 -1 -1 -1]
[1 1 0 0 0 0 0 2 0 3]
[0 0 0 0 0 0 0 0 0 0]
