In [None]:
import os
import glob
import faiss
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix
from sentence_transformers import SentenceTransformer
# accuracy_score is imported from the public `sklearn.metrics` namespace:
# the `sklearn.metrics.classification` module path is not importable in
# recent scikit-learn releases.
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import (StandardScaler, MinMaxScaler)
print("Tensorflow : ",tf.__version__)
print("Imported libraries")

In [None]:
# Load the labelled short-text dataset and normalise it for the rest of the notebook.
path_df = ".../multiclass_classification_data.csv"
# Fixed seed so the shuffled row order (and therefore every downstream
# result that depends on row order) is the same on each Restart & Run All.
SEED = 42
# String columns that are lower-cased and stripped of surrounding whitespace.
text_columns = ['Short_Text', 'ML_Category', 'data_type']

data = pd.read_csv(path_df)
data = data.assign(**{col: data[col].str.lower().str.strip() for col in text_columns})
# Drop incomplete and duplicated rows, then switch to the short column names
# ('text' for the input, 'y' for the label) used by the later cells.
data = (
    data.dropna()
        .drop_duplicates()
        .rename(columns={"Short_Text":'text', 'ML_Category':'y'})
)
data = shuffle(data, random_state=SEED)
print(data.shape)
data.head()

In [None]:
# Frame used to fit the clustering: rows from every split ('train', 'test', 'val').
# NOTE(review): the 'val' rows selected here are the same rows that make up
# `test` below, so the two frames overlap — confirm this is intended.
train = data.loc[data['data_type'].isin(['train', 'test', 'val'])].reset_index(drop=True)
y_train = train[['y']]
x_train = train[['text', 'y']].rename(columns={"y":'y_true'})

# Frame used for evaluation: the 'val' rows only.
test = data.loc[data['data_type'].isin(['val'])].reset_index(drop=True)
y_test = test[['y']]
x_test = test[['text', 'y']].rename(columns={"y":'y_true'})
x_train.shape, x_test.shape

In [None]:
class FaissKMeans:
    '''
    Small scikit-learn-flavoured wrapper around faiss.Kmeans.

    n_clusters : number of centroids (forwarded to faiss as k)
    n_init     : forwarded to faiss as nredo (run the clustering this number
                 of times and keep the best centroids)
    max_iter   : forwarded to faiss as niter
    n_redo     : stored on the instance only; it is NOT forwarded to faiss
                 (n_init is what drives nredo), kept so existing calls still work
    verbose    : forwarded to faiss (make clustering more verbose)
    seed       : seed for the faiss random number generator; when None the
                 faiss default is left untouched
    gpu        : forwarded to faiss as its gpu option (default True, as before)
    '''

    def __init__(self, n_clusters=135, n_init=10, max_iter=100, n_redo=1, verbose=False, seed=None, gpu=True):
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.max_iter = max_iter
        self.n_redo = n_redo
        self.verbose = verbose
        self.seed = seed
        self.gpu = gpu
        # Populated by fit().
        self.kmeans = None
        self.cluster_centers_ = None
        self.inertia_ = None

    def sampling(self,which=None):
        '''Placeholder; currently does nothing.'''
        pass

    def fit(self, X):
        '''
        Train faiss.Kmeans on X (2-D array-like, one row per sample).

        After training, cluster_centers_ holds the faiss centroids and
        inertia_ holds the last entry of the faiss objective history.

        Other faiss clustering options, not set here:
        spherical     : perform spherical k-means -- the centroids are L2 normalized after each iteration
        int_centroids : round centroids coordinates to integer
        update_index  : re-train index after each iteration?
        min_points_per_centroid / max_points_per_centroid: below, you get a warning, above the training set is subsampled
        '''
        # faiss works on float32 data; make the conversion (and contiguity) explicit.
        X = np.ascontiguousarray(X, dtype=np.float32)
        options = dict(niter=self.max_iter,
                       nredo=self.n_init,
                       verbose=bool(self.verbose),
                       gpu=self.gpu)
        # Only override the faiss seed when the caller asked for one.
        if self.seed is not None:
            options['seed'] = self.seed
        self.kmeans = faiss.Kmeans(d=X.shape[1], k=self.n_clusters, **options)
        self.kmeans.train(X)
        self.cluster_centers_ = self.kmeans.centroids
        self.inertia_ = self.kmeans.obj[-1]

    def get_best_cluster(self, X):
        '''
        Search the trained index for the single nearest centroid of each row of X.

        Returns whatever faiss' index.search returns for k=1 (unpacked by
        callers in this class as (D, I), with I the centroid ids).
        '''
        return self.kmeans.index.search(np.ascontiguousarray(X, dtype=np.float32), 1)

    def iteration_stats(self):
        '''Return the per-iteration statistics recorded by faiss during training.'''
        return self.kmeans.iteration_stats

    def inertia(self):
        '''Return the full objective history recorded by faiss during training.'''
        return self.kmeans.obj

    def get_pred_labels(self, x_data, y_data):
        '''
        Label every row of x_data with the most frequent label of its cluster.

        x_data : numpy [batch, dim] array of embeddings
        y_data : pandas df with a column named y, one row per row of x_data
                 (rows are matched by position, not by index label)

        The cluster -> label mapping is computed from y_data itself: each
        cluster gets the mode of the y values of the rows assigned to it.

        Returns a numpy array with one predicted label per row of x_data.
        Raises ValueError when x_data and y_data disagree on the number of rows.
        '''
        _, nearest = self.get_best_cluster(x_data)
        assignments = np.asarray(nearest).ravel()
        labels = y_data['y']
        if len(labels) != len(assignments):
            raise ValueError(
                "x_data and y_data must have the same number of rows "
                "(got %d and %d)" % (len(assignments), len(labels)))
        # Positional (.iloc) lookup: the row numbers come from the search
        # result, so they must not be interpreted as index labels.
        cluster_to_label = {
            cluster: labels.iloc[np.flatnonzero(assignments == cluster)].mode().values[0]
            for cluster in np.unique(assignments)
        }
        return np.array([cluster_to_label[cluster] for cluster in assignments])

In [None]:
def scalers(train, test, scale=None):
  '''
  Optionally rescale the embedded train/test sets.

  train : embedded train set (the scaler is fitted on this)
  test  : embedded test set (transformed with the scaler fitted on train)
  scale : 'minmax' -> MinMaxScaler, 'standard' -> StandardScaler,
          anything else (including None) -> no scaling

  Returns (scaled_train, scaled_test, scaler); when no scaling is applied
  the inputs are returned unchanged and scaler is None.
  '''
  if scale == 'minmax':
    scaler = MinMaxScaler()
  elif scale == 'standard':
    scaler = StandardScaler()
  else:
    # No recognised scaling requested: pass the data straight through.
    return train, test, None

  X_TRAIN_S = scaler.fit_transform(train)
  X_TEST_S = scaler.transform(test)
  return X_TRAIN_S, X_TEST_S, scaler


def which_embedding_model(name):
  '''
  Build a text -> embedding callable for the requested model.

  name =
  tensorflow_use : Tensorflow Universal Sentence Embedding 512 dim.
  sbert          : 384 dim
  sbert_all      : 384 dim
  mpnet_all      : 768 dim

  Returns a function taking one text and returning its embedding
  (the squeezed model output for tensorflow_use, the result of
  SentenceTransformer.encode for the other names).

  Raises ValueError when name is not one of the names listed above.
  '''
  # SentenceTransformer checkpoint behind each sentence-transformers name.
  sbert_checkpoints = {
    'sbert': 'paraphrase-MiniLM-L6-v2',
    'sbert_all': 'all-MiniLM-L6-v2',
    'mpnet_all': 'all-mpnet-base-v2',
  }

  if name == 'tensorflow_use':
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
    sentence_model = hub.load(module_url)
    # The model is called with a one-element batch; squeeze drops that batch axis.
    return lambda x: tf.squeeze(sentence_model([x]), axis=0)

  if name in sbert_checkpoints:
    sentence_model = SentenceTransformer(sbert_checkpoints[name])
    return lambda x: sentence_model.encode(x)

  # Fail here rather than returning None and breaking later inside .apply().
  raise ValueError(
    "Unknown embedding model %r; expected one of %s"
    % (name, ['tensorflow_use'] + list(sbert_checkpoints)))


In [None]:
# Embed every text, one row at a time, with the selected sentence model.
get_embeddings = which_embedding_model('mpnet_all')
for frame in (x_train, x_test):
    frame['emb'] = frame['text'].apply(get_embeddings)

# Stack the per-row embeddings into [instance, dim] float16 matrices
# (which_embedding_model documents 'mpnet_all' as 768 dim).
X_TRAIN, X_TEST = (
    np.array(frame['emb'].tolist(), dtype=np.float16)
    for frame in (x_train, x_test)
)
print(X_TRAIN.shape, X_TEST.shape)

In [None]:
# ----------------------------------------------------------------------
# K-means sweep: fit one FaissKMeans per value in k_range on the
# (optionally scaled) train embeddings and collect one metrics record per k.
# NOTE(review): get_pred_labels builds its cluster -> label mapping from the
# y_test labels passed in, so acc / prec / recall / fscore are computed
# against the same labels that defined the mapping — confirm this is the
# intended evaluation.
# ----------------------------------------------------------------------
all_models = []
k_range = [400, 510]
scales = None
X_TRAIN_S, X_TEST_S, scaler = scalers(X_TRAIN, X_TEST, scales)
for n_clusters in k_range:
  model = FaissKMeans(n_clusters=n_clusters, n_init=10, max_iter=100)
  model.fit(X_TRAIN_S)

  # Clustering quality on the data the model was fitted on.
  train_clusters = model.get_best_cluster(X_TRAIN_S)[1].ravel()
  silhouette_scores = silhouette_score(X_TRAIN_S, train_clusters)
  inertia = model.kmeans.obj[-1]

  # Classification-style metrics on the test embeddings.
  y_pred = model.get_pred_labels(X_TEST_S, y_test)
  acc = accuracy_score(x_test.y_true, y_pred)
  precision, recall, fscore, _ = precision_recall_fscore_support(
      x_test.y_true, y_pred, average='macro', labels=np.unique(x_test.y_true))

  record = {'scale':str(scales), 'k':n_clusters, 'sil':silhouette_scores, 'iner':inertia,
            'prec':precision, 'recall':recall, "fscore":fscore, 'acc':acc}
  all_models.append(record)
  print("\tDone: ",n_clusters)

In [None]:
# One row per fitted k; show the two best unscaled runs, ranked by
# accuracy and then macro F-score (highest first).
model_output = pd.DataFrame(all_models)
top_runs = (
    model_output[model_output['scale'].eq('None')]
    .sort_values(by=['acc', 'fscore'], ascending=False)
    .head(2)
)
print(top_runs)