In [None]:
# !pip install spacy-langdetect
# !pip install language-detector
# !pip install symspellpy
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was last run with so re-runs are reproducible.
%pip install sentence-transformers==2.2.2
# !pip install umap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3

In [None]:
import os 
import pandas as pd
import numpy as np
import datetime
from gensim import corpora
import gensim
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

import keras
from keras.layers import Input, Bidirectional, LSTM, Dense, RepeatVector, Concatenate, Activation, Lambda, Dot, Softmax, TimeDistributed, Dropout, Layer
from keras.models import Model
from keras import backend as K
from sklearn.model_selection import train_test_split
import warnings
from nltk.tokenize import word_tokenize
warnings.filterwarnings('ignore')

In [None]:
# Download the 'punkt' tokenizer data.
# NOTE(review): presumably required by nltk's word_tokenize used later in this notebook - confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from google.colab import drive

""" connect google drive """
# Mount Google Drive at /content/gdrive; the dataset path used later lives under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from collections import Counter
from sklearn.metrics import silhouette_score
# import umap
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from gensim.models.coherencemodel import CoherenceModel
import numpy as np
import os

In [None]:
def get_topic_words(token_lists, labels, k=None):
  """
  Get the top words of each topic from clustering results.

  :param token_lists: token lists of docs, one list per document
  :param labels: cluster id of each document, aligned with token_lists
  :param k: number of topics; when None it is taken as the largest label + 1,
            so every label indexes a valid topic even if a cluster id is unused
  :return: list of k lists holding up to 10 most frequent words per topic,
           most frequent first (ties keep first-seen order); a topic without
           documents yields an empty list
  """
  if k is None:
    # len(np.unique(labels)) under-counts when a cluster id is missing, which
    # made the per-label lookup below raise IndexError.
    k = int(np.max(labels)) + 1 if len(labels) else 0

  word_counts = [Counter() for _ in range(k)]
  for i, tokens in enumerate(token_lists):
    # join + split keeps the original whitespace semantics: tokens containing
    # spaces are counted as separate words and empty tokens are ignored
    word_counts[labels[i]].update(' '.join(tokens).split())

  return [
    [word for word, _ in sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]]
    for counts in word_counts
  ]

def get_coherence(model, token_lists, measure='c_v'):
  """
  Compute the coherence of a fitted topic model with gensim's CoherenceModel.
  For non-LDA methods the per-topic coherence values are printed as well.
  :param model: Topic_Model object
  :param token_lists: token lists of docs
  :param measure: coherence metric handed to CoherenceModel
  :return: coherence score
  """
  common_args = dict(texts=token_lists, corpus=model.corpus,
                     dictionary=model.dictionary, coherence=measure)
  if model.method != 'LDA':
    # clustering based methods: a topic is the list of top words of a cluster
    top_words = get_topic_words(token_lists, model.cluster_model.labels_)
    scorer = CoherenceModel(topics=top_words, **common_args)
    print(scorer.get_coherence_per_topic())
  else:
    scorer = CoherenceModel(model=model.ldamodel, **common_args)
  return scorer.get_coherence()

def get_silhouette(model):
  """
  Silhouette score of the clustering stored on the model.
  :param model: Topic_Model object
  :return: silhouette score, or None when the model method is 'LDA'
  """
  if model.method != 'LDA':
    cluster_ids = model.cluster_model.labels_
    embeddings = model.vec[model.method]
    return silhouette_score(embeddings, cluster_ids)
  return None

def get_wordcloud(model, token_lists, topic):
  """
  Print the tokens of every document assigned to one topic.

  Despite the name, no image is rendered: the space-joined tokens of the
  matching documents are printed, followed by a blank line. Nothing is
  printed for the 'LDA' method.
  :param model: Topic_Model object
  :param token_lists: token lists of docs, aligned with the cluster labels
  :param topic: cluster id whose documents are printed
  :return: None
  """
  if model.method == 'LDA':
    return
  print('Getting wordcloud for topic {} ...'.format(topic))
  lbs = model.cluster_model.labels_
  # Select documents with plain iteration: np.array(token_lists) on token
  # lists of different lengths raises ValueError on NumPy >= 1.24.
  tokens = ' '.join(' '.join(doc) for doc, lb in zip(token_lists, lbs) if lb == topic)

  print(tokens)
  print()

In [None]:
from keras.models import Sequential

In [None]:
class Autoencoder:
  """
  Autoencoder for learning latent space representation
  architecture simplified for only one hidden layer

  :param latent_dim: width of the hidden (encoded) layer
  :param activation: activation used by both the encoder and decoder Dense layers
  :param epochs: number of training epochs used by fit
  :param batch_size: mini-batch size used by fit
  """

  def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128):
    self.latent_dim = latent_dim
    self.activation = activation
    self.epochs = epochs
    self.batch_size = batch_size
    # models are built lazily on the first call to fit
    self.autoencoder = None
    self.encoder = None
    self.decoder = None
    # return value of the last autoencoder.fit call
    self.his = None

  def _compile(self, input_dim):
    """
    compile the computational graph
    :param input_dim: number of features of the input vectors
    """
    input_vec = Input(shape=(input_dim,))
    encoded = Dense(self.latent_dim, activation=self.activation)(input_vec)
    decoded = Dense(input_dim, activation=self.activation)(encoded)
    self.autoencoder = Model(input_vec, decoded, name='autoencoder')
    self.encoder = Model(input_vec, encoded, name='encoder')
    # stand-alone decoder that reuses the last layer of the autoencoder
    encoded_input = Input(shape=(self.latent_dim,))
    decoder_layer = self.autoencoder.layers[-1]
    self.decoder = Model(encoded_input, decoder_layer(encoded_input), name='decoder')
    self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error)

  def fit(self, X):
    """
    Train the autoencoder to reconstruct X, building the models on first use.
    X is split with train_test_split; the held-out part is used as validation data.
    :param X: 2-D array-like, one row per sample
    """
    if self.autoencoder is None:
      print("input_dim形狀: ", X.shape)
      self._compile(X.shape[1])
    X_train, X_test = train_test_split(X)
    # honour the epochs / batch_size given to the constructor (they used to be
    # ignored in favour of hard-coded 200 / 128, which are still the defaults)
    self.his = self.autoencoder.fit(X_train, X_train, epochs=self.epochs,
                     batch_size=self.batch_size, shuffle=True,
                     validation_data=(X_test, X_test), verbose=0)

In [None]:
class Sampling(Layer):
  """
  Layer that samples a latent vector from (mu, log_var):
  mu + exp(0.5 * log_var) * noise, with noise drawn by K.random_normal.
  """

  def __init__(self, latent_dim):
    super().__init__()
    self.latent_dim = latent_dim

  def call(self, inputs):
    mean, log_variance = inputs
    # one noise row per sample in the batch
    batch = K.shape(mean)[0]
    noise = K.random_normal(shape=(batch, self.latent_dim))
    scale = K.exp(0.5 * log_variance)
    return mean + scale * noise

In [None]:
class NTM_Autoencoder:
  """
  Autoencoder with a sampled latent layer (mu / log_var / z).

  The encoder model outputs [mu, log_var, z]; the decoder maps z back to the
  input space. The compiled loss is mean squared error only - the KL term of
  vae_loss below is not part of it.

  :param latent_dim: size of mu, log_var and z
  :param intermediate_dim: width of the hidden Dense layers
  :param activation: activation of the hidden and output Dense layers
  :param epochs: number of training epochs used by fit
  :param batch_size: mini-batch size used by fit
  :param num_topics: currently unused; kept so existing callers keep working
  :param dropout: dropout rate applied after the encoder's hidden layer
  """

  def __init__(self, latent_dim=32, intermediate_dim = 96, activation='relu', epochs=200, batch_size=128, num_topics=15, dropout = 0.2):
    self.latent_dim = latent_dim
    self.intermediate_dim = intermediate_dim
    self.activation = activation
    self.epochs = epochs
    self.batch_size = batch_size
    self.dropout = dropout
    # models are built lazily on the first call to fit
    self.autoencoder = None
    self.encoder = None
    self.decoder = None
    # return value of the last autoencoder.fit call
    self.his = None
    # self.num_topics = num_topics

  def _compile(self, input_dim):
    """
    compile the computational graph
    :param input_dim: number of features of the input vectors
    """

    # Encoder
    input_vec = Input(shape=(input_dim,))
    encoded = Dense(self.intermediate_dim, activation=self.activation)(input_vec)
    encoded = Dropout(self.dropout)(encoded)
    mu = Dense(self.latent_dim)(encoded)
    log_var = Dense(self.latent_dim)(encoded)

    def sampling(args):
      # z = mean + exp(log_var / 2) * epsilon, epsilon drawn per sample
      z_mean, z_log_var = args
      epsilon = K.random_normal(shape=(K.shape(z_mean)[0], self.latent_dim), mean=0., stddev=1.)
      return z_mean + K.exp(z_log_var / 2) * epsilon

    # z = Sampling(self.latent_dim)([mu, log_var])
    z = Lambda(sampling)([mu, log_var])

    # Decoder
    decoder_input = Input(shape=(self.latent_dim,))
    x = Dense(self.intermediate_dim, activation=self.activation)(decoder_input)
    outputs = Dense(input_dim, activation=self.activation)(x)

    self.encoder = Model(input_vec, [mu, log_var, z], name='encoder')
    self.decoder = Model(decoder_input, outputs, name='decoder')
    # full model: decode the sampled z (third encoder output)
    outputs = self.decoder(self.encoder(input_vec)[2])
    self.autoencoder = Model(input_vec, outputs, name='autoencoder')

    def vae_loss(inputs, outputs):
      # reconstruction + KL loss; not used by the compile call below
      reconstruction_loss = K.mean(keras.losses.binary_crossentropy(inputs, outputs))
      kl_loss = -0.5 * K.sum(1 + log_var - K.square(mu) - K.exp(log_var))
      return K.mean(reconstruction_loss + kl_loss)

    # self.autoencoder.compile(optimizer='adam', loss=vae_loss)
    self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error)

  def predict(self, x_test):
    """
    Reconstruct x_test with the trained autoencoder.
    :param x_test: input vectors with the same feature count used in fit
    :return: output of autoencoder.predict
    :raises RuntimeError: if fit has not been called yet
    """
    # this used to read self.vae, an attribute that is never set
    if self.autoencoder is None:
      raise RuntimeError('NTM_Autoencoder.predict called before fit')
    return self.autoencoder.predict(x_test)

  def fit(self, X):
    """
    Train the autoencoder to reconstruct X, building the models on first use.
    X is split with train_test_split; the held-out part is used as validation data.
    :param X: 2-D array-like, one row per sample
    """
    if self.autoencoder is None:
      print("input_dim形狀: ", X.shape)
      self._compile(X.shape[1])
    X_train, X_test = train_test_split(X)
    # honour the epochs / batch_size given to the constructor (they used to be
    # ignored in favour of hard-coded 200 / 128, which are still the defaults)
    self.his = self.autoencoder.fit(X_train, X_train, epochs=self.epochs,
                     batch_size=self.batch_size, shuffle=True,
                     validation_data=(X_test, X_test), verbose=0)

In [None]:
# define model object
class Topic_Model:
  """
  Topic model over sentences using one of four representations:
  'TFIDF', 'LDA', 'BERT' or 'LDA_BERT'. For 'LDA' the gensim LDA model is the
  topic model; for the other methods the vectors are clustered (KMeans by default).
  """

  def __init__(self, k=10, method='TFIDF'):
    """
    :param k: number of topics
    :param method: method chosen for the topic model
    :raises ValueError: if method is not one of the supported names
    """
    if method not in {'TFIDF', 'LDA', 'BERT', 'LDA_BERT'}:
      raise ValueError('Invalid method!')
    self.k = k
    self.dictionary = None  # gensim id <-> term dictionary, built once
    self.corpus = None  # bag-of-words corpus the dictionary was built with
    # self.stopwords = None
    self.cluster_model = None
    self.ldamodel = None
    self.tfidf = None  # fitted TfidfVectorizer, reused for later calls
    self.vec = {}  # method name -> vectors computed in fit
    self.gamma = 15  # parameter for relative importance of lda
    self.method = method
    self.AE = None
    self.id = method + '_' + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

  def vectorize(self, sentences, token_lists, method=None):
    """
    Get vector representations from the selected method.

    The dictionary, the stored corpus and the TF-IDF vectorizer are created on
    the first call only. Later calls (e.g. out-of-sample prediction) reuse
    them, so new documents are mapped into the feature space the models were
    trained in and the training state is not overwritten.
    :param sentences: list of sentence strings
    :param token_lists: token lists of the same sentences
    :param method: representation to compute; defaults to self.method
    :return: one vector per sentence
    :raises ValueError: if method is not one of the supported names
    """
    # Default method
    if method is None:
      method = self.method

    # turn tokenized documents into a id <-> term dictionary (first call only)
    if self.dictionary is None:
      self.dictionary = corpora.Dictionary(token_lists)
    # convert the given tokenized documents into a document-term matrix
    corpus = [self.dictionary.doc2bow(text) for text in token_lists]
    if self.corpus is None:
      self.corpus = corpus

    if method == 'TFIDF':
      print('Getting vector representations for TF-IDF ...')
      if self.tfidf is None:
        self.tfidf = TfidfVectorizer()
        vec = self.tfidf.fit_transform(sentences)
      else:
        # reuse the fitted vocabulary so the feature columns stay the same
        vec = self.tfidf.transform(sentences)
      print('Getting vector representations for TF-IDF. Done!')
      return vec

    elif method == 'LDA':
      print('Getting vector representations for LDA ...')
      if not self.ldamodel:
        self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k,
                                id2word=self.dictionary, passes=20)

      def get_vec_lda(model, corpus, k):
        """
        Get the LDA vector representation (probabilistic topic assignments for all documents)
        :return: vec_lda with dimension: (n_doc * n_topic)
        """
        n_doc = len(corpus)
        vec_lda = np.zeros((n_doc, k))
        for i in range(n_doc):
          # get the distribution for the i-th document in corpus
          for topic, prob in model.get_document_topics(corpus[i]):
            vec_lda[i, topic] = prob

        return vec_lda

      vec = get_vec_lda(self.ldamodel, corpus, self.k)
      print('Getting vector representations for LDA. Done!')
      return vec

    elif method == 'BERT':
      print('Getting vector representations for BERT ...')
      from sentence_transformers import SentenceTransformer
      model = SentenceTransformer('bert-base-nli-max-tokens')
      vec = np.array(model.encode(sentences, show_progress_bar=True))
      print('Getting vector representations for BERT. Done!')
      return vec

    elif method == 'LDA_BERT':
      vec_lda = self.vectorize(sentences, token_lists, method='LDA')
      vec_bert = self.vectorize(sentences, token_lists, method='BERT')
      # concatenate the LDA topic probabilities (scaled by gamma) with the BERT embedding
      vec_ldabert = np.c_[vec_lda * self.gamma, vec_bert]
      self.vec['LDA_BERT_FULL'] = vec_ldabert
      if not self.AE:
        # self.AE = Autoencoder()
        self.AE = NTM_Autoencoder()
        print('Fitting Autoencoder ...')
        self.AE.fit(vec_ldabert)
        print('Fitting Autoencoder Done!')
      # the encoder outputs [mu, log_var, z]; the first output is the representation
      z_mean, _, _ = self.AE.encoder.predict(vec_ldabert)
      return z_mean

    # an unknown method used to fall through and return None
    raise ValueError('Invalid method!')

  def fit(self, sentences, token_lists, method=None, m_clustering=None):
    """
    Fit the topic model for selected method given the preprocessed data
    :param sentences: list of sentence strings
    :param token_lists: token lists of the same sentences
    :param method: representation to use; defaults to self.method
    :param m_clustering: clustering class called as m_clustering(k); defaults to KMeans
    :return: None
    """
    # Default method
    if method is None:
      method = self.method
    # Default clustering method
    if m_clustering is None:
      m_clustering = KMeans

    # turn tokenized documents into a id <-> term dictionary
    if self.dictionary is None:
      self.dictionary = corpora.Dictionary(token_lists)
      # convert tokenized documents into a document-term matrix
      self.corpus = [self.dictionary.doc2bow(text) for text in token_lists]

    ####################################################
    #### Getting ldamodel or vector representations ####
    ####################################################

    if method == 'LDA':
      if not self.ldamodel:
        print('Fitting LDA ...')
        self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k,
                                 id2word=self.dictionary, passes=20)
        print('Fitting LDA Done!')
    else:
      print('Clustering embeddings ...')
      self.cluster_model = m_clustering(self.k)
      self.vec[method] = self.vectorize(sentences, token_lists, method)

      self.cluster_model.fit(self.vec[method])
      print('Clustering embeddings. Done!')

  def predict(self, sentences, token_lists, out_of_sample=None):
    """
    Predict topics for the training documents or for new documents.
    :param sentences: list of sentence strings (read only when out_of_sample is truthy)
    :param token_lists: token lists of the same sentences (read only when out_of_sample is truthy)
    :param out_of_sample: truthy to label the given documents; None/False to
                          label the documents the model was fitted on
    :return: topic labels, one per document
    """
    # Default as False. `out_of_sample is not None` used to turn an explicit
    # False into True.
    out_of_sample = bool(out_of_sample)

    if out_of_sample:
      corpus = [self.dictionary.doc2bow(text) for text in token_lists]
      if self.method != 'LDA':
        vec = self.vectorize(sentences, token_lists)
    else:
      corpus = self.corpus
      vec = self.vec.get(self.method, None)

    if self.method == "LDA":
      # label = most probable topic of each document
      lbs = np.array(list(map(lambda x: sorted(self.ldamodel.get_document_topics(x),
                            key=lambda x: x[1], reverse=True)[0][0], corpus)))
    else:
      lbs = self.cluster_model.predict(vec)
    return lbs

In [None]:
# Location of the input CSV on the mounted Google Drive; the commented line is an alternative dataset.
# path = "/content/gdrive/MyDrive/fucking_paper/dataset/phone_segement.csv"
path = "/content/gdrive/MyDrive/fucking_paper/dataset/segement.csv"

In [None]:
# Switch TensorFlow to graph mode before any Keras model is built.
# NOTE(review): presumably needed because the autoencoder's sampling/loss
# closures operate on symbolic tensors - confirm.
# Imported from the public compat.v1 API rather than the private
# tensorflow.python package.
from tensorflow.compat.v1 import disable_eager_execution
disable_eager_execution()

In [None]:
def sentence_preprocess(review_list):
  """
  Normalise the fragments of one review and merge runs of one-word fragments.

  Every fragment is whitespace-normalised (split and re-joined with single
  spaces). Consecutive fragments consisting of exactly one word are collected
  and emitted as one space-joined sentence. A fragment with no words is kept
  as an empty string and, like any multi-word fragment, ends a pending run.
  :param review_list: iterable of fragment strings
  :return: list of sentence strings
  """
  merged = []
  pending_words = []  # current run of one-word fragments
  for fragment in review_list:
    words = fragment.split()
    if len(words) == 1:
      pending_words.append(words[0])
      continue
    if pending_words:
      merged.append(" ".join(pending_words))
      pending_words = []
    merged.append(" ".join(words))
  # flush a run that reaches the end of the review; it used to be dropped
  if pending_words:
    merged.append(" ".join(pending_words))
  return merged

In [None]:
# Driver cell: load the dataset, build sentence / token lists, fit the topic
# model and print its evaluation scores.
# def main():
# Run configuration: representation method and number of topics.
# method = "LDA_BERT"
method = "LDA"
ntopic = 15
# cwd is only referenced by the commented-out path construction below.
cwd = os.getcwd() 
# filename = "phone_segement.csv"
# txtPath = os.path.join(cwd,'dataset','cellphone',filename) 
# Undecodable bytes are dropped (errors='ignore') while reading the CSV.
with open(path, 'r', encoding="utf-8",errors='ignore') as file:
  meta = pd.read_csv(file)
# print(meta.shape) #(24130, 16)
rws = meta.segement

sentences = []  # sentence level preprocessed
token_lists = []  # word level preprocessed
# idx_in = []  # index of sample selected

for i, review in enumerate(rws):
  # Each cell appears to hold a stringified list of fragments: brackets and
  # single quotes are removed and the text is split on commas, so a comma
  # inside a fragment also splits it.
  review = review.replace("[","").replace("]","").replace("'","").split(",")
  new_review = sentence_preprocess(review)
  for sentence in new_review:
    sentences.append(sentence.lstrip().rstrip())
    # print(sentence.lstrip().rstrip())
    token_list = word_tokenize(sentence.lstrip().rstrip())
    token_lists.append(token_list)

# # print("token len: ",len(token_lists)) [ [w1,w2,w3..], [w1,w2], ...]
# print("sentences len: ",len(sentences)) # 35116 sentences  # 275961 sentences

tm = Topic_Model(k = ntopic, method = method)
# Fit the topic model by chosen method
tm.fit(sentences, token_lists)

# get_silhouette returns None when the method is 'LDA'.
print('Coherence:', get_coherence(tm, token_lists, 'c_v'))
print('Silhouette Score:', get_silhouette(tm))


Fitting LDA ...
Fitting LDA Done!
Coherence: 0.2873957365601844
Silhouette Score: None


In [None]:
# Print the tokens of the documents assigned to each topic.
# Note: get_wordcloud returns without printing anything when tm.method is 'LDA'.
# visualize(tm)
for i in range(tm.k):
  get_wordcloud(tm, token_lists, i)

In [None]:
# if __name__ == '__main__':
#   main()