<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clustering-based" data-toc-modified-id="Clustering-based-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clustering based</a></span><ul class="toc-item"><li><span><a href="#modeling" data-toc-modified-id="modeling-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>modeling</a></span></li></ul></li><li><span><a href="#LDA-based" data-toc-modified-id="LDA-based-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>LDA based</a></span><ul class="toc-item"><li><span><a href="#Kmeans-clustering-based-on-LDA-topic-distribution-representation" data-toc-modified-id="Kmeans-clustering-based-on-LDA-topic-distribution-representation-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Kmeans clustering based on LDA topic distribution representation</a></span></li></ul></li></ul></div>

# Clustering based
- Steps:
    1. Transform into TF-IDF matrix
    2. Dimension reduction into 200
    3. Clustering in cosine similarity space (since it is word)
    4. Assign labels with majority vote based on training set labels
    5. Prediction
        1. Transform test set into TF-IDF matrix
        2. Dimension reduction into 200
        3. Make prediction based on the clusters and mapping between clusters and labels from training set
    6. Evaluation
        1. Based on classification report

## modeling

In [None]:
import numpy as np 
from sklearn import metrics
from clustering_utils import *
# count_vectorizer, dimension_reduction
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string


seeds = 2021
# np.random.seed(seeds)

import pandas as pd
train = pd.read_json('../data/structured_train.json')
test = pd.read_json('../data/structured_test.json')


train = train.groupby('label').sample(50, random_state=seeds)
test = test.groupby('label').sample(50, random_state=seeds)


select_cols = ["global_index", "doc_path", "label",
               "reply", "reference_one", "reference_two", "tag_reply", "tag_reference_one", "tag_reference_two",
               "Subject", "From", "Lines", "Organization", "contained_emails", "long_string", "text", "error_message"
               ]
print("\nmay use cols: \n", select_cols)
train = train[select_cols]
test = test[select_cols]


def normal_string(x):
    x = remove_stopwords(x)
    x = " ".join(preprocess_string(x))
    return x

# train_text = train['tag_reply']
# train_text = train['tag_reply']+ ' ' + train['tag_reference_one']
train_text = train['reply'] + ' ' + train['reference_one']
train_label = train['label']
train_text = train_text.apply(lambda x: normal_string(x))

# test_text  = test['tag_reply'] 
# test_text  = test['tag_reply'] + ' ' + test['tag_reference_one']
test_text  = test['reply'] + ' ' + test['reference_one']
test_label = test['label']
test_text = test_text.apply(lambda x: normal_string(x))

# LDA based 

In [54]:
num_topics = 100
lda, voc = fit_topic_model(train_text, num_topics=num_topics, save_name='gensim_lda_model')
# lda = load_gensim_LDA_model(save_name='lda_gensim_model')

In [59]:
# vis_lda = visualize_LDA_model(train_text, voc, lda)
# vis_lda

In [56]:
train_pred_group, train_topic_distribution = pred_topic_model(lda, train_text, vocabulary=voc)
group_to_label = link_group_to_label(train_label, train_pred_group)

test_pred_group, test_topic_distribution = pred_topic_model(lda, test_text, vocabulary=voc)
test_pred = test_pred_group.apply(lambda group: group_to_label[group])

Series([], dtype: float64)
Group to label mapping: 
Group 0 <-> label sci.electronics
Group 1 <-> label rec.sport.baseball
Group 2 <-> label sci.med
Group 3 <-> label sci.space
Group 4 <-> label rec.motorcycles
Group 5 <-> label sci.crypt
Group 6 <-> label sci.crypt
Group 7 <-> label talk.politics.mideast
Group 8 <-> label comp.graphics
Group 9 <-> label comp.sys.mac.hardware
Group 10 <-> label sci.space
Group 11 <-> label comp.graphics
Group 12 <-> label comp.sys.ibm.pc.hardware
Group 13 <-> label sci.electronics
Group 14 <-> label comp.os.ms-windows.misc
Group 15 <-> label talk.politics.guns
Group 16 <-> label comp.sys.mac.hardware
Group 17 <-> label comp.graphics
Group 18 <-> label comp.graphics
Group 19 <-> label misc.forsale
Group 20 <-> label comp.os.ms-windows.misc
Group 21 <-> label alt.atheism
Group 22 <-> label rec.motorcycles
Group 23 <-> label comp.os.ms-windows.misc
Group 24 <-> label comp.os.ms-windows.misc
Group 25 <-> label talk.politics.misc
Group 26 <-> label rec.moto

In [57]:
print(metrics.classification_report(y_true = test_label, y_pred=test_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.06      0.02      0.03       319
           comp.graphics       0.25      0.05      0.09       389
 comp.os.ms-windows.misc       0.12      0.04      0.06       394
comp.sys.ibm.pc.hardware       0.19      0.08      0.11       392
   comp.sys.mac.hardware       0.18      0.36      0.24       385
          comp.windows.x       0.26      0.25      0.26       395
            misc.forsale       0.32      0.15      0.21       390
               rec.autos       0.35      0.07      0.12       395
         rec.motorcycles       0.15      0.27      0.20       398
      rec.sport.baseball       0.18      0.32      0.23       397
        rec.sport.hockey       0.00      0.00      0.00       827
               sci.crypt       0.16      0.39      0.22       396
         sci.electronics       0.16      0.12      0.14       393
                 sci.med       0.04      0.07      0.05       198
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Kmeans clustering based on LDA topic distribution representation

In [58]:
dtm_train, transform_mapper = dimension_reduction(train_topic_distribution, out_dim=2)
dtm_test = transform_mapper.transform(test_topic_distribution)

clusterer, clusters_to_labels = fit_clustering_model(dtm_train, train_label, num_clusters=2, metric='Cosine', repeats=10)
pred = pred_clustering_model(dtm_test, clusterer, clusters_to_labels)
print(metrics.classification_report(y_true = test_label, y_pred=pred))

Dimension reduction with truncate SVD:
   input columns with  100
   output columns with  2
Group to label mapping: 
Group 0 <-> label soc.religion.christian
Group 1 <-> label comp.sys.ibm.pc.hardware


                          precision    recall  f1-score   support

             alt.atheism       0.00      0.00      0.00       319
           comp.graphics       0.00      0.00      0.00       389
 comp.os.ms-windows.misc       0.00      0.00      0.00       394
comp.sys.ibm.pc.hardware       0.09      0.96      0.16       392
   comp.sys.mac.hardware       0.00      0.00      0.00       385
          comp.windows.x       0.00      0.00      0.00       395
            misc.forsale       0.00      0.00      0.00       390
               rec.autos       0.00      0.00      0.00       395
         rec.motorcycles       0.00      0.00      0.00       398
      rec.sport.baseball       0.00      0.00      0.00       397
        rec.sport.hockey       0.00      0.00      0.00       827
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
