<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clustering-based" data-toc-modified-id="Clustering-based-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clustering based</a></span><ul class="toc-item"><li><span><a href="#modeling" data-toc-modified-id="modeling-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>modeling</a></span></li></ul></li></ul></div>

In [1]:
import numpy as np 
from sklearn import metrics
from clustering_utils import *
# count_vectorizer, dimension_reduction'
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string


seeds = 2021
# NOTE(review): the seed is declared but not applied anywhere in this cell --
# both the global NumPy seeding and the seeded subsampling below are disabled.
# np.random.seed(seeds)

import pandas as pd

# Load the structured train / test splits (paths are relative to the notebook).
train = pd.read_json('../data/structured_train.json')
test = pd.read_json('../data/structured_test.json')

# Optional per-label subsample of 50 rows for quick iterations (disabled).
# train = train.groupby('label').sample(50, random_state=seeds)
# test = test.groupby('label').sample(50, random_state=seeds)

# Columns kept for the rest of the notebook; one logical group per line.
select_cols = [
    "global_index", "doc_path", "label",
    "reply", "reference_one", "reference_two",
    "tag_reply", "tag_reference_one", "tag_reference_two",
    "Subject", "From", "Lines", "Organization",
    "contained_emails", "long_string", "text", "error_message",
]
print("\nmay use cols: \n", select_cols)

# Restrict both frames to the same column set, in the order listed above.
train, test = train.loc[:, select_cols], test.loc[:, select_cols]


def normal_string(x):
    """Normalise one raw document into a single space-separated token string.

    The text is first passed through gensim's ``remove_stopwords`` and then
    through ``preprocess_string``; the resulting tokens are joined with a
    single space.

    Parameters
    ----------
    x : str or None or float
        Raw document text. ``None`` and float ``NaN`` are treated as a
        missing document.

    Returns
    -------
    str
        The normalised text, or ``""`` when the input is missing.
    """
    # Concatenating string columns in pandas yields NaN (a float) whenever one
    # side is missing; map that to an empty document instead of letting the
    # gensim helpers fail on a non-string input.
    if x is None or (isinstance(x, float) and x != x):
        return ""
    # NOTE(review): preprocess_string's default filters appear to include
    # stopword removal already -- confirm whether this first pass is needed.
    x = remove_stopwords(x)
    return " ".join(preprocess_string(x))

# Feature text is the `reply` column joined to the `reference_one` column.
# Other column choices that were tried (shown for train; test is analogous):
#   train['tag_reply']
#   train['tag_reply'] + ' ' + train['tag_reference_one']
def _joined_text(frame):
    """Join `reply` and `reference_one` with one space, then normalise each row."""
    raw_text = frame['reply'] + ' ' + frame['reference_one']
    return raw_text.apply(normal_string)


train_text = _joined_text(train)
train_label = train['label']

test_text = _joined_text(test)
test_label = test['label']


may use cols: 
 ['global_index', 'doc_path', 'label', 'reply', 'reference_one', 'reference_two', 'tag_reply', 'tag_reference_one', 'tag_reference_two', 'Subject', 'From', 'Lines', 'Organization', 'contained_emails', 'long_string', 'text', 'error_message']


# Clustering based
- Steps:
    1. Transform into TF-IDF matrix
    2. Reduce the dimensionality to 200 components (truncated SVD)
    3. Cluster in cosine-similarity space (the features are word-based vectors, so direction matters more than magnitude)
    4. Assign each cluster a label by majority vote over the training-set labels of its members
    5. Prediction
        1. Transform test set into TF-IDF matrix
        2. Apply the dimension reduction fitted on the training set (200 components)
        3. Make prediction based on the clusters and mapping between clusters and labels from training set
    6. Evaluation
        1. Based on classification report
        
- Time complexity 
    - $O(n^{dk+1})$, where $n$ is the number of observations, $d$ is the dimensionality, and $k$ is the number of clusters

## modeling

In [2]:
# Experiment: TF-IDF (min_df=2) -> 200 dimensions -> 50 clusters, cosine metric.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# no classification report was produced for this configuration.

# 1) Vectorise both splits.
tfidf_train, tfidf_test, word_to_idx, tfidf_vect = tfidf_vectorizer(
    train_text, test_text, min_df=2
)

# 2) Build the reducer from the training matrix, then apply it to the test matrix.
dtm_train, transform_mapper = dimension_reduction(tfidf_train, out_dim=200)
dtm_test = transform_mapper.transform(tfidf_test)

print('dtm_train.shape', dtm_train.shape)
print('dtm_test.shape', dtm_test.shape)

# 3) Cluster the training vectors and obtain the cluster -> label mapping.
clusterer, clusters_to_labels = fit_clustering_model(
    dtm_train,
    train_label,
    num_clusters=50,
    metric='Cosine',
    repeats=2,
)

# 4) Predict test labels through the mapping and report per-class metrics.
pred = pred_clustering_model(dtm_test, clusterer, clusters_to_labels)
report = metrics.classification_report(y_true=test_label, y_pred=pred)
print(report)

num of words: 23563
Dimension reduction with truncate SVD:
   input columns with  23563
   output columns with  200
dtm_train.shape (11083, 200)
dtm_test.shape (7761, 200)


  return vector / sqrt(numpy.dot(vector, vector))


KeyboardInterrupt: 

In [5]:
# Experiment: TF-IDF (min_df=2, max_df=0.99) -> 500 dimensions -> 50 clusters,
# L2 metric, 20 repeats.

# 1) Vectorise both splits.
tfidf_train, tfidf_test, word_to_idx, tfidf_vect = tfidf_vectorizer(
    train_text, test_text, min_df=2, max_df=0.99
)

# 2) Build the reducer from the training matrix, then apply it to the test matrix.
dtm_train, transform_mapper = dimension_reduction(tfidf_train, out_dim=500)
dtm_test = transform_mapper.transform(tfidf_test)

print('dtm_train.shape', dtm_train.shape)
print('dtm_test.shape', dtm_test.shape)

# 3) Cluster the training vectors and obtain the cluster -> label mapping.
clusterer, clusters_to_labels = fit_clustering_model(
    dtm_train,
    train_label,
    num_clusters=50,
    metric='L2',
    repeats=20,
)

# 4) Predict test labels through the mapping and report per-class metrics.
pred = pred_clustering_model(dtm_test, clusterer, clusters_to_labels)
report = metrics.classification_report(y_true=test_label, y_pred=pred)
print(report)

num of words: 23563
Dimension reduction with truncate SVD:
   input columns with  23563
   output columns with  500
dtm_train.shape (11083, 500)
dtm_test.shape (7761, 500)
Group to label mapping: 
Group 0 <-> label talk.politics.guns
Group 1 <-> label alt.atheism
Group 2 <-> label rec.sport.baseball
Group 3 <-> label comp.os.ms-windows.misc
Group 4 <-> label rec.autos
Group 5 <-> label comp.graphics
Group 6 <-> label talk.politics.misc
Group 7 <-> label comp.sys.mac.hardware
Group 8 <-> label talk.politics.guns
Group 9 <-> label sci.crypt
Group 10 <-> label rec.sport.baseball
Group 11 <-> label comp.sys.ibm.pc.hardware
Group 12 <-> label sci.crypt
Group 13 <-> label comp.os.ms-windows.misc
Group 14 <-> label comp.sys.ibm.pc.hardware
Group 15 <-> label misc.forsale
Group 16 <-> label talk.politics.guns
Group 17 <-> label talk.politics.guns
Group 18 <-> label sci.space
Group 19 <-> label comp.sys.mac.hardware
Group 20 <-> label sci.crypt
Group 21 <-> label comp.graphics
Group 22 <-> labe

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
