<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Clustering-based" data-toc-modified-id="Clustering-based-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Clustering based</a></span><ul class="toc-item"><li><span><a href="#modeling" data-toc-modified-id="modeling-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>modeling</a></span></li><li><span><a href="#prediction" data-toc-modified-id="prediction-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>prediction</a></span></li><li><span><a href="#evaluation" data-toc-modified-id="evaluation-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>evaluation</a></span></li></ul></li></ul></div>

In [1]:
from sklearn.metrics import pairwise_distances
from sklearn import metrics
from sklearn import mixture
from sklearn.cluster import KMeans
from nltk.cluster import KMeansClusterer, cosine_distance
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from matplotlib import pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from IPython.core.interactiveshell import InteractiveShell
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support, classification_report, roc_curve, auc, precision_recall_curve
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Single seed value reused by the stochastic steps of this notebook.
seeds = 1234
# Seeds NumPy's global RNG. NOTE(review): components that carry their own RNG
# are presumably not covered by this call — confirm per library.
np.random.seed(seeds)

In [3]:
# Load the structured train/test splits; the paths are relative to the notebook's working directory.
train = pd.read_json('../data/structured_train.json')
test = pd.read_json('../data/structured_test.json')

In [4]:
# train = train.groupby('label').sample(50, random_state=seeds)
# test = test.groupby('label').sample(50, random_state=seeds)

In [5]:
# Columns that later cells may use; both frames are narrowed to exactly this list.
select_cols = ["global_index", "doc_path", "label", "reply", "reference_one", "reference_two",
               "Subject", "From", "Lines", "Organization", "contained_emails", "long_string", "text", "error_message"]
print("\nmay use cols: \n", select_cols)
# NOTE: `train` / `test` are rebound here, so any column outside select_cols is no longer reachable afterwards.
train = train[select_cols]
test = test[select_cols]


may use cols: 
 ['global_index', 'doc_path', 'label', 'reply', 'reference_one', 'reference_two', 'Subject', 'From', 'Lines', 'Organization', 'contained_emails', 'long_string', 'text', 'error_message']


# Clustering based
- Steps:
    1. Transform into TF-IDF matrix
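    2. Reduce the TF-IDF matrix to 200 components with truncated SVD
    3. Cluster the reduced vectors with K-means using cosine distance (for text vectors, direction matters more than magnitude)
    4. Map each cluster to a label by majority vote over the training-set labels of its members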
    5. Prediction
        1. Transform test set into TF-IDF matrix
        2. Reduce to 200 components using the SVD fitted on the training set
        3. Assign each test document to a cluster and predict the label that the cluster was mapped to on the training set
    6. Evaluation
        1. Based on classification report

## modeling

In [6]:
# Model input: the reply text followed by the first quoted reference, separated by a space.
# Missing fields are replaced with '' before concatenation; otherwise a single missing
# field would turn the whole document into NaN, which the TF-IDF vectorizer cannot process.
train_text = train['reply'].fillna('') + ' ' + train['reference_one'].fillna('')
train_label = train['label']
test_text  = test['reply'].fillna('') + ' ' + test['reference_one'].fillna('')
test_label = test['label']

In [7]:
from sklearn.decomposition import TruncatedSVD

def tfidf_vectorizer(train_text, test_text, min_df=3):
    """Turn the train and test documents into TF-IDF matrices.

    The vectorizer is fitted on ``train_text`` only and then applied unchanged
    to ``test_text``, so both matrices share the same vocabulary.

    Parameters
    ----------
    train_text : iterable of str
        Documents used to fit the vectorizer.
    test_text : iterable of str
        Documents transformed with the fitted vectorizer.
    min_df : int, default 3
        Forwarded to ``TfidfVectorizer(min_df=...)``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, vocabulary, fitted_vectorizer)`` where
        ``vocabulary`` is the vectorizer's ``vocabulary_`` mapping.
    """
    vectorizer_options = dict(
        stop_words="english",
        # one word character, an optional apostrophe, then further word characters
        token_pattern=r'\b\w[\']?\w*\b',
        norm='l2',
        min_df=min_df,
        max_df=0.95,
    )
    fitted_vectorizer = TfidfVectorizer(**vectorizer_options)

    train_matrix = fitted_vectorizer.fit_transform(train_text)
    test_matrix = fitted_vectorizer.transform(test_text)

    vocabulary = fitted_vectorizer.vocabulary_
    print("num of words:", len(vocabulary))
    return train_matrix, test_matrix, vocabulary, fitted_vectorizer


def dimension_reduction(dtm, out_dim=200, verbose=0, random_state=None):
    """Reduce a document-term matrix to ``out_dim`` columns with truncated SVD.

    Parameters
    ----------
    dtm : matrix-like with a ``shape`` attribute
        Input matrix; its second dimension is reported as the input width.
    out_dim : int, default 200
        Number of SVD components to keep.
    verbose : int, default 0
        When greater than 0, the fitted singular values are printed.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``TruncatedSVD``. ``None`` keeps the estimator's default
        behaviour; pass an int to get the same projection on every call.

    Returns
    -------
    tuple
        ``(reduced_matrix, transform_mapper)``. ``transform_mapper`` is the
        fitted ``TruncatedSVD`` and must be reused (``.transform``) to project
        any other matrix into the same space.
    """
    print("Dimension reduction with truncate SVD:")
    print("   input columns with ", dtm.shape[1])
    print("   output columns with ", out_dim)

    transform_mapper = TruncatedSVD(n_components=out_dim, random_state=random_state)
    reduced = transform_mapper.fit_transform(dtm)
    if verbose > 0:
        print("singular_values_: ", transform_mapper.singular_values_)
    return reduced, transform_mapper

In [8]:
def fit_clustering_model(dtm_train, train_label, num_clusters, metric='Cosine', model='KMeans', repeats=20,
                         random_state=None):
    '''
    Cluster the training vectors and map every cluster to a class label.

    The vectors are clustered with NLTK's K-means using cosine distance. Each
    cluster is then assigned the label that is most frequent among the
    training documents that fell into it (majority vote).

    Parameters
    ----------
    dtm_train : sequence of vectors
        Training vectors, one per document.
    train_label : sequence
        True label of every training document, aligned with ``dtm_train``.
    num_clusters : int
        Number of K-means clusters.
    metric : str, default 'Cosine'
        Distance metric; only 'Cosine' is supported.
    model : str, default 'KMeans'
        Clustering algorithm; only 'KMeans' is supported.
    repeats : int, default 20
        Number of randomised clustering trials, forwarded to the clusterer.
    random_state : int or None, default None
        Seed for the clusterer. NLTK's clusterer draws from its own
        ``random.Random`` instance (its ``rng`` argument), so seeding NumPy does
        not make it repeatable. ``None`` keeps the clusterer's default RNG.

    Returns
    -------
    tuple
        ``(clusterer, clusters_to_labels)``. ``clusters_to_labels`` is a pandas
        Series indexed by cluster id. It has an entry for every id in
        ``range(num_clusters)``: a cluster that received no training document
        gets the overall most frequent training label, so a test document
        landing in it can still be given a prediction.
    '''
    import random  # local import: only needed to build the optional clusterer RNG

    assert metric in ['Cosine']
    assert model in ['KMeans']

    # model training (a Gaussian mixture was tried as an alternative and performed poorly here)
    rng = None if random_state is None else random.Random(random_state)
    clusterer = KMeansClusterer(num_clusters, cosine_distance, normalise=False, repeats=repeats,
                                avoid_empty_clusters=True, rng=rng)
    # assign_clusters=True already returns the cluster id of every training vector,
    # so there is no need to classify the whole training set a second time.
    train_cluster_pred = clusterer.cluster(dtm_train, assign_clusters=True)

    # Mapping clusters into labels: majority vote per cluster
    df = pd.DataFrame(list(zip(train_label, train_cluster_pred)), columns=['actual_class', 'cluster'])
    confusion = pd.crosstab(index=df.cluster, columns=df.actual_class)
    clusters_to_labels = confusion.idxmax(axis=1)

    # Clusters without any training member are absent from the crosstab; give them
    # the globally most frequent label instead of leaving a hole in the mapping.
    fallback_label = df['actual_class'].mode().iloc[0]
    all_cluster_ids = clusters_to_labels.index.union(pd.RangeIndex(num_clusters))
    clusters_to_labels = (clusters_to_labels
                          .reindex(all_cluster_ids, fill_value=fallback_label)
                          .rename_axis('cluster'))

    print("Cluster to label mapping: ")
    # iterate over the real cluster ids rather than positional counters
    for cluster_id, label in clusters_to_labels.items():
        print("Cluster {} <-> label {}".format(cluster_id, label))
    print("\n")

    return clusterer, clusters_to_labels

def pred_clustering_model(dtm_test, clusterer, clusters_to_labels):
    """Predict a label for every test vector through its assigned cluster.

    Parameters
    ----------
    dtm_test : iterable of vectors
        Test vectors; each one is passed to ``clusterer.classify``.
    clusterer : object with a ``classify(vector)`` method
        Fitted clusterer returning a cluster id per vector.
    clusters_to_labels : mapping indexed by cluster id
        Label associated with each cluster id (looked up with ``[]``).

    Returns
    -------
    list
        One predicted label per test vector, in input order.
    """
    # first pass: cluster id of every test vector
    assigned_clusters = list(map(clusterer.classify, dtm_test))

    # second pass: translate cluster ids into labels
    predicted_labels = []
    for cluster_id in assigned_clusters:
        predicted_labels.append(clusters_to_labels[cluster_id])
    return predicted_labels

In [9]:
# Vectorise both splits; the vocabulary is learned from the training text only.
dtm_train, dtm_test, word_to_idx, tfidf_vect = tfidf_vectorizer(train_text, test_text, min_df=3)
# Fit the SVD on the training matrix, then project the test matrix with the same fitted mapper.
dtm_train, transform_mapper = dimension_reduction(dtm_train, out_dim=200)
dtm_test = transform_mapper.transform(dtm_test)

print('dtm_train.shape', dtm_train.shape)
print('dtm_test.shape', dtm_test.shape)
# Show only a few vocabulary entries: printing the full word -> index mapping
# floods the cell output with tens of thousands of items.
print('vocabulary sample:', list(word_to_idx.items())[:10])

num of words: 27653
Dimension reduction with truncate SVD:
   input columns with  27653
   output columns with  200
dtm_train.shape (11083, 200)
dtm_test.shape (7761, 200)


In [None]:
# Fit 100 clusters on the reduced training vectors and derive the cluster -> label mapping.
# repeats=2 overrides the function default of 20 (fewer clustering trials).
# NOTE(review): the warning recorded in this cell's output comes from the cosine-distance
# division; presumably some document vectors are all-zero — confirm.
clusterer, clusters_to_labels = fit_clustering_model(dtm_train, train_label, num_clusters=100, repeats=2)

  return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


## prediction

In [None]:
# Predict a label for every reduced test vector through its assigned cluster.
pred = pred_clustering_model(dtm_test, clusterer, clusters_to_labels)

## evaluation

In [None]:
from sklearn import preprocessing
# Label-encoded variant kept for reference only; the active report below uses the raw labels directly.
# le = preprocessing.LabelEncoder()
# encoded_test_label = le.fit_transform(test_label)
# print(metrics.classification_report(y_true = encoded_test_label, y_pred=pred, target_names=le.classes_))
# Classification report of the cluster-derived predictions against the true test labels.
print(metrics.classification_report(y_true = test_label, y_pred=pred))