In [1]:
import os
import re
import gc
import random
from copy import deepcopy

import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

from nltk.stem.wordnet import WordNetLemmatizer
from MulticoreTSNE import MulticoreTSNE as TSNE



In [2]:
# Silence every Python warning from this point on.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above -- confirm that blanket suppression is intended.
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Fetch the WordNet corpus through the NLTK downloader; presumably required by
# the WordNetLemmatizer used in tokenize() -- verify.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/vagrant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Read the corpus from a CSV in the working directory; column names come from
# the file's first row (header=0).
all_documents = pd.read_csv('all_documents.csv', header=0)
# Preview the first rows as the cell's output.
all_documents.head()

Unnamed: 0,group,text
0,alt.atheism,From: mathew mathew@mantis.co.uk Subject: Alt....
1,alt.atheism,From: mathew mathew@mantis.co.uk Subject: Alt....
2,alt.atheism,From: I3150101@dbstu1.rz.tubs.de Benedikt Rose...
3,alt.atheism,From: mathew mathew@mantis.co.uk Subject: Re: ...
4,alt.atheism,From: strom@Watson.Ibm.Com Rob Strom Subject: ...


In [5]:
def tokenize(text):
    """Split ``text`` into lowercase, lemmatized word tokens.

    A token is a maximal run of the characters ``a``-``z`` and apostrophe in
    the lowercased text. Runs shorter than two characters are discarded and
    every remaining token is passed through a ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in order of appearance.
    """
    tokens = re.findall('[a-z\']+', text.lower())
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token) for token in tokens if len(token) >= 2]

def get_colors(num_colors):
    """Return ``num_colors`` RGBA colors sampled from matplotlib's ``tab20`` map.

    The colormap is sampled at the midpoints of ``num_colors`` equal-width
    bins over [0, 1). Sampling at ``i / num_colors`` for ``i = 1..num_colors``
    (the previous behavior) never used the first palette entry and hit the
    value 1.0, which the colormap maps to its last entry -- so with 20 labels
    the last two labels received the same color.

    Parameters
    ----------
    num_colors : int
        Number of colors requested. ``tab20`` holds 20 entries, so colors
        necessarily repeat when more than 20 are requested.

    Returns
    -------
    list
        ``num_colors`` RGBA tuples (empty when ``num_colors`` is 0).
    """
    cm = plt.get_cmap('tab20')
    # Midpoint sampling keeps each sample strictly inside its bin, so float
    # rounding cannot push it into a neighbouring palette entry.
    return [cm((i + 0.5) / num_colors) for i in range(num_colors)]
    
def plot_color_dots(tsne_result, labels):
    """Draw a scatter plot of 2-D points, one color per distinct label.

    Parameters
    ----------
    tsne_result : array
        Indexed as ``tsne_result[:, 0]`` / ``tsne_result[:, 1]`` for the x and
        y coordinates of each point.
    labels : pandas.Series
        One label per point; must support ``.unique()`` and ``.map()``.
    """
    distinct = labels.unique()
    palette = get_colors(len(distinct))
    label_to_color = {label: color for label, color in zip(distinct, palette)}
    # Per-point colors, in the same order as the points themselves.
    point_colors = labels.map(label_to_color).values

    plt.figure(figsize=(15, 10))
    plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=point_colors)


def encode_groups(df):
    """Replace the values of ``df['group']`` with integer codes.

    Codes are assigned in order of first appearance (0 for the first distinct
    group, 1 for the second, ...). The frame is modified in place and the same
    object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'group'`` column.

    Returns
    -------
    pandas.DataFrame
        The very same ``df``, with ``'group'`` overwritten by its codes.
    """
    distinct_groups = df['group'].unique()
    codes = dict(zip(distinct_groups, range(len(distinct_groups))))
    df['group'] = df['group'].map(codes)
    return df


def plot_through_tsne(vectors, labels):
    """Embed ``vectors`` with t-SNE and plot the embedding colored by label.

    Parameters
    ----------
    vectors : array-like
        Input handed to ``TSNE.fit_transform``.
    labels : pandas.Series
        Per-point labels forwarded to ``plot_color_dots``.
    """
    # Two worker threads, as configured for the rest of the notebook.
    embedding = TSNE(n_jobs=2).fit_transform(vectors)
    plot_color_dots(embedding, labels)

    
def plot_contingency_matrix(y_true, y_pred):
    """Plot the contingency matrix of true classes versus predicted clusters.

    Each cell shows how many samples of a true class (row) were assigned to a
    predicted cluster (column). Tick labels are derived from the distinct
    values actually present in ``y_true`` / ``y_pred`` rather than a fixed
    ``range(20)``, so the plot stays correct when the number of classes or
    clusters differs from 20.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class label per sample.
    y_pred : array-like
        Predicted cluster label per sample.
    """
    cm = metrics.cluster.contingency_matrix(y_true, y_pred)
    # contingency_matrix orders rows/columns by the sorted unique labels,
    # which is exactly what np.unique returns.
    row_labels = np.unique(y_true)
    col_labels = np.unique(y_pred)

    fig, ax = plt.subplots(figsize=(18, 14))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=col_labels, yticklabels=row_labels,
           ylabel='True label',
           xlabel='Predicted label')

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Cells hold integer counts; print them without spurious decimals.
    fmt = 'd'
    # Switch to white text on the dark half of the color scale for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    plt.show()

def hcv_report(y_true, y_pred):
    """Print a five-line clustering quality report.

    Lines, in order: adjusted mutual information (AMI), adjusted Rand index
    (ARI), homogeneity, completeness and V-measure. Each label is padded to 20
    characters and each score is rounded to three decimals.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class label per sample.
    y_pred : array-like
        Predicted cluster label per sample.
    """
    homogeneity, completeness, v_measure = (
        metrics.homogeneity_completeness_v_measure(y_true, y_pred)
    )
    rows = [
        ('AMI:', metrics.adjusted_mutual_info_score(y_true, y_pred)),
        ('ARI:', metrics.adjusted_rand_score(y_true, y_pred)),
        # Label spelling corrected (was "Homogenity:").
        ('Homogeneity:', homogeneity),
        ('Completeness:', completeness),
        ('V-measure:', v_measure),
    ]
    lines = [label.ljust(20) + str(round(score, 3)) for label, score in rows]
    print(*lines, sep='\n')

In [6]:
def wv_doc_2_vec(list_of_words, word2vec):
    """Build a document vector by averaging the word vectors of its tokens.

    Words missing from the model's vocabulary are skipped -- including the
    first one, which previously raised ``KeyError`` because its lookup was not
    guarded. The sum of the found vectors is divided by
    ``len(list_of_words)``, i.e. skipped words still count toward the
    denominator, exactly as before.

    Parameters
    ----------
    list_of_words : sequence of str
        Tokens of one document.
    word2vec : object
        Model exposing ``.wv[word]`` (raising ``KeyError`` for unknown words)
        and ``.wv.vector_size``.

    Returns
    -------
    numpy.ndarray
        The averaged vector. A zero vector of length ``wv.vector_size`` is
        returned when the document is empty or none of its words is known.
    """
    total = None
    for word in list_of_words:
        try:
            vector = word2vec.wv[word]
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the sum.
            continue
        if total is None:
            # Copy so the in-place additions below never touch the model.
            total = np.array(vector, copy=True)
        else:
            total += vector

    if total is None:
        # gensim stores its vectors as float32; keep the fallback consistent.
        return np.zeros(word2vec.wv.vector_size, dtype=np.float32)
    return total / len(list_of_words)


def distance(v1, v2):
    """Return the Euclidean distance between ``v1`` and ``v2``.

    Both arguments must support element-wise subtraction and multiplication
    and the result must expose ``.sum()`` (e.g. numpy arrays of equal shape).
    """
    delta = v1 - v2
    squared_length = (delta ** 2).sum()
    return squared_length ** 0.5

In [7]:
def get_all_combinations(grid):
    """Expand a parameter grid into every concrete parameter assignment.

    Keys are processed in sorted order, the first key varying slowest. Each
    returned dict holds exactly one value per key of ``grid``.

    Parameters
    ----------
    grid : dict
        Maps a parameter name to the list of values to try.

    Returns
    -------
    list of dict
        All combinations; ``[{}]`` for an empty grid and ``[]`` when any key
        has no candidate values.
    """
    combinations = [{}]
    for key in sorted(grid.keys()):
        # Extend every partial assignment with each candidate for this key.
        combinations = [
            {**partial, key: value}
            for partial in combinations
            for value in grid[key]
        ]
    return combinations
        

def grid_search(X, y, estimator, grid, scorer=metrics.adjusted_mutual_info_score):
    """Exhaustively search ``grid`` and return the best-scoring fitted estimator.

    For every parameter combination produced by ``get_all_combinations`` the
    estimator is reconfigured, fitted with ``fit_predict(X)`` and scored
    against ``y``. Progress is printed along the way.

    Parameters
    ----------
    X : array-like
        Samples handed to ``estimator.fit_predict``.
    y : array-like
        Ground-truth label per sample.
    estimator : object
        Must provide ``set_params(**params)`` and ``fit_predict(X)``. It is
        reconfigured and refitted in place on every iteration.
    grid : dict
        Maps a parameter name to the list of values to try.
    scorer : callable, optional
        Called as ``scorer(labels_true, labels_pred)``; higher is better.

    Returns
    -------
    object
        A deep copy of the estimator as fitted with the best-scoring
        parameters (the earliest one wins on ties).

    Raises
    ------
    ValueError
        If ``grid`` yields no parameter combination at all.
    """
    param_sets = get_all_combinations(grid)
    if not param_sets:
        raise ValueError('grid produced no parameter combinations to evaluate')

    best_score = None
    best_estimator = None
    for param_set in param_sets:
        print('Scoring with params {} ...'.format(param_set))
        estimator.set_params(**param_set)
        predicted = estimator.fit_predict(X)
        # sklearn clustering metrics take (labels_true, labels_pred); the
        # order matters for asymmetric scorers such as homogeneity_score.
        score = scorer(y, predicted)
        print('Score is {}'.format(round(score, 3)))
        if best_score is None:
            best_score, best_estimator = score, deepcopy(estimator)
        elif best_score < score:
            print('Updating best score since score of this model {} is greater than previous best score {}'.format(round(score, 2), round(best_score, 2)))
            # Snapshot now: the next iteration refits `estimator` in place.
            best_score, best_estimator = score, deepcopy(estimator)
    return best_estimator

In [10]:
class Doc2VecTransformer:
    """Minimal fit/transform wrapper around gensim's ``Doc2Vec``.

    The hyper-parameters are kept in ``self.params`` and passed to ``Doc2Vec``
    when ``fit`` is called; the trained model is stored on ``self.doc2vec``.
    """

    def __init__(self, vector_size=100, window=10, epochs=20, dm=1):
        """Store the Doc2Vec hyper-parameters (``workers`` is fixed at 2)."""
        self.params = {
            'vector_size': vector_size,
            'window': window,
            'epochs': epochs,
            'dm': dm,
            'workers': 2
        }

    def set_params(self, **kwargs):
        """Update the stored hyper-parameters; returns ``self`` for chaining."""
        self.params.update(kwargs)
        return self

    def transform(self, X):
        """Return the vectors learned during ``fit`` for documents ``0..len(X)-1``.

        Only the length of ``X`` is used: vectors are looked up by the integer
        tags assigned in ``fit``, so ``X`` is expected to be the same
        collection (in the same order) that the model was fitted on.

        Returns
        -------
        numpy.ndarray
            One row per document.
        """
        # Use the model trained by fit() and the argument's own length; the
        # previous code read undefined globals (`doc2vec`, `tokenized_texts`).
        doc_vectors = [self.doc2vec.docvecs[i] for i in range(len(X))]
        return np.array(doc_vectors)

    def fit(self, X, *args, **kwargs):
        """Train a ``Doc2Vec`` model on ``X`` and return ``self``.

        Each document is tagged with its position in ``X``. Extra positional
        and keyword arguments are accepted and ignored.

        NOTE(review): gensim's ``TaggedDocument`` expects each document as a
        list of tokens -- callers should pass tokenized texts, not raw strings.
        """
        texts = [TaggedDocument(text, [i]) for i, text in enumerate(X)]
        self.doc2vec = Doc2Vec(texts, **self.params)
        return self

In [11]:
# Doc2Vec needs every document as a list of tokens. Feeding it raw strings
# makes gensim treat each character as a separate "word", so tokenize first.
tokenized_texts = all_documents['text'].apply(tokenize)

doc2wec = Doc2VecTransformer()
doc2wec.fit(tokenized_texts)
# One learned vector per document, in the same order as `all_documents`.
documents = doc2wec.transform(tokenized_texts)

KeyboardInterrupt: 