In [1]:
#DB
import os

# Helpers
import numpy as np
# import qgrid
from tqdm import tqdm_notebook
import time
import pandas as pd

# Preprocessing
from analyzer import python_analyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from vectorizer import NCutVectorizer

# Plots
import matplotlib.pyplot as plt

%matplotlib notebook

## Data

In [5]:
def get_vocab(base_path="bbc"):
    """Load the vocabulary terms shipped with the BBC dataset.

    Reads ``<base_path>/bbc.terms`` and splits it into one term per line.

    Parameters
    ----------
    base_path : str, optional
        Directory that contains ``bbc.terms``. Defaults to ``"bbc"``.

    Returns
    -------
    list of str
        The terms in file order, without empty entries.
    """
    with open(os.path.join(base_path, "bbc.terms"), "r") as txtfile:
        # A file that ends with a newline would otherwise contribute a trailing
        # '' entry, i.e. an empty "term" in the vocabulary; drop blank entries.
        vocab = [term for term in txtfile.read().split('\n') if term]
    return vocab
    
def get_docs(topics, base_path="bbc"):
    """Read every document of the given topics into memory.

    Each topic is a sub-directory of ``base_path``; every file inside it is
    read as one document.

    Parameters
    ----------
    topics : iterable of str
        Topic (sub-directory) names, processed in the given order.
    base_path : str, optional
        Root directory of the dataset. Defaults to ``"bbc"``.

    Returns
    -------
    list of str
        Document contents, grouped by topic and sorted by file name within
        each topic. Files that cannot be decoded as UTF-8 are skipped and
        reported on stdout.
    """
    docs = []
    for t in topics:
        topic_dir = os.path.join(base_path, t)
        # os.listdir() returns entries in arbitrary order; sort them so the
        # row order of the resulting feature matrix is reproducible.
        for item in sorted(os.listdir(topic_dir)):
            # Pin the encoding so decoding does not depend on the platform default.
            with open(os.path.join(topic_dir, item), 'r', encoding='utf-8') as txtfile:
                try:
                    docs.append(txtfile.read())
                except UnicodeDecodeError:
                    # Best effort: report the undecodable file and carry on.
                    print("Error on doc %s/%s" % (t,item))
    return docs

In [7]:
# %load vectorizer.py
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from analyzer import python_analyzer
import numpy as np


class NCutVectorizer(object):
    """Count vectorizer whose term columns are rescaled by an NCut-style weight.

    Wraps a ``CountVectorizer`` and exposes the same ``fit_transform`` entry
    point, but returns a dense, re-weighted document-term matrix.

    Parameters
    ----------
    analyzer, binary, min_df, vocabulary
        Forwarded unchanged to ``CountVectorizer``.
    """

    def __init__(self, analyzer, binary, min_df, vocabulary):
        # Forward `analyzer` instead of hard-coding 'word', so the argument
        # callers pass is actually honored.
        self.vectorizer = CountVectorizer(analyzer=analyzer,
                                          binary=binary,
                                          min_df=min_df,
                                          vocabulary=vocabulary)

    def fit_transform(self, docs):
        """Vectorize ``docs`` and scale every term column by its NCut weight.

        Parameters
        ----------
        docs : iterable of str
            Raw documents handed to the wrapped ``CountVectorizer``.

        Returns
        -------
        numpy.ndarray
            Dense array with one row per document and one column per term,
            where column ``j`` of the count matrix is multiplied by
            ``sum(S[j, :]) ** -0.5``.
        """
        counts = self.vectorizer.fit_transform(docs).toarray()

        # Calculate NCut-weight
        doc_mat_norm = normalize(counts)
        # Term-by-term similarity; the 0.001 offset keeps every row sum
        # strictly positive so the inverse square root below is finite.
        S = np.dot(doc_mat_norm.T, doc_mat_norm) + 0.001
        term_weight = np.power(np.sum(S, axis=1), -0.5)
        # Broadcasting over columns gives the same values as multiplying by
        # diag(term_weight), without building a dense n_terms x n_terms matrix.
        return counts * term_weight


### Preprocessing documents into bag of words ###

In [8]:
def create_bag_of_words(docs, vectorizer_method, binary=False, min_df=0.2):
    """Turn raw documents into a dense bag-of-words feature matrix.

    Parameters
    ----------
    docs : iterable of str
        Documents to vectorize.
    vectorizer_method : callable
        Vectorizer class/factory accepting ``analyzer``, ``binary``,
        ``min_df`` and ``vocabulary`` keyword arguments and returning an
        object with ``fit_transform``.
    binary : bool, optional
        Forwarded to the vectorizer.
    min_df : float, optional
        Forwarded to the vectorizer.
        NOTE(review): scikit-learn documents ``min_df`` as ignored when a
        fixed vocabulary is supplied, as it is here - confirm the intent.

    Returns
    -------
    The result of ``fit_transform``, converted with ``toarray()`` when the
    result provides that method.
    """
    # The word-level analyzer is used; `python_analyzer` was the earlier choice.
    vectorizer_kwargs = {
        "analyzer": 'word',
        "binary": binary,
        "min_df": min_df,
        "vocabulary": get_vocab(),
    }
    features = vectorizer_method(**vectorizer_kwargs).fit_transform(docs)
    try:
        dense = features.toarray()
    except AttributeError:
        # It's already an array
        return features
    return dense

In [9]:
def run_clusters(*args):
    """Cluster one grid-search configuration, score it and store the result.

    Parameters
    ----------
    *args
        Exactly eight positional values, in this order:
        ``dataset`` (label stored with the row), ``v`` (vectorizer class, only
        its ``__name__`` is stored), ``m`` (min_df value), ``b`` (binary flag),
        ``train_data_features`` (feature matrix to cluster), ``k`` (number of
        clusters), ``dist`` (distance metric name) and ``method`` (name of the
        ``Clustering`` method to call).

    Returns
    -------
    dict
        The row that was inserted into the ``EXPERIMENTS`` table: the inputs,
        the cluster assignment, the pickled model and the gap, silhouette and
        UMass-coherence scores together with their wall-clock timings.

    Raises
    ------
    ValueError
        If ``args`` does not contain exactly eight values.
    psycopg2.Error
        If connecting to the database or inserting the row fails.
    """
    # NOTE(review): imports are function-local, presumably so the function is
    # self-contained when dispatched to parallel workers - confirm before hoisting.
    # DB
    import psycopg2
    # Helpers
    import base64
    import pickle
    import time
    import numpy as np
    # Learning
    from clustering import Clustering
    # Evaluation
    from sklearn.metrics import silhouette_samples, silhouette_score
    from gap import Gap
    from coherence import calculate_umass_coherence

    # Get arguments
    dataset, v, m, b, train_data_features, k, dist, method = args

    # Instantiate objects
    clustering = Clustering(train_data_features, k, metric=dist)
    gap = Gap(train_data_features, k, nrefs=20, distance=dist)

    # Cluster: `method` names the Clustering method to run.
    start = time.time()
    model, document_topic, word_topic = getattr(clustering, method)()
    clusters = document_topic.argmax(axis=1)
    end = time.time()
    clustering_time = end-start

    # Compute Gap
    start = time.time()
    k_gap = gap.calculate_gap(clustering, method)
    end = time.time()
    gap_time = end-start

    # Compute silhouette. Keep the per-sample values as well so the
    # silhouette can be plotted later.
    start = time.time()
    k_silhouette = silhouette_score(train_data_features, clusters, metric=dist)
    sample_silhouette_values = silhouette_samples(train_data_features, clusters, metric=dist)
    end = time.time()
    silhouette_time = end-start

    # UMass coherence
    start = time.time()
    k_coherence = calculate_umass_coherence(train_data_features, word_topic, clusters, k)
    end = time.time()
    coherence_time = end-start

    row = {
        "dataset": dataset,
        "X": train_data_features,
        "y": clusters,
        "vectorizer": v.__name__,
        "is_binary": b,
        "min_df": m,
        "distance": dist,
        "k": k,
        "method": method,
        "model": base64.b64encode(pickle.dumps(model)),
        "clustering_time": clustering_time,
        "gap": k_gap[0],
        "gap_std": k_gap[1],
        "gap_time": gap_time,
        "silhouette": k_silhouette,
        "silhouette_samples": sample_silhouette_values,
        "silhouette_time": silhouette_time,
        "coherence_samples": k_coherence[0],
        "coherence_med": k_coherence[1],
        "coherence_std": k_coherence[2],
        "coherence_time": coherence_time,
        "coherence_k": len(k_coherence[0]),
    }

    # Build the INSERT statement. Column names come from the fixed keys of
    # `row` above; all values are passed as bound parameters.
    columns = list(row)
    query_values = tuple(
        row[col].tolist() if isinstance(row[col], np.ndarray) else row[col]
        for col in columns
    )
    insert_query = "INSERT INTO EXPERIMENTS ({}) VALUES ({})".format(
        ", ".join(columns), ", ".join(["%s"] * len(columns)))

    # Connect to DB. This function runs once per grid point, so the
    # connection and cursor must be released on every path; otherwise each
    # call leaks a server connection.
    connection = psycopg2.connect(user = "machineteaching",
                                  password = "",
                                  host = "localhost",
                                  database = "machineteaching")
    try:
        connection.autocommit = True
        cursor = connection.cursor()
        try:
            cursor.execute(insert_query, query_values)
        finally:
            cursor.close()
    finally:
        connection.close()

    return row

In [11]:
%%time

# Grid-search driver: for every dataset (subset of BBC topics) and every
# combination of vectorizer, min_df, binary flag, number of clusters, distance
# metric and clustering method, run_clusters() is called once. Its return
# value is discarded here.

# Vectorizer classes handed to create_bag_of_words().
vectorizers = [
    CountVectorizer,
    TfidfVectorizer, 
    NCutVectorizer
]
# Nine candidate values: 0.05, 0.10, ..., 0.45.
# NOTE(review): create_bag_of_words() always passes a fixed vocabulary, and
# scikit-learn documents min_df as ignored in that case - confirm that this
# dimension of the grid actually changes the features.
min_df = np.arange(0.05, 0.5, 0.05)
# min_df = [0.05]
binary = [
    True, 
    False
]
# Names of Clustering methods; run_clusters() resolves them with getattr().
cluster_methods = ['nmf']#, 'lda']
#, 'hierarchical', 'gaussian_mixture', 'spectral_clustering']
# Distance metrics forwarded to Clustering, Gap and the silhouette functions.
metric = [
    'euclidean', 
    'cosine', 
    'correlation'
]

# Dataset label -> list of topic directories that make up that dataset.
topics = {}
topics["bbc5_vocab"] = ["business", "entertainment", "politics", "sport", "tech"]
topics["bbc4_vocab"] = ["business", "politics", "sport", "tech"]
topics["bbc3_vocab"] = ["politics", "sport", "tech"]
topics["bbc2_vocab"] = ["politics", "tech"]

# Test how the metrics behave for several groups of topics
for dataset, topic_list in tqdm_notebook(topics.items(), desc="docs"):
    docs = get_docs(topic_list)
    
    # Grid search
    for v in tqdm_notebook(vectorizers, desc="vectorizer", leave=False):
        for m in tqdm_notebook(min_df, desc="min_df", leave=False):
            
            for b in tqdm_notebook(binary, desc="binary", leave=False):
                train_data_features = create_bag_of_words(docs, v, binary=b, min_df=m)

                # Detect rows containing only zeros and report the configuration.
                # NOTE(review): solution_sample is only used for this shape
                # comparison; the unfiltered train_data_features is what gets
                # clustered below, so all-zero rows are reported but not
                # removed - confirm this is intended.
                solution_sample = train_data_features[~(train_data_features==0).all(1)]
                if solution_sample.shape != train_data_features.shape:
                    error = {
                        "vectorizer": v,
                        "min_df": m,
                        "binary": b
                    }
                    print("ERROR: %s" % error)

                # Candidate k: 2 .. floor(sqrt(min(n_rows, n_columns))), inclusive.
                clusters = range(2, int(np.sqrt(min(train_data_features.shape)))+1)
                for k in tqdm_notebook(clusters, desc="clusters", leave=False):
                    for dist in tqdm_notebook(metric, desc="metric", leave=False):
                        for method in tqdm_notebook(cluster_methods, desc="method", leave=False):
                            # Positional arguments in the order run_clusters() unpacks them
                            args = [dataset, v, m, b, train_data_features, 
                                    k, dist, method]
                            run_clusters(*args)
#                             break
#                         break
#                     break
#                 break
#             result = lbview.map_async(run_clusters, *args)
#             start = time.time()
#             jobs = 0
#             N = len(result)
#             while(not result.ready()):
#                 while result.progress == jobs:
#                     time.sleep(1)
#                     elapsed = time.time()
#                     print('\r', '%d/%d tasks finished after %d s' % (result.progress, N, (elapsed-start)), end='')
#                 os.system("echo %d/%d tasks finished after %d s >> log.txt" % (result.progress, N, (elapsed-start)))
#                 jobs += 1
#             speedup = "Speedup: %.2f x" % (1.0 * result.serial_time / result.wall_time)
#             os.system("echo %s >> log.txt" % speedup)

HBox(children=(IntProgress(value=0, description='docs', max=4, style=ProgressStyle(description_width='initial'…

Error on doc sport/199.txt


HBox(children=(IntProgress(value=0, description='vectorizer', max=3, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='min_df', max=9, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='binary', max=2, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='clusters', max=46, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='metric', max=3, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='method', max=1, style=ProgressStyle(description_width='initia…

OperationalError: SSL SYSCALL error: EOF detected


In [11]:
# NOTE(review): `dataset_map` is not assigned in any cell visible in this
# notebook; unless it is defined elsewhere, this cell depends on leftover
# kernel state and raises NameError after Restart & Run All - confirm.
len(dataset_map)

14580