In [84]:
#DB
from questions.models import Solution, Cluster

# Helpers
import numpy as np
from tqdm import tqdm_notebook
import time
import pandas as pd

# Preprocessing
from analyzer import python_analyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vectorizer import NCutVectorizer

# Learning
from clustering import Clustering

## Data

In [91]:
## Cleaning database
last_id = 132
problems = Problem.objects.filter(id__gt=last_id)
solutions_obj = Solution.objects.filter(problem__in=problems).update(ignore=True)
print("Problems to be ignored: %d" % problems.count())

problems = Problem.objects.filter(id__lte=last_id)
# problems = Problem.objects.all()
print("Problems to be used: %d" % problems.count())

solutions_obj = Solution.objects.filter(problem__in=problems, ignore=False).order_by('id')
# solutions_obj = Solution.objects.all().order_by('id')
print("Solutions to be used: %d" % solutions_obj.count())

docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

print("Got %d documents" %(solutions_obj.count()))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


### Preprocessing solutions into bag of words ###

In [98]:
def test_integrity(doc):
    file = io.StringIO(doc)
    try:
        for token in tokenize.generate_tokens(file.readline):
            continue
    except Exception:
        return False
    return True
        
def create_bag_of_words(docs, vectorizer_method, binary=False, min_df=0.2):
    for idx, d in enumerate(docs):
        if not test_integrity(d):
            print("error on %d" % idx)
            
    print("integrity ok")
            
    vectorizer = vectorizer_method(analyzer = python_analyzer,
                                   binary=binary,
                                   min_df=min_df) 
    train_data_features = vectorizer.fit_transform(docs)
    try:
        train_data_features = train_data_features.toarray()
    # It's already an array
    except AttributeError:
        pass
    return train_data_features

In [120]:
# %load coherence.py
from itertools import permutations
import numpy as np

def calculate_umass_coherence(X, word_topic, clusters, k, N=5):
    k_coherence = []
    for idx_cluster in range(k):
        count_data = X.copy()
        count_data[np.where(count_data != 0)] = 1
        cluster_data = count_data[clusters == idx_cluster]

        # If there aren't any documents assigned to the cluster, skip it
        if cluster_data.shape[0] == 0:
            continue

        # Calculate cooccurence matrix
        cluster_data[np.where(cluster_data > 1)] = 1
        cooccurence = np.dot(cluster_data.T, cluster_data)

        # For each topic, get N top words
        idx = word_topic[:,idx_cluster].argsort()[::-1][:N]
        perms = permutations(idx, 2)
        k_score = []
#         import pdb; pdb.set_trace()
        for i,j in perms:
            if cooccurence[i,i] == 0:
                continue
            score = np.log((cooccurence[i,j]+0.01)/cooccurence[i,i])
            k_score.append(score)
        k_topic = np.mean(np.asarray(k_score))
        k_coherence.append(k_topic)
    return k_coherence, np.median(k_coherence), np.std(k_coherence)


In [121]:
def run_clusters(*args):
    # Import libraries
    # DB
    import psycopg2
    # Helpers
    import base64
    import pickle
    import time
    import numpy as np
    # Learning
    from clustering import Clustering
    # Evaluation
    from sklearn.metrics import silhouette_samples, silhouette_score
    from gap import Gap
#     from coherence import calculate_umass_coherence
    
    # Get arguments
    dataset, v, m, b, train_data_features, k, dist, method = args
    
    # Instanciate objects
    clustering = Clustering(train_data_features, k, metric=dist)
    gap = Gap(train_data_features, k, nrefs=20, distance=dist)
    
    # Cluster
    start = time.time()
    model, document_topic, word_topic = getattr(clustering, method)()
    clusters = document_topic.argmax(axis=1)
    end = time.time()
    clustering_time = end-start

    # Compute Gap
    start = time.time()
    k_gap = gap.calculate_gap(clustering, method)
#     time.sleep(10)
#     k_gap = [3,2]
    end = time.time()
    gap_time = end-start

    # Compute silhouette. Keep single to values to be able to plot it later
    start = time.time()
    k_silhouette = silhouette_score(train_data_features, clusters, metric=dist)
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(train_data_features, clusters, metric=dist)
    end = time.time()
    silhouette_time = end-start

    # UMass coherence
    start = time.time()
    k_coherence = calculate_umass_coherence(train_data_features, word_topic, clusters, k)
    end = time.time()
    coherence_time = end-start

    row = {
        "dataset": dataset,
        "X": train_data_features,
        "y": clusters,
        "vectorizer": v.__name__,
        "is_binary": b,
        "min_df": m,
        "distance": dist,
        "k": k,
        "method": method,
#         "model": base64.b64encode(pickle.dumps(model)),
        "clustering_time": clustering_time,
#         "gap": k_gap[0],
#         "gap_std": k_gap[1],
#         "gap_time": gap_time,
#         "silhouette": k_silhouette,
#         "silhouette_samples": sample_silhouette_values,
#         "silhouette_time": silhouette_time,
        "coherence_samples": k_coherence[0],
        "coherence_med": k_coherence[1],
        "coherence_std": k_coherence[2],
        "coherence_time": coherence_time,
        "coherence_k": len(k_coherence[0]),
    }
    
#     # Connect to DB
#     connection = psycopg2.connect(user = "machineteaching",
#                                   password = "***REMOVED***",
#                                   host = "localhost",
# #                                   port = "5432",
#                                   database = "machineteaching")
#     connection.autocommit=True
#     cursor = connection.cursor()

#     # Write PSQL query
#     insert_query_base = "INSERT INTO EXPERIMENTS "
#     column_value = []
#     insert_format = []
#     query_values = []
#     for col in row.keys():
#         if isinstance(row[col], np.ndarray):
#             query_values.append(row[col].tolist())
#         else:
#             query_values.append(row[col])
#         column_value.append(col)
#         insert_format.append("%s")

#     insert_query = insert_query_base + "(" + ", ".join(column_value) + ") VALUES "
#     insert_query += "(" + ", ".join(insert_format) + ")"
#     query_values = tuple(query_values)
#     query = cursor.mogrify(insert_query, query_values)
#     cursor.execute(query)
    
    return row

In [122]:
%%time

vectorizers = [
#     CountVectorizer,
    TfidfVectorizer, 
#     NCutVectorizer
]
# min_df = np.arange(0.05, 0.5, 0.05)
min_df = [0.05]
binary = [
#     True, 
    False
]
cluster_methods = [
    'nmf',
#     'lda',
]
#, 'hierarchical', 'gaussian_mixture', 'spectral_clustering']
metric = [
#     'euclidean', 
    'cosine', 
#     'correlation'
]

total = 0

# Grid search
for v in tqdm_notebook(vectorizers, desc="vectorizer", leave=False):
    for m in tqdm_notebook(min_df, desc="min_df", leave=False):

        for b in tqdm_notebook(binary, desc="binary", leave=False):
            train_data_features = create_bag_of_words(solutions, v, binary=b, min_df=m)

            # Remove rows containing only zeros (weird exercises)
            solution_sample = train_data_features[~(train_data_features==0).all(1)]
            if solution_sample.shape != train_data_features.shape:
                error = {
                    "vectorizer": v,
                    "min_df": m,
                    "binary": b
                }
                print("ERROR: %s" % error)

#             clusters = range(2, int(np.sqrt(min(train_data_features.shape)))+1)
            clusters = [3]
            for k in tqdm_notebook(clusters, desc="clusters", leave=False):
                for dist in tqdm_notebook(metric, desc="metric", leave=False):
                    for method in tqdm_notebook(cluster_methods, desc="method", leave=False):
                        # Sequences to be sent to map function
                        args = ['solution_all', v, m, b, train_data_features, 
                                k, dist, method]
                        total += 1
                        row = run_clusters(*args)

HBox(children=(IntProgress(value=0, description='vectorizer', max=1, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='min_df', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='binary', max=1, style=ProgressStyle(description_width='initia…

integrity ok


HBox(children=(IntProgress(value=0, description='clusters', max=1, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='metric', max=1, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='method', max=1, style=ProgressStyle(description_width='initia…

CPU times: user 2.42 s, sys: 7.73 s, total: 10.2 s
Wall time: 1.4 s


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  r = func(a, **kwargs)


In [123]:
row

{'dataset': 'solution_all',
 'X': array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.57220801,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]),
 'y': array([1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 0, 0, 2, 1, 0, 1, 0, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 2, 0, 2, 1]),
 'vectorizer': 'TfidfVectorizer',
 'is_binary': False,
 'min_df': 0.05,
 'distance': 'cosine',
 'k': 3,
 'method': 'nmf',
 'clustering_time': 0.02829909324645996,
 'coherence_samples'

In [96]:
total

6