In [1]:
# Parallel processing
from multiprocessing import Pool, cpu_count

#DB
from questions.models import Solution, Cluster

# Helpers
import numpy as np
#from tqdm import tqdm_notebook
import time
import pandas as pd

# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from vectorizer import NCutVectorizer
from tokenizer import create_bag_of_words

# Learning
from clustering import Clustering

## Data

In [2]:
## Cleaning database
last_id = 132
# problems = Problem.objects.filter(id__gt=last_id)
# # solutions_obj = Solution.objects.filter(problem__in=problems).update(ignore=True)
# print("Problems to be ignored: %d" % problems.count())

problems = Problem.objects.filter(id__lte=last_id)
# problems = Problem.objects.all()
print("Problems to be used: %d" % problems.count())

solutions_obj = Solution.objects.filter(problem__in=problems, ignore=False).order_by('id')
# solutions_obj = Solution.objects.all().order_by('id')
print("Solutions to be used: %d" % solutions_obj.count())

docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

print("Got %d documents" %(solutions_obj.count()))

Problems to be used: 132
Solutions to be used: 54
Got 54 documents


In [26]:
def run_clusters(*args):
    # Import libraries
    # DB
    import psycopg2
    # Helpers
    import base64
    import pickle
    import time
    import numpy as np
    # Learning
    from clustering import Clustering
    # Evaluation
    from sklearn.metrics import silhouette_samples, silhouette_score
#     from gap import Gap
    from coherence import calculate_umass_coherence
    round_results = []
    
    
    # Get arguments
    dataset, v, m, b, ng, train_data_features, k, dist, method = round_args
    round_results.append(round_args)

    # Instanciate objects
    clustering = Clustering(train_data_features, k, metric=dist)
#     gap = Gap(train_data_features, k, nrefs=20, distance=dist)

    # Cluster
    start = time.time()
    model, document_topic, word_topic = getattr(clustering, method)()
    clusters = document_topic.argmax(axis=1)
    end = time.time()
    clustering_time = end-start

    # Compute Gap
#     start = time.time()
#     k_gap = gap.calculate_gap(clustering, method)
#     end = time.time()
#     gap_time = end-start

#     # Compute silhouette. Keep single values to be able to plot it later
#     start = time.time()
#     try:
#         k_silhouette = silhouette_score(train_data_features, clusters, metric=dist)
#         # Compute the silhouette scores for each sample
#         sample_silhouette_values = silhouette_samples(train_data_features, clusters, metric=dist)
#     # Everything is assigned to one cluster
#     except ValueError:
#         k_silhouette = None
#         sample_silhouette_values = []
#     end = time.time()
#     silhouette_time = end-start

    # UMass coherence
    start = time.time()
    k_coherence5 = calculate_umass_coherence(train_data_features, word_topic, clusters, k, N=5)
    k_coherence10 = calculate_umass_coherence(train_data_features, word_topic, clusters, k, N=10)
    k_coherence15 = calculate_umass_coherence(train_data_features, word_topic, clusters, k, N=15)
    end = time.time()
    coherence_time = end-start

    row = {
        "dataset": dataset,
        "X": train_data_features,
        "y": clusters,
        "vectorizer": v.__name__,
        "is_binary": b,
        "min_df": m,
        "ngrams": list(ng),
        "token": train_data_features.shape[1],
        "distance": dist,
        "k": k,
        "method": method,
        "model": base64.b64encode(pickle.dumps(model)),
        "clustering_time": clustering_time,
#         "gap": k_gap[0],
#         "gap_std": k_gap[1],
#         "gap_time": gap_time,
#         "silhouette": k_silhouette,
#         "silhouette_samples": sample_silhouette_values,
#         "silhouette_time": silhouette_time,
        "coherence_samples5": k_coherence5[0],
        "coherence_samples10": k_coherence10[0],
        "coherence_samples15": k_coherence15[0],
        "coherence_med5": k_coherence5[1],
        "coherence_med10": k_coherence10[1],
        "coherence_med15": k_coherence15[1],
        "coherence_std5": k_coherence5[2],
        "coherence_std10": k_coherence10[2],
        "coherence_std15": k_coherence15[2],
        "coherence_time": coherence_time,
        "coherence_k5": len(k_coherence5[0]),
        "coherence_k10": len(k_coherence10[0]),
        "coherence_k15": len(k_coherence15[0]),
    }

        # Connect to DB
    #    connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
    #                                  password = settings.DATABASES["default"]["PASSWORD"],
    #                                  host = settings.DATABASES["default"]["HOST"],
    #                                  port = settings.DATABASES["default"]["PORT"],
    #                                  database = settings.DATABASES["default"]["NAME"])
    #    connection.autocommit=True
    #    cursor = connection.cursor()

    # Write PSQL query
    insert_query_base = "INSERT INTO EXPERIMENTS_2020_01_27 "
    column_value = []
    insert_format = []
    query_values = []
    for col in row.keys():
        if isinstance(row[col], np.ndarray):
            query_values.append(row[col].tolist())
        else:
            query_values.append(row[col])
        column_value.append(col)
        insert_format.append("%s")

    insert_query = insert_query_base + "(" + ", ".join(column_value) + ") VALUES "
    insert_query += "(" + ", ".join(insert_format) + ")"
    query_values = tuple(query_values)
    return insert_query, query_values

IndentationError: unexpected indent (<ipython-input-26-45933b1c903c>, line 30)

In [None]:
 # Insert to DB
    print("connecting")
    connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                              password = settings.DATABASES["default"]["PASSWORD"],
                              host = settings.DATABASES["default"]["HOST"],
                              port = settings.DATABASES["default"]["PORT"],
                              database = settings.DATABASES["default"]["NAME"])
    print("connected")
    connection.autocommit=True
    cursor = connection.cursor()
    
    print("inserting %d results" % len(round_results))
    start = time.time()
    for insert_query, query_values in round_results:
        query = cursor.mogrify(insert_query, query_values)
        cursor.execute(query)
    end = time.time()
    print("inserted in %d seconds" % (end-start))
    connection.close()
    return 0

In [4]:
%%time

vectorizers = [
    CountVectorizer,
    TfidfVectorizer, 
    NCutVectorizer
]
ngrams = [
    (1,1), # unigrams
    (1,2), # unigrams + bigrams
    (1,3), # unigrams + bigrams + trigrams
    (2,2), # bigrams
    (2,3), # bigrams + trigrams
    (3,3), # trigrams
]
min_df = np.arange(0.05, 0.51, 0.05)
binary = [
    True,
    False
]
cluster_methods = [
    'nmf',
    'lda'
]
#, 'hierarchical', 'gaussian_mixture', 'spectral_clustering']
metric = [
    'euclidean', 
#     'cosine', 
#     'correlation'
]

# Sequences to be sent to map function
skipped = []
args = []

# Grid search
# Add jobs to list
for dataset in range(10):
    #for v in tqdm_notebook(vectorizers, desc="vectorizer", leave=False):
    for v in vectorizers:
        #for m in tqdm_notebook(min_df, desc="min_df", leave=False):
        for m in min_df:
            #for b in tqdm_notebook(binary, desc="binary", leave=False):
            for b in binary:
                #for ng in tqdm_notebook(ngrams, desc="ngrams", leave=False):
                for ng in ngrams:
                    try:
                        train_data_features,_,_ = create_bag_of_words(solutions, v, binary=b, min_df=m, 
                                                                      vectorizer_params={'ngram_range': ng})
                    except ValueError:
                        skipped.append((v,m,b,ng))
                        continue

                    # Remove rows containing only zeros (weird exercises)
                    # As we're using less tokens, some rows may be excluded
                    solution_sample = train_data_features[~(train_data_features==0).all(1)]
#                    if solution_sample.shape != train_data_features.shape:
#                        error = {
#                            "vectorizer": v,
#                            "min_df": m,
#                            "binary": b
#                        }
#                        print("ERROR: %s" % error)
        #             clusters = range(2, int(np.sqrt(min(train_data_features.shape)))+1)
                    clusters = range(2, 16)
                    #for k in tqdm_notebook(clusters, desc="clusters", leave=False):
                    for k in clusters:
#                        for dist in tqdm_notebook(metric, desc="metric", leave=False):
                        dist="euclidean"
                        #for method in tqdm_notebook(cluster_methods, desc="method", leave=False):
                        for method in cluster_methods:
                            arg_map = ('solution_2020_01_24_%2d' % dataset,
                                        v,
                                        m,
                                        b,
                                        ng,
                                        solution_sample,
                                        k,
                                        dist,
                                        method
                                      )
                            args.append(arg_map)

CPU times: user 10min 18s, sys: 37.6 s, total: 10min 56s
Wall time: 1min 38s


In [5]:
len(args)

97440

In [6]:
len(skipped)

120

In [7]:
%%time
#chunksize = len(args)//cpu_count()
#chunks = [args[x:x+chunksize] for x in range(0, len(args), chunksize)]
chunks = [args[x:x+25] for x in range(0, 1000, 25)]
len(chunks)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 62.7 µs


40

In [None]:
%%time
# Send jobs to workers
## DOES NOT WORK. I THINK IT HAS SOMETHING TO DO WITH NMF OR LDA FROM SCIKIT-LEARN. MY GUESS IS THAT 
# THEY ALLOCATE OR THEY NEED PROCESSORS AND THEY ARE NOT AVAILABLE.
with Pool(10) as pool:
    result = pool.starmap(run_clusters, chunks, chunksize=1)

starting
starting
starting
starting
starting
starting
starting
starting
starting
starting
starting


In [36]:
result[0]

[('solution_2020_01_24_ 0',
  sklearn.feature_extraction.text.CountVectorizer,
  0.05,
  True,
  (1, 1),
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 1, 0],
         [0, 0, 0, ..., 0, 0, 0]]),
  2,
  'euclidean',
  'nmf'),
 ('solution_2020_01_24_ 0',
  sklearn.feature_extraction.text.CountVectorizer,
  0.05,
  True,
  (1, 1),
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 1, 0],
         [0, 0, 0, ..., 0, 0, 0]]),
  2,
  'euclidean',
  'lda'),
 ('solution_2020_01_24_ 0',
  sklearn.feature_extraction.text.CountVectorizer,
  0.05,
  True,
  (1, 1),
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 1, 0],
 

In [33]:
result[1]

[('solution_2020_01_24_ 0',
  sklearn.feature_extraction.text.CountVectorizer,
  0.05,
  True,
  (1, 1),
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 1, 0],
         [0, 0, 0, ..., 0, 0, 0]]),
  14,
  'euclidean',
  'lda'),
 ('solution_2020_01_24_ 0',
  sklearn.feature_extraction.text.CountVectorizer,
  0.05,
  True,
  (1, 1),
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 1, 0],
         [0, 0, 0, ..., 0, 0, 0]]),
  15,
  'euclidean',
  'nmf'),
 ('solution_2020_01_24_ 0',
  sklearn.feature_extraction.text.CountVectorizer,
  0.05,
  True,
  (1, 1),
  array([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 1, 0],

In [5]:
len(args)

16800