# Experiment 26

- Min DF: 0.05
- Binary: True
- Vectorizer: Count
- Method: LDA
- Best k: 12

In [16]:
# Input
# from db import PythonProblems
import io
import os
from getpass import getpass

# DB
from questions.models import Problem, Solution, Cluster
import psycopg2

# Helpers
import numpy as np
import pickle
import base64

# Preprocessing
from tokenizer import create_bag_of_words
from sklearn.feature_extraction.text import CountVectorizer

# Learning
from clustering import Clustering
from analyzer import python_analyzer

In [18]:
## Cleaning database
# Problems with an id above this cutoff are excluded from the experiment.
last_id = 132

# Flag every solution of an excluded problem as ignored (single bulk UPDATE).
ignored_problems = Problem.objects.filter(id__gt=last_id)
Solution.objects.filter(problem__in=ignored_problems).update(ignore=True)
print("Problems to be ignored: %d" % ignored_problems.count())

# To use every problem instead, replace this filter with Problem.objects.all().
problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related pulls each solution's problem in the same query, so reading
# sol.problem.content below does not trigger one extra query per solution.
solutions_obj = (
    Solution.objects
    .filter(problem__in=problems, ignore=False)
    .select_related('problem')
    .order_by('id')
)
print("Solutions to be used: %d" % solutions_obj.count())

# Parallel lists: position i in each list refers to the same solution.
docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

# Count what was actually collected rather than querying the database again.
print("Got %d documents" % len(docs_id))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


# Connect to DB

In [4]:
# The password is read from the environment so it never ends up in the saved
# notebook; when the variable is missing, ask for it interactively.
db_password = os.environ.get("MACHINETEACHING_DB_PASSWORD")
if db_password is None:
    db_password = getpass("Password for database user 'machineteaching': ")

connection = psycopg2.connect(
    user="machineteaching",
    password=db_password,
    host="localhost",
    # port="5432",  # left unset, as in the original cell
    database="machineteaching",
)
# With autocommit on, no explicit connection.commit() is needed after a statement.
connection.autocommit = True
cursor = connection.cursor()

In [5]:
def get_where_items(exp_id):
    """Fetch the stored configuration rows of one experiment.

    Runs a SELECT on ``experiments_solution`` through the module-level
    ``cursor``.

    Args:
        exp_id: Value matched against the ``experiment_id`` column.

    Returns:
        Whatever ``cursor.fetchall()`` returns for the query. Each row holds,
        in order: vectorizer, min_df, is_binary, distance, method, dataset,
        k, model, X.
    """
    cols = ["vectorizer", "min_df", "is_binary", "distance", "method", "dataset", "k", "model", "X"]
    # The column names are fixed constants, so joining them into the statement
    # is safe. The experiment id is handed to the driver as a bound parameter
    # (the escaped %%s placeholder) instead of being formatted into the SQL.
    query = "SELECT %s from experiments_solution where experiment_id = %%s" % ", ".join(cols)
    cursor.execute(query, (exp_id,))
    return cursor.fetchall()

def analyze(solutions, where_items, exp_id):
    """Re-run a stored experiment's clustering on ``solutions``.

    Only the first row of ``where_items`` is read. Its columns are expected in
    the order selected by ``get_where_items``: vectorizer, min_df, is_binary,
    distance, method, dataset, k, model, X.

    Args:
        solutions: Documents handed to ``create_bag_of_words``.
        where_items: Sequence of experiment rows; ``where_items[0]`` is used.
        exp_id: Not used by this function; kept so existing calls keep working.

    Returns:
        The second value returned by the clustering method named in the row
        (bound to ``document_topic``).

    Raises:
        ValueError: If the stored vectorizer name is not a plain identifier.
        NameError: If no object with that name exists in this module's globals.
    """
    row = where_items[0]

    # The vectorizer is stored by name. It is resolved with a plain lookup in
    # this module's globals rather than eval(), so a tampered database value
    # cannot run arbitrary code.
    vectorizer_name = row[0]
    if not (isinstance(vectorizer_name, str) and vectorizer_name.isidentifier()):
        raise ValueError("Invalid vectorizer name: %r" % (vectorizer_name,))
    try:
        vectorizer_cls = globals()[vectorizer_name]
    except KeyError:
        # Same exception type eval() raised for a name that was never imported.
        raise NameError("name %r is not defined" % vectorizer_name)

    min_df = row[1]
    binary = row[2]
    dist = row[3]
    method = row[4]
    k = row[6]

    # SECURITY NOTE(review): pickle.loads executes code embedded in the blob.
    # This is only acceptable while experiments_solution.model is written
    # exclusively by trusted code. Only random_state is read from the model.
    model_db = pickle.loads(base64.b64decode(row[7]))

    train_data_features, _vectorizer, _feature_names = create_bag_of_words(
        solutions, vectorizer_cls, binary=binary, min_df=min_df
    )
    clustering = Clustering(train_data_features, k, metric=dist)
    # Reuse the stored model's seed (presumably so the run reproduces the
    # stored experiment - depends on how Clustering uses `seed`).
    clustering.seed = model_db.random_state

    # `method` names the Clustering method to call (e.g. 'lda').
    _model, document_topic, _word_topic = getattr(clustering, method)()

    return document_topic

# Updating DB

In [27]:
# Get experiment conditions
exp_id = 26
where_items = get_where_items(exp_id)
# Fail with a clear message for an unknown experiment id, rather than an
# IndexError on where_items[0] below.
if not where_items:
    raise ValueError("No rows found in experiments_solution for experiment %d" % exp_id)
print("Conditions")
# First seven columns only: the last two (model, X) are stored blobs/data.
print(where_items[0][0:7])

document_topic = analyze(solutions, where_items, exp_id)
# Index of the largest value in each row (presumably one row per solution,
# one column per topic), i.e. a 0-based cluster index per document.
document_clusters = document_topic.argmax(axis=1)

Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'solution_all', 12)




### Create clusters

In [29]:
# Cluster id -> label for the clusters that get a row in the database.
clusters_def = {
    4: "String manipulation",
    6: "One-liners",
    8: "Conditional",
    10: "Array manipulation",
    12: "Loops and nested loops"
}

# Save one Cluster per entry, with the dict key as its primary key.
for cluster_id, cluster_label in clusters_def.items():
    Cluster(id=cluster_id, label=cluster_label).save()

### Assign solutions to clusters

In [33]:
# Clear all clusters
# One bulk UPDATE instead of loading and saving every assigned solution, the
# same way the `ignore` flag is set in the cleaning cell.
# Note: QuerySet.update() does not call Solution.save() or emit save signals.
Solution.objects.filter(cluster__isnull=False).update(cluster=None)

In [37]:
# Clusters without a label of their own that are folded into a labelled one
# (source cluster number -> target cluster id).
clusters_merge = {
    2: 4
}

# Cluster rows fetched so far, keyed by id, so each one is queried only once.
clusters_by_id = {}

for idx, doc_id in enumerate(docs_id):
    # document_clusters holds 0-based argmax indices; cluster ids are 1-based.
    cluster_number = int(document_clusters[idx]) + 1

    # A cluster with its own label takes precedence over a merge rule.
    if cluster_number in clusters_def:
        target_id = cluster_number
    elif cluster_number in clusters_merge:
        target_id = clusters_merge[cluster_number]
    else:
        print("Solution %d from cluster %d was not assigned" % (doc_id, cluster_number))
        continue

    solution = Solution.objects.get(pk=doc_id)
    if target_id not in clusters_by_id:
        clusters_by_id[target_id] = Cluster.objects.get(pk=target_id)
    solution.cluster = clusters_by_id[target_id]
    solution.save()

Solution 770 from cluster 1 was not assigned
Solution 772 from cluster 7 was not assigned
Solution 786 from cluster 1 was not assigned
Solution 806 from cluster 5 was not assigned
Solution 808 from cluster 3 was not assigned
