# Experiment 26

- Min DF: 0.05
- Binary: True
- Vectorizer: Count
- Method: LDA
- Best k: 12

In [6]:
# Input
# from db import PythonProblems
import io

# DB
from questions.models import Problem, Solution, Cluster
import psycopg2

# Helpers
import numpy as np
import pickle
import base64

# Preprocessing
from tokenizer import create_bag_of_words
from sklearn.feature_extraction.text import CountVectorizer

# Learning
from clustering import Clustering
from analyzer import python_analyzer

In [7]:
## Cleaning database
# Problems with an id above `last_id` are excluded from this experiment.
last_id = 132

# Flag every solution of an excluded problem so later queries skip it.
problems_ignored = Problem.objects.filter(id__gt=last_id)
Solution.objects.filter(problem__in=problems_ignored).update(ignore=True)
print("Problems to be ignored: %d" % problems_ignored.count())

problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related() joins the problem row into the same query, so reading
# `sol.problem.content` in the loop below does not cost one query per solution.
solutions_obj = (
    Solution.objects
    .filter(problem__in=problems, ignore=False)
    .select_related('problem')
    .order_by('id')
)
print("Solutions to be used: %d" % solutions_obj.count())

# Parallel lists: position i of each list describes the same solution.
docs_id = []
questions = []
solutions = []

for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

# Count what was actually collected instead of querying the database again.
print("Got %d documents" % len(docs_id))

Problems to be ignored: 610
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


# Connect to DB

In [8]:
# Settings for the local "machineteaching" PostgreSQL database.
# NOTE(review): these values are hard-coded in the notebook; consider reading
# them from the environment before sharing it.
db_settings = {
    "user": "machineteaching",
    "password": "",
    "host": "localhost",
    # "port": "5432",
    "database": "machineteaching",
}
connection = psycopg2.connect(**db_settings)
# With autocommit on, no explicit commit() is needed after a statement.
connection.autocommit = True
cursor = connection.cursor()

In [9]:
def get_where_items(exp_id):
    """Fetch the stored configuration rows of one experiment.

    Runs a SELECT on ``experiments_solution`` through the module-level
    ``cursor`` and returns everything it yields.

    Args:
        exp_id: value matched against the ``experiment_id`` column.

    Returns:
        The result of ``cursor.fetchall()``; each row holds, in order,
        vectorizer, min_df, is_binary, distance, method, dataset, k, model, X.
    """
    cols = ["vectorizer", "min_df", "is_binary", "distance", "method", "dataset", "k", "model", "X"]
    # The column names come from the fixed list above, so formatting them into
    # the statement is safe. The experiment id is passed as a bound parameter
    # (the remaining %s placeholder) instead of being pasted into the SQL text.
    query = "SELECT {} from experiments_solution where experiment_id = %s".format(", ".join(cols))
    cursor.execute(query, (exp_id,))
    return cursor.fetchall()

def analyze(solutions, where_items, exp_id):
    """Re-run the clustering described by a stored experiment row.

    Args:
        solutions: documents handed to ``create_bag_of_words``.
        where_items: rows as returned by ``get_where_items``; only the first
            row is read.
        exp_id: not used by this function; kept so existing calls still work.

    Returns:
        The second of the three values returned by the clustering method named
        in the row (bound to ``document_topic`` here).
    """
    row = where_items[0]

    # NOTE(review): eval() and pickle.loads() execute whatever the database row
    # contains. This is only acceptable while experiments_solution is trusted.
    vectorizer_cls = eval(row[0])
    min_df = row[1]
    is_binary = row[2]
    distance = row[3]
    method = row[4]
    k = row[6]
    model_db = pickle.loads(base64.b64decode(row[7]))

    # Only the feature matrix is needed; the stored X column is not read at all.
    train_data_features, _, _ = create_bag_of_words(
        solutions, vectorizer_cls, binary=is_binary, min_df=min_df
    )
    clustering = Clustering(train_data_features, k, metric=distance)
    # Seed the new run with the random_state of the stored model.
    clustering.seed = model_db.random_state

    # `method` is the name of the Clustering method to call (e.g. 'lda').
    _, document_topic, _ = getattr(clustering, method)()

    return document_topic

# Updating DB

In [10]:
# Load the stored settings of the experiment under study and show the first
# seven columns (everything before the pickled model and the X column).
exp_id = 26
where_items = get_where_items(exp_id)
print("Conditions", where_items[0][:7], sep="\n")

# Re-run the clustering; each document gets the topic with the highest score.
document_topic = analyze(solutions, where_items, exp_id)
document_clusters = np.argmax(document_topic, axis=1)

Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'solution_all', 12)




### Create clusters

In [25]:
# Clusters to keep, keyed by the id used for the Cluster row, with their labels.
clusters_def = {
    4: "String manipulation",
    6: "Math functions",
    8: "Conditional structure",
    10: "List loops",
    12: "Math and string loops",
}

# Save one Cluster row per entry.
for cluster_id, cluster_label in clusters_def.items():
    Cluster(id=cluster_id, label=cluster_label).save()

### Assign solutions to clusters

In [33]:
# Detach every solution that currently points at a cluster.
assigned_solutions = Solution.objects.filter(cluster__isnull=False)
for assigned in assigned_solutions:
    assigned.cluster = None
    assigned.save()

In [29]:
# Topics that are folded into one of the clusters in `clusters_def`:
# {topic number: cluster id that absorbs it}.
clusters_merge = {
    2: 4
}

for idx, doc_id in enumerate(docs_id):
    # argmax gives a 0-based topic index; cluster ids are that index + 1.
    topic = int(document_clusters[idx]) + 1

    if topic in clusters_def:
        # The best topic is one of the clusters we keep.
        cluster_pk = topic
    elif topic in clusters_merge:
        # The best topic was merged into another cluster.
        cluster_pk = clusters_merge[topic]
    else:
        # The best topic is neither kept nor merged: use the 2nd best topic.
        # NOTE(review): nothing checks that the 2nd best topic has a Cluster
        # row; if it does not, the lookup below raises Cluster.DoesNotExist.
        print("Solution %d from cluster %d was not assigned" % (doc_id, topic))
        ranked_topics = np.argsort(document_topic[idx])[::-1]
        cluster_pk = int(ranked_topics[1]) + 1
        print("Assigning to 2nd best: %d" % cluster_pk)

    # Persist the assignment for every branch. Previously save() was commented
    # out for the first two branches, so only the fallback rows were stored.
    solution = Solution.objects.get(pk=doc_id)
    solution.cluster = Cluster.objects.get(pk=cluster_pk)
    solution.save()

Solution 770 from cluster 1 was not assigned
Assigning to 2nd best: 6
Solution 772 from cluster 7 was not assigned
Assigning to 2nd best: 8
Solution 786 from cluster 1 was not assigned
Assigning to 2nd best: 6
Solution 806 from cluster 5 was not assigned
Assigning to 2nd best: 6
Solution 808 from cluster 3 was not assigned
Assigning to 2nd best: 6


### Assigning new solutions

In [161]:
def assign_to_cluster(exp_id, solutions, exercise_sol):
    """Score new documents against the model stored for an experiment.

    Args:
        exp_id: experiment whose stored row is loaded via ``get_where_items``.
        solutions: documents passed to ``create_bag_of_words`` to obtain the
            vectorizer (presumably fitted on them; confirm in ``tokenizer``).
        exercise_sol: documents to transform with that vectorizer.

    Returns:
        Whatever the stored model's ``transform`` returns for the vectorized
        ``exercise_sol``.
    """
    config = get_where_items(exp_id)[0]

    # NOTE(review): eval() and pickle.loads() run on values read from the
    # database, so the table must be trusted.
    vectorizer_cls = eval(config[0])
    min_df, is_binary = config[1], config[2]
    stored_model = pickle.loads(base64.b64decode(config[7]))

    # Only the vectorizer (second return value) is needed here.
    _, fitted_vectorizer, _ = create_bag_of_words(
        solutions, vectorizer_cls, binary=is_binary, min_df=min_df
    )
    new_features = fitted_vectorizer.transform(exercise_sol)
    return stored_model.transform(new_features)

# A new exercise solution to score against the stored topics.
exercise_sol = ["""
def intercala(l1, l2):
    return [l1[0], l2[0], l1[1], l2[1], l1[2], l2[2]]
"""]

# NOTE(review): this rebinds `document_topic`, which the cluster-assignment
# cell also reads; re-running that cell after this one would use these scores.
document_topic = assign_to_cluster(exp_id, solutions, exercise_sol)
print(document_topic)

# Topic numbers (index + 1), from the highest score to the lowest.
max_idx = np.argsort(document_topic[0])[::-1]
for topic_idx in max_idx:
    print(topic_idx + 1)

[[0.09901917 0.0064103  0.00641039 0.0064103  0.00641047 0.83687736
  0.00641032 0.00641038 0.00641026 0.00641034 0.00641026 0.00641046]]
6
1
5
12
3
8
10
7
4
2
11
9


In [159]:
def generate(num_tests=10, seed=None):
    """Build random test cases, each a pair of 3-element integer lists.

    Args:
        num_tests: how many test cases to build (default 10).
        seed: optional seed. When given, a private ``random.Random(seed)`` is
            used so the same cases are produced on every call. When ``None``,
            the shared ``random`` module state is used.

    Returns:
        A list of ``[l1, l2]`` pairs; ``l1`` and ``l2`` each hold 3 distinct
        integers sampled from ``range(10)``.
    """
    import random

    rng = random if seed is None else random.Random(seed)
    # l1 is sampled before l2 for every case (list items evaluate left to right).
    return [
        [rng.sample(range(10), 3), rng.sample(range(10), 3)]
        for _ in range(num_tests)
    ]

In [160]:
# Build the random test cases and display them as the cell output.
cases = generate()
cases

[[[7, 0, 3], [7, 4, 0]],
 [[3, 4, 0], [7, 6, 3]],
 [[9, 7, 8], [7, 2, 6]],
 [[1, 6, 9], [4, 3, 9]],
 [[5, 8, 9], [0, 9, 5]],
 [[1, 3, 9], [6, 2, 9]],
 [[7, 0, 3], [6, 0, 2]],
 [[5, 3, 9], [5, 8, 2]],
 [[1, 5, 3], [7, 4, 1]],
 [[8, 1, 2], [9, 3, 6]]]

In [135]:
def solve(tupla):
    """Split the first three items of ``tupla`` into strings and the rest.

    Items at positions 0, 1 and 2 are inspected in order; an item whose type
    is exactly ``str`` goes to the first list, anything else to the second.
    Items past position 2 are ignored, and fewer than three items raises
    ``IndexError``.

    Returns:
        A ``(strings, numeros)`` tuple of lists.
    """
    strings, numeros = [], []

    for position in range(3):
        item = tupla[position]
        bucket = strings if type(item) is str else numeros
        bucket.append(item)

    return strings, numeros
# Print each dataset followed by what solve() returns for it.
# NOTE(review): solve() takes a single argument, so `*dataset` only works when
# a dataset holds exactly one item (as in the recorded output of this cell).
# The `cases` built by generate() in this notebook hold two lists each, which
# would raise TypeError here — confirm which `cases` this cell expects.
for dataset in cases:
    print(dataset)
    print(solve(*dataset))

[('humildade', 10, 1.7016920265272026)]
(['humildade'], [10, 1.7016920265272026])
[(8.607756810640138, 6, (4+5j))]
([], [8.607756810640138, 6, (4+5j)])
[('entretendo', (6+4j), 7)]
(['entretendo'], [(6+4j), 7])
