# Experiment 26

- Min DF: 0.05
- Binary: True
- Vectorizer: Count
- Method: LDA
- Best k: 12

In [1]:
# Input
# from db import PythonProblems
import io

# DB
from questions.models import Problem, Solution, Cluster
import psycopg2

# Helpers
import os
import random
import numpy as np
import pickle
import base64

# Preprocessing
from tokenizer import create_bag_of_words
from sklearn.feature_extraction.text import CountVectorizer

# Learning
from clustering import Clustering
from analyzer import python_analyzer

In [39]:
## Cleaning database
# Problems with an id above `last_id` are left out of this experiment.
last_id = 132
problems = Problem.objects.filter(id__gt=last_id)
# Side effect: flags every solution of the excluded problems in the database.
# update() returns a row count rather than a queryset, so the result is
# intentionally not bound to a name.
Solution.objects.filter(problem__in=problems).update(ignore=True)
print("Problems to be ignored: %d" % problems.count())

problems = Problem.objects.filter(id__lte=last_id)
# problems = Problem.objects.all()
print("Problems to be used: %d" % problems.count())

# select_related pulls each solution's problem in the same query, so reading
# sol.problem.content in the loop below does not hit the database per row.
solutions_obj = (
    Solution.objects
    .filter(problem__in=problems, ignore=False)
    .select_related('problem')
    .order_by('id')
)
# solutions_obj = Solution.objects.all().order_by('id')
print("Solutions to be used: %d" % solutions_obj.count())

# Parallel lists: position i in each list refers to the same solution.
docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

print("Got %d documents" % len(docs_id))

Problems to be ignored: 597
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


# Connect to DB

In [3]:
# Open the PostgreSQL connection used by the raw SQL helpers in this notebook.
# The password is taken from the MACHINETEACHING_DB_PASSWORD environment
# variable so the credential does not have to live in the notebook; the
# original literal is kept as the fallback when the variable is unset.
connection = psycopg2.connect(user = "machineteaching",
                                  password = os.environ.get("MACHINETEACHING_DB_PASSWORD", "***REMOVED***"),
                                  host = "localhost",
#                                   port = "5432",
                                  database = "machineteaching")
# With autocommit on, no explicit commit() call is needed after a statement.
connection.autocommit=True
cursor = connection.cursor()

In [4]:
def get_where_items(exp_id):
    """Fetch the stored configuration rows of one experiment.

    Args:
        exp_id: value matched against ``experiments_solution.experiment_id``.

    Returns:
        The list of rows from ``cursor.fetchall()``; each row holds, in order,
        vectorizer, min_df, is_binary, distance, method, dataset, k, model, X.
    """
    cols = ["vectorizer", "min_df", "is_binary", "distance", "method", "dataset", "k", "model", "X"]
    # Column names are fixed constants, so formatting them into the statement
    # is safe. The experiment id is sent as a bound parameter instead of being
    # interpolated into the SQL text (``%%s`` survives formatting as ``%s``).
    query = "SELECT %s from experiments_solution where experiment_id = %%s" % ", ".join(cols)
    cursor.execute(query, (exp_id,))
    return cursor.fetchall()

def analyze(solutions, where_items, exp_id):
    """Re-run the clustering described by the first row of ``where_items``.

    Args:
        solutions: documents handed to ``create_bag_of_words``.
        where_items: rows in the column order produced by ``get_where_items``;
            only the first row is used.
        exp_id: experiment id, used only to build the error message.

    Returns:
        The second value (``document_topic``) returned by the selected
        ``Clustering`` method.

    Raises:
        IndexError: if ``where_items`` is empty.
    """
    if not where_items:
        raise IndexError("no stored conditions found for experiment %s" % exp_id)

    row = where_items[0]
    # NOTE(security): eval() resolves the vectorizer class from a name stored
    # in the database; only safe while that table is trusted.
    vectorizer_cls = eval(row[0])
    min_df = row[1]
    is_binary = row[2]
    distance = row[3]
    method = row[4]
    k = row[6]
    # NOTE(security): unpickling executes arbitrary code if the stored blob is
    # not trusted.
    model_db = pickle.loads(base64.b64decode(row[7]))

    train_data_features, _, _ = create_bag_of_words(
        solutions, vectorizer_cls, binary=is_binary, min_df=min_df)
    clustering = Clustering(train_data_features, k, metric=distance)
    # Reuse the stored model's random_state as the seed for this run.
    clustering.seed = model_db.random_state

    # `method` names the Clustering method to call (e.g. 'lda').
    _, document_topic, _ = getattr(clustering, method)()

    return document_topic

# Updating DB

In [5]:
# Load the stored conditions of the experiment under study
exp_id = 26
where_items = get_where_items(exp_id)
print("Conditions")
print(where_items[0][:7])

# Score every document, then keep the index of its highest-scoring topic
document_topic = analyze(solutions, where_items, exp_id)
document_clusters = np.argmax(document_topic, axis=1)

Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'solution_all', 12)


### Create clusters

In [25]:
# Labels of the clusters to store, keyed by cluster id
clusters_def = {
    4: "String manipulation",
    6: "Math functions",
    8: "Conditional structure",
    10: "List loops",
    12: "Math and string loops"
}

# Save one Cluster row per entry, using the dictionary key as its id
for cluster_id in clusters_def:
    cluster = Cluster(id=cluster_id, label=clusters_def[cluster_id])
    cluster.save()

### Assign solutions to clusters

In [33]:
# Detach every solution that currently points at a cluster
assigned_solutions = Solution.objects.filter(cluster__isnull=False)
for assigned in assigned_solutions:
    assigned.cluster = None
    assigned.save()

In [29]:
# Topics folded into another cluster: {topic id: target cluster id}
clusters_merge = {
    2: 4
}

for idx, doc_id in enumerate(docs_id):
    # argmax gives 0-based topic indices, while cluster ids start at 1
    topic_id = int(document_clusters[idx]) + 1

    if topic_id in clusters_def:
        # Assigning docs to valid clusters
        cluster_id = topic_id
    elif topic_id in clusters_merge:
        cluster_id = clusters_merge[topic_id]
    else:
        # Assign 2nd best value
        print("Solution %d from cluster %d was not assigned" % (doc_id, topic_id))
        max_idx = np.argsort(document_topic[idx])[::-1]
        cluster_id = int(max_idx[1]) + 1
        # NOTE(review): the runner-up topic is not checked against
        # clusters_def / clusters_merge; if no Cluster row has that id the
        # lookup below raises Cluster.DoesNotExist.
        print("Assigning to 2nd best: %d" % cluster_id)

    solution = Solution.objects.get(pk=doc_id)
    solution.cluster = Cluster.objects.get(pk=cluster_id)
    # Persist in every branch: previously only the 2nd-best fallback was
    # saved, so direct and merged assignments never reached the database.
    solution.save()

Solution 770 from cluster 1 was not assigned
Assigning to 2nd best: 6
Solution 772 from cluster 7 was not assigned
Assigning to 2nd best: 8
Solution 786 from cluster 1 was not assigned
Assigning to 2nd best: 6
Solution 806 from cluster 5 was not assigned
Assigning to 2nd best: 6
Solution 808 from cluster 3 was not assigned
Assigning to 2nd best: 6


### Assigning new solutions

In [164]:
def assign_to_cluster(exp_id, solutions, exercise_sol):
    """Score new documents with the model stored for an experiment.

    The stored conditions are read with ``get_where_items``; a vectorizer is
    obtained from ``create_bag_of_words(solutions, ...)`` under those
    conditions, ``exercise_sol`` is transformed with it, and the result is
    passed to the stored model's ``transform``.

    Args:
        exp_id: experiment whose stored conditions and model are used.
        solutions: documents handed to ``create_bag_of_words``.
        exercise_sol: iterable of new documents to score.

    Returns:
        Whatever the stored model's ``transform`` returns for the new documents.
    """
    row = get_where_items(exp_id)[0]
    # NOTE: eval() and pickle.loads() both act on values read from the database.
    vectorizer_cls = eval(row[0])
    min_df, is_binary = row[1], row[2]
    stored_model = pickle.loads(base64.b64decode(row[7]))
    _features, fitted_vectorizer, _names = create_bag_of_words(
        solutions, vectorizer_cls, binary=is_binary, min_df=min_df)
    return stored_model.transform(fitted_vectorizer.transform(exercise_sol))

# New solution to classify, kept verbatim as submitted
exercise_sol = ["""
def colisao(ret1x1,ret1y1,ret1x2,ret1y2,ret2x1,ret2y1,ret2x2,ret2y2):
    if ret1y1>ret2y1 and ret1y1>ret2y2 and ret1y2>ret2y1 and ret1y2>ret2y2:
        return False
    elif ret1x1>ret2x1 and ret1x1>ret2x2 and ret1x2>ret2x1 and ret1x2>ret2x1:
        return False
    elif ret1y1<ret2y1 and ret1y1<ret2y2 and ret1y2<ret2y1 and ret1y2<ret2y2:
        return False
    elif ret1x1<ret2x1 and ret1x1<ret2x2 and ret1x2<ret2x1 and ret1x2<ret2x1:
        return False
    else:
        return True
"""]

# Score the new solution against the stored model
document_topic = assign_to_cluster(exp_id, solutions, exercise_sol)
print(document_topic)

# Print the 1-based topic ids ordered from highest to lowest score
max_idx = np.argsort(document_topic[0])[::-1]
for topic_idx in max_idx:
    print(topic_idx + 1)

[[0.0026042  0.00260418 0.00260422 0.00260418 0.00260418 0.00260424
  0.00260427 0.97135381 0.00260417 0.00260421 0.00260417 0.00260419]]
8
7
6
3
10
1
12
4
2
5
11
9


In [147]:
def generate():
    """Build ``[word, index]`` test cases from a fixed word list.

    The words are shuffled; each one is paired with a random position inside
    the word that is then multiplied by 1, 2 or 3, so the resulting index may
    fall outside the word.

    Returns:
        A list with one ``[word, index]`` pair per word.
    """
    words = [
        'aborto', 'advirdes', 'adviéreis', 'alcatraz', 'amoladura',
        'anticolérico', 'arborista', 'arpear', 'atomismo', 'barão',
        'bit', 'boateiro', 'borbulhento', 'cabular', 'cacheiro',
        'cagaçal', 'caleira', 'candelabro', 'carcereiro', 'centro',
        'cirzo', 'cobiçoso', 'confessor', 'convêm', 'crestar',
        'crápula', 'cólico', 'delirar', 'delínquo', 'destingir',
        'divindade', 'dragagem', 'drogaria', 'duodecénio', 'dúbio',
        'entretendo', 'equitativo', 'escorrer', 'espaldar', 'esticanço',
        'europeízem', 'exultar', 'famigerado', 'fanático', 'festinhas',
        'filonianos', 'gauchai', 'generante', 'higienizar', 'humildade',
        'imperturbado', 'inclusão', 'intercelular', 'intravável', 'isotónico',
        'Kennedy', 'lamentar', 'linguístico', 'lixoso', 'luso',
        'Lérida', 'Mark', 'matutar', 'multimédia', 'musicologia',
        'Mussolini', 'nevoeirada', 'ogivado', 'OGMA', 'paludismo',
        'panelada', 'particularista', 'penúltimo', 'persecução', 'pintalgar',
        'predilecção', 'previne', 'regresso', 'repelir', 'ressentir',
        'saberdes', 'sagaz', 'Schiller', 'segue', 'sigilar',
        'somatotropas', 'sorrir', 'subtractivo', 'sustiveram', 'sustivéreis',
        'séquito', 'terreno', 'trajo', 'traziam', 'troçar',
        'vagão', 'virginalizar', 'vivificativo', 'voraz',
    ]
    random.shuffle(words)

    tests = []
    for word in words:
        # Draw order matters for seeded runs: position, then factor, then scale.
        position = random.randrange(0, len(word))
        factor = random.choice([2, 3])
        position *= random.choice([1, factor])
        tests.append([word, position])
    return tests

In [162]:
def generate(num_tests=5):
    """Generate rectangle-pair test cases for the ``colisao`` exercise.

    Each case is ``[r1x1, r1y1, r1x2, r1y2, r2x1, r2y1, r2x2, r2y2]`` with
    integer coordinates in 0..9 and, for both rectangles, ``x1 < x2`` and
    ``y1 < y2``. Cases alternate by position: even positions hold rectangles
    that collide, odd positions hold rectangles that do not.

    Args:
        num_tests: number of cases to produce (defaults to the original 5).

    Returns:
        A list of ``num_tests`` cases.
    """
    def calculate_colisao(ret1x1,ret1y1,ret1x2,ret1y2,ret2x1,ret2y1,ret2x2,ret2y2):
        """Return False when rectangle 1 lies strictly to one side of rectangle 2."""
        if ret1y1>ret2y1 and ret1y1>ret2y2 and ret1y2>ret2y1 and ret1y2>ret2y2:
            return False
        # The last comparison used ret2x1 twice; it now checks ret2x2, like the
        # y-axis branches. Results are unchanged for the rectangles generated
        # here, because x1 < x2 makes the corrected comparison redundant.
        elif ret1x1>ret2x1 and ret1x1>ret2x2 and ret1x2>ret2x1 and ret1x2>ret2x2:
            return False
        elif ret1y1<ret2y1 and ret1y1<ret2y2 and ret1y2<ret2y1 and ret1y2<ret2y2:
            return False
        elif ret1x1<ret2x1 and ret1x1<ret2x2 and ret1x2<ret2x1 and ret1x2<ret2x2:
            return False
        else:
            return True

    def random_rect():
        """Draw ``[x1, y1, x2, y2]`` with x1 < x2 and y1 < y2."""
        x1 = random.randrange(9)
        y1 = random.randrange(9)
        x2 = random.randrange(10)
        y2 = random.randrange(10)
        # Redraw the far corner until it lies strictly beyond the near one.
        while x2 <= x1:
            x2 = random.randrange(10)
        while y2 <= y1:
            y2 = random.randrange(10)
        return [x1, y1, x2, y2]

    tests = []
    while len(tests) < num_tests:
        test_case = random_rect() + random_rect()
        resposta = calculate_colisao(*test_case)

        # Even positions need a collision, odd positions need none; discard
        # draws that do not match the slot being filled.
        wants_collision = len(tests) % 2 == 0
        if resposta != wants_collision:
            continue
        tests.append(test_case)
    return tests

In [163]:
# Display a sample of generated test cases
generate()

[[5, 2, 9, 6, 8, 1, 9, 5],
 [8, 5, 9, 7, 1, 0, 6, 8],
 [7, 0, 9, 3, 2, 3, 9, 4],
 [0, 8, 2, 9, 3, 4, 6, 8],
 [7, 1, 8, 5, 6, 4, 8, 8]]

In [55]:
# Print every generated case as a list of ints.
# The original loop called .replace()/.split() on each case, which only works
# when generate() yields space-separated strings. Sequence cases are now
# accepted as well, so the cell no longer raises AttributeError when
# generate() returns lists of numbers.
cases = generate()
for case in cases:
    if isinstance(case, str):
        case = case.replace('\n','').split(' ')
    print([int(elem) for elem in case])

[5, 4]
[2, 1]
[9, 0]
[1, 0]
[3, 5]
[5, 0]
[3, 6]
[9, 2]
[4, 0]
[4, 8]
[7, 9]
[9, 2]
[8, 5]
[3, 8]
[5, 8]
[5, 7]
[9, 0]
[7, 7]
[2, 5]
[2, 5]
