In [2]:
#DB
from questions.models import Solution, Cluster
import psycopg2

# Helpers
import os
import numpy as np
import pickle
import base64
import pandas as pd

# Preprocessing
import tokenize
from tokenizer import create_bag_of_words
from sklearn.feature_extraction.text import CountVectorizer
from vectorizer import NCutVectorizer

# Learning
from clustering import Clustering
from analyzer import python_analyzer

In [3]:
## Cleaning database
# NOTE(review): `Problem` is not imported in the imports cell of this notebook;
# this cell presumably relies on it already being in the kernel namespace — confirm.
last_id = 132  # problems with an id above this value are excluded from the analysis

# Flag every solution that belongs to an excluded problem.
# `update()` returns the number of rows it changed, not a queryset, so the
# result gets its own name instead of being stored in `solutions_obj`.
problems = Problem.objects.filter(id__gt=last_id)
ignored_count = Solution.objects.filter(problem__in=problems).update(ignore=True)
print("Problems to be ignored: %d" % problems.count())

# Problems that take part in the analysis
# (use Problem.objects.all() here to work with every problem instead).
problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# `select_related("problem")` fetches each solution's problem in the same query,
# so reading `sol.problem.content` in the loop below does not hit the database
# once per solution.
solutions_obj = (
    Solution.objects
    .filter(problem__in=problems, ignore=False)
    .select_related("problem")
    .order_by('id')
)
print("Solutions to be used: %d" % solutions_obj.count())

docs_id = []
questions = []
solutions = []

# Fill separated structures (parallel lists, one entry per solution)
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

# Report what was actually collected
print("Got %d documents" % len(docs_id))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


# Connect to DB

In [39]:
# Connection settings default to the local development database. Each value can
# be overridden through an environment variable so that real credentials never
# have to be written into the notebook.
connection = psycopg2.connect(
    user=os.environ.get("MT_DB_USER", "machineteaching"),
    password=os.environ.get("MT_DB_PASSWORD", ""),
    host=os.environ.get("MT_DB_HOST", "localhost"),
    # port is not passed (an explicit "5432" was previously commented out)
    database=os.environ.get("MT_DB_NAME", "machineteaching"),
)
# With autocommit on, each statement is committed as soon as it is executed
connection.autocommit = True
# Module-level cursor shared by the query helpers defined further down
cursor = connection.cursor()

In [40]:
def get_where_items(exp_id):
    """Fetch the stored configuration rows of one experiment.

    Runs a SELECT on ``experiments_solution`` through the module-level
    ``cursor`` and returns whatever ``cursor.fetchall()`` yields.

    Args:
        exp_id: value matched against the ``experiment_id`` column.

    Returns:
        The fetched rows; each row holds, in order: vectorizer, min_df,
        is_binary, distance, method, dataset, k, model, X.
    """
    cols = ["vectorizer", "min_df", "is_binary", "distance", "method", "dataset", "k", "model", "X"]
    # The column names are fixed literals, so they can be placed in the SQL text.
    # The experiment id is sent as a bound parameter (the remaining "%s" is the
    # driver's placeholder) instead of being formatted into the statement.
    query = "SELECT {} from experiments_solution where experiment_id = %s".format(", ".join(cols))
    cursor.execute(query, (exp_id,))
    return cursor.fetchall()

def analyze(solutions, where_items, exp_id):
    """Re-run the clustering described by a stored experiment row.

    Only the first row of ``where_items`` is used. Prints ``k`` and the number
    of documents assigned to each cluster.

    Args:
        solutions: documents passed to ``create_bag_of_words``.
        where_items: rows in the column order produced by ``get_where_items``
            (vectorizer, min_df, is_binary, distance, method, dataset, k,
            model, X).
        exp_id: not used by this function; kept so existing calls stay valid.

    Returns:
        Tuple ``(clustering, method, feature_names, model)``.

    Raises:
        IndexError: if ``where_items`` is empty.
    """
    if not where_items:
        # Same exception type the bare indexing used to raise, with a clearer message
        raise IndexError("analyze() received no experiment rows in where_items")

    row = where_items[0]
    # SECURITY NOTE(review): the vectorizer name comes from the database and is
    # resolved with eval(); this is only acceptable while that table is trusted.
    v = eval(row[0])
    m = row[1]
    b = row[2]
    dist = row[3]
    method = row[4]
    k = row[6]
    # SECURITY NOTE(review): unpickling database content can execute arbitrary
    # code; only safe while the stored models are trusted.
    model_db = pickle.loads(base64.b64decode(row[7]))
    print(k)

    train_data_features, _, feature_names = create_bag_of_words(solutions, v, binary=b, min_df=m)
    clustering = Clustering(train_data_features, k, metric=dist)
    # Reuse the random state of the stored model for this run
    clustering.seed = model_db.random_state

    # `method` names the Clustering method to call; only the fitted model is
    # needed from the three values it returns.
    model, _, _ = getattr(clustering, method)()

    print("Count per class:")
    # Column index of the largest value in each row of document_topic
    clusters = clustering.document_topic.argmax(axis=1)
    counts = np.unique(clusters, return_counts=True)
    print(counts)
    return clustering, method, feature_names, model

# Experiment 1463

- Min DF: 0.35
- Binary: True
- Vectorizer: NCut
- Method: NMF
- Best k: 7

In [45]:
# Load the stored configuration of the experiment and show the first seven
# columns of its first row
exp_id = 1463
where_items = get_where_items(exp_id)
print("Conditions")
print(where_items[0][:7])

# Re-run the clustering and label each row with the column of its largest value
clustering, method, feature_names, model = analyze(solutions, where_items, exp_id)
clusters = clustering.document_topic.argmax(axis=1)

solutions_arr = np.asarray(solutions)
separator = "-" * 50
for topic in range(1, 5):
    print("Topic %s" % topic)
    # Topics are displayed 1-based while cluster labels are 0-based
    topic_idx = np.where(clusters == topic - 1)
    print(clustering.document_topic[topic_idx])
    for item in solutions_arr[topic_idx]:
        print(item)
    print(separator)

Conditions
('NCutVectorizer', 0.35, True, 'euclidean', 'nmf', 'solution_all', 7)
7
Count per class:
(array([0, 1, 2, 3]), array([ 5,  4, 26, 19]))
Topic 1
[[0.30087489 0.03129181 0.29385347 0.05339903 0.02113867 0.19884436
  0.22235965]
 [0.25829213 0.2334272  0.2506256  0.05756596 0.20940767 0.15421292
  0.16550981]
 [0.27626643 0.         0.04972322 0.27136952 0.19570072 0.13681065
  0.21572302]
 [0.27626643 0.         0.04972322 0.27136952 0.19570072 0.13681065
  0.21572302]
 [0.27167922 0.03504064 0.18950703 0.11527518 0.19935412 0.1898193
  0.21632854]]
def hex2dec(hexString):
    denary = 0
    lengthHex = len(hexString)
    for element in range(lengthHex):
        hexSeg = hexString[element]
        if hexSeg == 'A':
            hexSeg = 10
        elif hexSeg == 'B':
            hexSeg = 11
        elif hexSeg == 'C':
            hexSeg = 12
        elif hexSeg == 'D':
            hexSeg = 13
        elif hexSeg == 'E':
            hexSeg = 14
        elif hexSeg



In [44]:
# Load the stored configuration of the experiment and show the first seven
# columns of its first row
exp_id = 26
where_items = get_where_items(exp_id)
print("Conditions")
print(where_items[0][:7])

# Re-run the clustering and label each row with the column of its largest value
clustering, method, feature_names, model = analyze(solutions, where_items, exp_id)
clusters = clustering.document_topic.argmax(axis=1)

solutions_arr = np.asarray(solutions)
separator = "-" * 50
# Walks topics 1..12; a narrower selection ([8, 12, 10, 6, 4]) was used before
for topic in range(1, 13):
    print("Topic %s" % topic)
    # Topics are displayed 1-based while cluster labels are 0-based
    topic_idx = np.where(clusters == topic - 1)
    for item in solutions_arr[topic_idx]:
        print(item)
    print(separator)

Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'solution_all', 12)
12
Count per class:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 11]), array([ 2,  1,  1,  2,  1, 13,  1, 14,  7, 12]))
Topic 1
def gen_fib(number):
    i = 1
    if number == 0:
        fib = []
    elif number == 1:
        fib = [1]
    elif number == 2:
        fib = [1,1]
    elif number > 2:
        fib = [1,1]
        while i < (number - 1):
            fib.append(fib[i] + fib[i-1])
            i += 1

    return fib
def bin2hex(binaryString):
    hexA = binaryString[:4]
    hexB = binaryString[4:]
    finalhex = ""
    for eachSegment in (hexA, hexB):
        denary = int(eachSegment[0])*8 + int(eachSegment[1])*4 + int(eachSegment[2])*2 + int(eachSegment[3])*1
        if denary == 10:
            denary = 'A'
        elif denary == 11:
            denary = 'B'
        elif denary == 12:
            denary = 'C'
        elif denary == 13:
            denary = 'D'
        elif denary =

