In [2]:
#DB
from questions.models import Solution, Cluster
from questions.models import Problem
import psycopg2

# Helpers
import os
import numpy as np
import pickle
import base64
import pandas as pd

# Preprocessing
import tokenize
from tokenizer import create_bag_of_words
from sklearn.feature_extraction.text import CountVectorizer
from vectorizer import NCutVectorizer

# Learning
from clustering import Clustering
from analyzer import python_analyzer

In [3]:
## Cleaning database
# Problems whose id is above this cutoff are left out of the analysis.
last_id = 132

# NOTE: this step WRITES to the database: every solution that belongs to an
# excluded problem is flagged with ignore=True.
ignored_problems = Problem.objects.filter(id__gt=last_id)
# QuerySet.update() returns the number of matched rows, not a queryset.
n_flagged = Solution.objects.filter(problem__in=ignored_problems).update(ignore=True)
print("Problems to be ignored: %d" % ignored_problems.count())

problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related('problem') fetches each solution's problem in the same query,
# so reading sol.problem.content below does not hit the database once per row.
solutions_obj = (
    Solution.objects
    .filter(problem__in=problems, ignore=False)
    .select_related('problem')
    .order_by('id')
)
print("Solutions to be used: %d" % solutions_obj.count())

docs_id = []
questions = []
solutions = []

# Fill separated structures (parallel lists, all ordered by solution id)
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

# Report what was actually collected rather than asking the database again.
print("Got %d documents" % len(docs_id))

Problems to be ignored: 591
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


# Connect to DB

In [4]:
# Open the PostgreSQL connection used to read the stored experiment results.
# The password is taken from the MACHINETEACHING_DB_PASSWORD environment
# variable so that no credential is saved inside the notebook; a KeyError
# here means the variable has not been exported before starting the kernel.
connection = psycopg2.connect(
    user="machineteaching",
    password=os.environ["MACHINETEACHING_DB_PASSWORD"],
    host="localhost",
    # port="5432",
    database="machineteaching",
)
# With autocommit on, each statement is committed as soon as it is executed.
connection.autocommit = True
cursor = connection.cursor()

In [12]:
def get_where_items(exp_id, db_cursor=None):
    """Fetch the stored conditions of one experiment.

    Args:
        exp_id: value matched against ``experiments_solution.experiment_id``.
        db_cursor: DB-API cursor to run the query on. Defaults to the
            notebook-level ``cursor``.

    Returns:
        The result of ``fetchall()``: one tuple per matching row, with the
        columns in this order: vectorizer, min_df, is_binary, distance,
        method, dataset, k, model, X.
    """
    if db_cursor is None:
        db_cursor = cursor

    cols = ["vectorizer", "min_df", "is_binary", "distance", "method", "dataset", "k", "model", "X"]
    # Column names are fixed constants, so joining them into the statement is
    # safe. The experiment id is passed as a bound parameter (the trailing %s
    # is the driver's placeholder, not Python string formatting).
    query = "SELECT " + ", ".join(cols) + " from experiments_solution where experiment_id = %s"
    db_cursor.execute(query, (exp_id,))
    return db_cursor.fetchall()

def _resolve_vectorizer(name):
    """Return the vectorizer class for a class name stored in the database.

    Only the vectorizers imported by this notebook are accepted; this replaces
    calling ``eval`` on a string that comes from the database.

    Raises:
        ValueError: if ``name`` is not one of the known vectorizers.
    """
    known = {
        "CountVectorizer": CountVectorizer,
        "NCutVectorizer": NCutVectorizer,
    }
    if name not in known:
        raise ValueError(
            "Unknown vectorizer %r (expected one of: %s)" % (name, ", ".join(sorted(known)))
        )
    return known[name]


def analyze(solutions, where_items, exp_id):
    """Re-run the clustering of a stored experiment on ``solutions``.

    Args:
        solutions: documents passed to ``create_bag_of_words``.
        where_items: rows as returned by ``get_where_items``; only the first
            row is used.
        exp_id: experiment id, used only in the error message.

    Returns:
        Tuple ``(clustering, method, feature_names, model)`` where
        ``clustering`` is the ``Clustering`` instance, ``method`` the name of
        the clustering method that was called on it, ``feature_names`` the
        names returned by ``create_bag_of_words`` and ``model`` the first
        element returned by the clustering method.

    Raises:
        ValueError: if ``where_items`` is empty or names an unknown vectorizer.
    """
    if not where_items:
        raise ValueError("No stored conditions found for experiment %s" % exp_id)

    # Column order follows get_where_items: vectorizer, min_df, is_binary,
    # distance, method, dataset, k, model (the trailing X column is not used).
    (vectorizer_name, min_df, is_binary, distance,
     method, _dataset, k, model_b64) = where_items[0][:8]

    vectorizer_cls = _resolve_vectorizer(vectorizer_name)
    # SECURITY NOTE: unpickling executes arbitrary code if the payload was
    # tampered with; only run this against a database you trust.
    model_db = pickle.loads(base64.b64decode(model_b64))
    print(k)

    train_data_features, vectorizer, feature_names = create_bag_of_words(
        solutions, vectorizer_cls, binary=is_binary, min_df=min_df)
    clustering = Clustering(train_data_features, k, metric=distance)
    # Reuse the random state of the stored model for this run.
    clustering.seed = model_db.random_state

    # The method name stored in the database (e.g. 'nmf', 'lda') selects
    # which Clustering method is called.
    model, _document_topic, _word_topic = getattr(clustering, method)()

    print("Count per class:")
    # Each document is assigned to its highest-scoring topic.
    clusters = clustering.document_topic.argmax(axis=1)
    counts = np.unique(clusters, return_counts=True)
    print(counts)
    return clustering, method, feature_names, model

# Experiment 1463

- Min DF: 0.35
- Binary: True
- Vectorizer: NCut
- Method: NMF
- Best k: 7

In [32]:
# Load the stored conditions of experiment 1463 and show the first seven columns
exp_id = 1463
where_items = get_where_items(exp_id)
print("Conditions")
print(where_items[0][:7])

# Re-run the clustering and assign every solution to its strongest topic
clustering, method, feature_names, model = analyze(solutions, where_items, exp_id)
clusters = clustering.document_topic.argmax(axis=1)

# Topic labels are 1-based while cluster indices are 0-based
topic3 = np.where(clusters == 2)
topic4 = np.where(clusters == 3)

print("Topic 3")
for item in np.asarray(solutions)[topic3]:
    print(item)

print("-" * 50)

print("Topic4")
for item in np.asarray(solutions)[topic4]:
    print(item)

Conditions
('NCutVectorizer', 0.35, True, 'euclidean', 'nmf', 'solution_all', 7)
7
Count per class:
(array([0, 1, 2, 3]), array([ 5,  4, 26, 19]))
Topic 3
def list_ends(a_list):
    return [a_list[0], a_list[len(a_list)-1]]
def reverseWord(w):
  return ' '.join(w.split()[::-1])
def SumFunction(number1, number2):
	return number1+number2
def int2str(n):
	return str(n)
def sum_str(s1,s2):
	return int(s1)+int(s2)
def str_concat(s1,s2):
	return s1+s2
def palindrome(word):
    rvs = word[::-1]
    if word == rvs:
        return True
    else:
        return False
def reverse(word):
    x = ''
    for i in range(len(word)):
        x += word[len(word)-1-i]
    return x == word
def even(numlist):
    return [item for item in numlist if not item % 2]

def vote_and_retire(age):
    sentence = ""
    if age >= 16:
        sentence = "You are old enough to vote."
    else:
        ageToVote = 16 - age
        sentence = "You can vote in {0} years.".format(ageToVote)
         
    if



In [36]:
# Load the stored conditions of experiment 26 and show the first seven columns
exp_id = 26
where_items = get_where_items(exp_id)
print("Conditions")
print(where_items[0][:7])

# Re-run the clustering and assign every solution to its strongest topic
clustering, method, feature_names, model = analyze(solutions, where_items, exp_id)
clusters = clustering.document_topic.argmax(axis=1)

# Print the solutions of a few selected topics, one separator after each
for topic in (8, 12, 10, 6, 4):
    print("Topic %s" % topic)
    # Topic labels are 1-based while cluster indices are 0-based
    topic_idx = np.where(clusters == topic - 1)
    for item in np.asarray(solutions)[topic_idx]:
        print(item)
    print("-" * 50)

Conditions
('CountVectorizer', 0.05, True, 'euclidean', 'lda', 'solution_all', 12)
12
Count per class:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  9, 11]), array([ 2,  1,  1,  2,  1, 13,  1, 14,  7, 12]))
Topic 8
def max_of_three(a,b,c):
     max_3=0
     if a>b:
         #max_3=a
         if a>c:
             max_3=a
         else:
             max_3=c
     else:
          if b>c:
             max_3=b
          else:
             max_3=c
     return max_3
def palindrome(word):
    rvs = word[::-1]
    if word == rvs:
        return True
    else:
        return False
def is_prime(number):
    '''Returns True for prime numbers, False otherwise'''
    #Edge Cases
    if number == 1:
        prime = False
    elif number == 2:
        prime = True
    #All other primes    
    else:
        prime = True
        for check_number in range(2, int(number/2)+1):
            if number % check_number == 0:
                prime = False
                break
    return prime
def vote_and_retire(age

