In [1]:
# Helpers
import pickle
import numpy as np
import pandas as pd
import base64
from collections import defaultdict
from itertools import chain

# DB 
import psycopg2
from django.conf import settings

# Learning
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from skbio.stats.distance import anosim
from skbio import DistanceMatrix

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import analyzer
from tokenizer import create_bag_of_words
from vectorizer import NCutVectorizer

### Get solutions from DB

In [2]:
## Select the problems and solutions used in this experiment
# Only problems with id <= last_id take part; solutions flagged `ignore` are skipped.
last_id = 132

problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related fetches each solution's problem and cluster in the same query,
# so the loop below does not hit the database again for every solution.
solutions_obj = (
    Solution.objects
    .filter(problem__in=problems, ignore=False)
    .select_related('problem', 'cluster')
    .order_by('id')
)
print("Solutions to be used: %d" % solutions_obj.count())

# Parallel lists, all in the same (solution id) order
docs_id = []     # solution ids
questions = []   # problem statement of each solution
solutions = []   # solution source text
clusters = []    # id of the cluster each solution was assigned to

for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)
    clusters.append(sol.cluster.id)

# Report what was actually collected instead of issuing another COUNT query
print("Got %d documents" % len(docs_id))

Problems to be used: 132
Solutions to be used: 54
Got 54 documents


In [3]:
# Open a raw psycopg2 connection using the credentials of Django's default database.
db_settings = settings.DATABASES["default"]
connection = psycopg2.connect(
    user=db_settings["USER"],
    password=db_settings["PASSWORD"],
    host=db_settings["HOST"],
    port=db_settings["PORT"],
    database=db_settings["NAME"],
)
# Every statement is committed immediately; no explicit transaction handling below.
connection.autocommit = True
cursor = connection.cursor()

In [4]:
def get_where_items(exp_id, cols, table):
    """Fetch the requested columns of every row of ``table`` belonging to an experiment.

    Args:
        exp_id: value matched against the ``experiment_id`` column.
        cols: iterable of column names to select.
        table: name of the table to query.

    Returns:
        List of row tuples as returned by ``cursor.fetchall()``, one value per
        entry of ``cols``.
    """
    # Column and table names cannot be bound as query parameters, so they are
    # interpolated here and must come from trusted code. The experiment id is
    # passed as a bound parameter (note the escaped ``%%s`` placeholder).
    query = "SELECT %s from %s where experiment_id = %%s" % (", ".join(cols), table)
    cursor.execute(query, (exp_id,))
    return cursor.fetchall()

def get_original_q_matrix():
    """Build the reference Q-matrix from the evaluators' concept votes.

    For every solution id in the module-level ``docs_id`` list, a concept is
    kept when its number of votes is at least ``MIN_THRESHOLD`` (50%) of the
    number of distinct users who voted on that solution.

    Returns:
        numpy.ndarray of shape ``(len(docs_id), n_agreed_concepts)`` holding 1
        where the solution (row, in ``docs_id`` order) uses the concept
        (column) and 0 elsewhere. A solution without any vote yields an
        all-zero row.
    """
    MIN_THRESHOLD = 0.5
    concepts = SolutionConcept.objects.all()

    # Number of distinct evaluators per solution
    count_per_solution = dict(concepts.values_list('solution__id').annotate(count=Count('user', distinct=True)))

    # Keep, per solution, only the concepts that reached the agreement threshold
    agreed_concepts = defaultdict(list)
    for solution_id in docs_id:
        # A solution nobody voted on simply ends up with no agreed concept
        max_votes = count_per_solution.get(solution_id, 0)
        concepts_per_solution = dict(concepts.filter(solution__id=solution_id).values_list('concept').annotate(
            count=Count("concept")))
        for concept, value in concepts_per_solution.items():
            if value >= (max_votes * MIN_THRESHOLD):
                agreed_concepts[solution_id].append(concept)

    # Union of agreed concepts, computed once after the loop so it is defined
    # even when docs_id is empty. Column order follows the set iteration order.
    agreed_concepts_set = set(chain.from_iterable(agreed_concepts.values()))
    concept_idx = np.asarray(list(agreed_concepts_set))
    q_matrix = np.zeros((len(docs_id), len(concept_idx)))

    for q_idx, question_id in enumerate(docs_id):
        used_concepts = agreed_concepts.get(question_id, [])
        # Boolean mask over the columns whose concept was agreed for this solution
        q_matrix[q_idx, np.isin(concept_idx, used_concepts)] = 1
    return q_matrix

def transform_data(q_matrix, q_matrix_hat):
    """Compare a reference Q-matrix with an estimated one through row similarities.

    Args:
        q_matrix: reference Q-matrix, one row per question.
        q_matrix_hat: estimated Q-matrix with the same number of rows.

    Returns:
        dict with the pairwise cosine similarity/distance matrices of both
        inputs (``question_*`` and ``question_hat_*``) plus three scalars that
        compare the two similarity matrices: ``error`` (square root of the
        summed squared differences), ``rmse`` (square root of the mean squared
        difference) and ``cmd`` (1 - tr(S·S_hat) / (||S||·||S_hat||)).
    """
    sim_true = cosine_similarity(q_matrix)
    sim_hat = cosine_similarity(q_matrix_hat)

    # Element-wise squared disagreement between the two similarity matrices
    squared_diff = np.power(sim_true - sim_hat, 2)

    norm_product = np.linalg.norm(sim_true) * np.linalg.norm(sim_hat)
    cmd = 1 - np.trace(np.dot(sim_true, sim_hat)) / norm_product

    return {
        'question_similarity': sim_true,
        'question_distance': cosine_distances(q_matrix),
        'question_hat_similarity': sim_hat,
        'question_hat_distance': cosine_distances(q_matrix_hat),
        'error': np.sqrt(np.sum(squared_diff)),
        'rmse': np.sqrt(np.mean(squared_diff)),
        'cmd': cmd,
    }

def calculate_anosim(item, cluster_range=range(2, 20), permutations=9999):
    """Run ANOSIM for a sweep of cluster counts and summarise the significant ones.

    For every cluster count, the estimated distances
    (``item['question_hat_distance']``) are grouped with complete-linkage
    agglomerative clustering, and the resulting labels are tested with ANOSIM
    against the reference distances (``item['question_distance']``).

    Args:
        item: dict holding the ``'question_distance'`` and
            ``'question_hat_distance'`` square distance matrices.
        cluster_range: iterable of cluster counts to try (default 2..19).
        permutations: number of ANOSIM permutations per cluster count.

    Returns:
        dict with the keys ``'Statistical significant (p < 0.1)'`` (bool),
        ``'Agg Concepts'`` (array of the significant cluster counts, or
        ``'--'``) and ``'R Statistic'`` (``"<R> at <cluster count>"`` for the
        largest significant R, or ``'--'``).
    """
    ALPHA = 0.1  # significance level; also spelled out in the result key below
    row = {}
    dm = DistanceMatrix(item['question_distance'])
    cluster_counts = list(cluster_range)
    stats_list = []
    p_values = []
    for n_clusters in cluster_counts:
        # NOTE(review): recent scikit-learn releases renamed `affinity` to
        # `metric`; adjust the keyword when upgrading.
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        affinity='precomputed',
                                        linkage='complete').fit(item['question_hat_distance'])
        stats = anosim(dm, model.labels_, permutations=permutations)
        stats_list.append(stats['test statistic'])
        p_values.append(stats['p-value'])
    cluster_counts = np.asarray(cluster_counts)
    stats_list = np.asarray(stats_list)
    p_values = np.asarray(p_values)

    # Test the boolean mask itself: np.any(np.where(mask)) looks at the index
    # values and is False when the only significant entry sits at position 0.
    significant = p_values < ALPHA
    if significant.any():
        row['Statistical significant (p < 0.1)'] = True
        row['Agg Concepts'] = cluster_counts[significant]
        # Best R among the significant entries only; argmax returns a single
        # position, so ties no longer break the "%d" formatting.
        sig_positions = np.where(significant)[0]
        best = sig_positions[np.argmax(stats_list[sig_positions])]
        row['R Statistic'] = "%.2f at %d" % (stats_list[best], cluster_counts[best])
    else:
        row['Statistical significant (p < 0.1)'] = False
        row['Agg Concepts'] = '--'
        row['R Statistic'] = '--'
    return row

### Retrieve original Q-Matrix

In [5]:
%%time
# Build the reference Q-matrix from the evaluators' votes; the trailing
# expression displays its shape as the cell output.
q_matrix_original = get_original_q_matrix()
q_matrix_original.shape

CPU times: user 156 ms, sys: 5.72 ms, total: 161 ms
Wall time: 6.6 s


(54, 14)

### Retrieve automated Q-Matrix

In [6]:
# Collects one transform_data() result dict per Q-matrix variant compared below.
# Re-run this cell before re-running the cells that append to it.
data = []

In [7]:
%%time
# Rebuild the automated Q-matrix of a stored experiment: re-create the
# bag-of-words features with the stored settings, then apply the stored model.
exp_id = 26
cols = ["vectorizer", "min_df", "is_binary", "model"]
table = "experiments_solution"
# Only the first matching row is used (raises IndexError if the experiment has none).
where_items = get_where_items(exp_id, cols, table)[0]
# SECURITY: eval() executes whatever string is stored in the `vectorizer`
# column; only use against a trusted database.
# presumably the column holds a vectorizer class name — TODO confirm
v = eval(where_items[0])
m = where_items[1]
b = where_items[2]
vectorizer_params={'ngram_range': (1,3)}
train_data_features, vectorizer, _ = create_bag_of_words(solutions, v, binary=b, min_df=m, 
                                                         vectorizer_params=vectorizer_params)

# The model column holds a base64-encoded pickle.
# SECURITY: pickle.loads can execute arbitrary code; trusted database only.
model = where_items[3]
model_db = pickle.loads(base64.b64decode(model))
q_matrix_automated = model_db.transform(train_data_features)
# word_topic = model_db.components_.T
# Appends to `data`: re-running this cell adds a duplicate entry.
data.append(transform_data(q_matrix_original, q_matrix_automated))
q_matrix_automated.shape

CPU times: user 66 ms, sys: 4.54 ms, total: 70.6 ms
Wall time: 456 ms


(54, 12)

In [8]:
# Persist the automated Q-matrix as a pickle for later reuse
with open('data/tese/q_matrix_automated.pkl', mode='wb') as out_file:
    pickle.dump(q_matrix_automated, out_file)

### Retrieve refined Q-Matrix (automated after analysis)
- 1 concept per solution

In [119]:
# One-hot Q-matrix: each solution (row) gets exactly one concept, its cluster.
# Sorting fixes the column order across runs; the dict gives a constant-time
# column lookup instead of a list.index() scan per solution.
cluster_set = sorted(set(clusters))
cluster_column = {cluster: col for col, cluster in enumerate(cluster_set)}
q_matrix_analysis = np.zeros((len(docs_id), len(cluster_set)))
for idx, cluster in enumerate(clusters):
    q_matrix_analysis[idx, cluster_column[cluster]] = 1

# NOTE: appending makes this cell non-idempotent; re-running it adds a
# duplicate entry to `data` and shifts what later positional lookups refer to.
data.append(transform_data(q_matrix_original, q_matrix_analysis))

In [135]:
# Persist the refined (one concept per solution) Q-matrix as a pickle
with open('data/tese/q_matrix_refined.pkl', mode='wb') as out_file:
    pickle.dump(q_matrix_analysis, out_file)

In [126]:
%%time
df = []
for item in data:
    row = calculate_anosim(item)
    df.append(row)

CPU times: user 31.5 s, sys: 0 ns, total: 31.5 s
Wall time: 31.5 s


In [127]:
# Tabulate the ANOSIM rows (a list of dicts, despite the name `df`): one row per entry of `data`.
results = pd.DataFrame(df)

In [128]:
# Display the ANOSIM summary table.
results

Unnamed: 0,Statistical significant (p < 0.1),Agg Concepts,R Statistic
0,True,"[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...",0.20 at 18
1,True,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0.18 at 19


In [129]:
# Step-by-step ANOSIM sweep for data[1], keeping the intermediate arrays as
# globals so the following cells can inspect them.
# presumably data[1] is the refined Q-matrix comparison — holds only if the
# appending cells were each run once, in order.
item = data[1]
dm = DistanceMatrix(item['question_distance'])
row = {}
stats_list = []
p_values = []
for i in range(2, 20):
    # A distance matrix is mathematically symmetric, so the transpose is kept
    # only to leave the clustering input exactly as before.
    model = AgglomerativeClustering(n_clusters=i,
                                    affinity='precomputed',
                                    linkage='complete').fit(item['question_hat_distance'].T)
    stats = anosim(dm, model.labels_, permutations=9999)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])
stats_list = np.asarray(stats_list)
p_values = np.asarray(p_values)

# Test the boolean mask itself: np.any(np.where(mask)) looks at the index
# values and is False when the only significant entry sits at position 0.
significant = p_values < 0.1
if significant.any():
    row['Statistical significant (p < 0.1)'] = True
    row['Agg Concepts'] = np.where(significant)[0]+2
    sig_stats = stats_list[significant]
    # Position (within the full sweep) of the largest significant R; a single
    # index, so ties no longer break the "%d" formatting.
    best_idx = np.where(significant)[0][np.argmax(sig_stats)]
    row['R Statistic'] = "%.2f at %d" % (stats_list[best_idx], best_idx + 2)
else:
    row['Statistical significant (p < 0.1)'] = False
    row['Agg Concepts'] = '--'
    row['R Statistic'] = '--'

In [130]:
# ANOSIM R statistic for each cluster count tried (2..19), in sweep order.
stats_list

array([0.11479647, 0.09937463, 0.14540132, 0.11917605, 0.11783865,
       0.11710581, 0.12025834, 0.12719094, 0.1247419 , 0.1293008 ,
       0.14222523, 0.1538761 , 0.14789013, 0.14788238, 0.15228207,
       0.1445512 , 0.1430687 , 0.17900104])

In [131]:
# ANOSIM p-value for each cluster count tried (2..19), in sweep order.
p_values

array([0.0163, 0.0304, 0.0014, 0.0047, 0.0052, 0.007 , 0.0073, 0.0051,
       0.0088, 0.0074, 0.0051, 0.0056, 0.0062, 0.0075, 0.0078, 0.0106,
       0.0115, 0.0037])

In [132]:
# R statistics of the cluster counts whose p-value is below 0.1
stats_list[p_values < 0.1]

array([0.11479647, 0.09937463, 0.14540132, 0.11917605, 0.11783865,
       0.11710581, 0.12025834, 0.12719094, 0.1247419 , 0.1293008 ,
       0.14222523, 0.1538761 , 0.14789013, 0.14788238, 0.15228207,
       0.1445512 , 0.1430687 , 0.17900104])