In [1]:
# Helpers
import pickle
import numpy as np
import pandas as pd
import base64
from collections import defaultdict
from itertools import chain
import rpy2

# DB 
import psycopg2
from django.conf import settings

# Learning
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from skbio.stats.distance import anosim
from skbio import DistanceMatrix

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import analyzer
from tokenizer import create_bag_of_words
from vectorizer import NCutVectorizer

%load_ext rpy2.ipython

In [2]:
%%R
library(factoextra)
library(proxy)
library(permute)
library(lattice)
library(vegan)
library(tidyverse)

R[write to console]: Loading required package: ggplot2

R[write to console]: Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

R[write to console]: 
Attaching package: ‘proxy’


R[write to console]: The following objects are masked from ‘package:stats’:

    as.dist, dist


R[write to console]: The following object is masked from ‘package:base’:

    as.matrix


R[write to console]: This is vegan 2.5-6

R[write to console]: ── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

R[write to console]: [32m✔[39m [34mtibble [39m 3.0.1     [32m✔[39m [34mdplyr  [39m 1.0.0
[32m✔[39m [34mtidyr  [39m 1.1.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0
[32m✔[39m [34mpurrr  [39m 0.3.4     

R[write to console]: ── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter(

### Get solutions from DB

In [28]:
## Select the problems and solutions used in this analysis
# Only problems with id <= last_id are considered.
last_id = 132
# One-off clean-up kept for reference (not executed here): flag the solutions of
# later problems so the `ignore=False` filter below leaves them out.
# problems = Problem.objects.filter(id__gt=last_id)
# Solution.objects.filter(problem__in=problems).update(ignore=True)

problems = Problem.objects.filter(id__lte=last_id)
print("Problems to be used: %d" % problems.count())

# select_related('problem') loads each solution's problem in the same query, so the
# loop below does not issue one extra query per solution for `sol.problem`.
# NOTE(review): `sol.cluster` is still fetched lazily; add 'cluster' to
# select_related if it is a ForeignKey on Solution - confirm against the model.
solutions_obj = (Solution.objects
                 .filter(problem__in=problems, ignore=False)
                 .select_related('problem')
                 .order_by('problem_id'))
print("Solutions to be used: %d" % solutions_obj.count())

# Parallel lists: position i of every list describes the same solution.
docs_id = []        # solution ids
questions = []      # problem statement of each solution
solutions = []      # solution content
clusters = []       # id of the cluster assigned to each solution
questions_idx = []  # problem id of each solution

# Fill separated structures
for sol in solutions_obj:
    questions_idx.append(sol.problem.id)
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)
    clusters.append(sol.cluster.id)

# Report what was actually collected instead of running another COUNT query.
print("Got %d documents" % len(docs_id))

Problems to be used: 132
Solutions to be used: 54
Got 54 documents


In [29]:
# Open a raw psycopg2 connection with the credentials of Django's default database.
connection = psycopg2.connect(**{
    keyword: settings.DATABASES["default"][setting]
    for keyword, setting in (
        ("user", "USER"),
        ("password", "PASSWORD"),
        ("host", "HOST"),
        ("port", "PORT"),
        ("database", "NAME"),
    )
})
# Autocommit is switched on for this connection; the shared cursor below is the
# one used by get_where_items().
connection.autocommit = True
cursor = connection.cursor()

In [30]:
def get_where_items(exp_id, cols, table):
    """Fetch the requested columns of every row of `table` tied to an experiment.

    Uses the module-level psycopg2 ``cursor``.

    Args:
        exp_id: value compared against the table's ``experiment_id`` column.
        cols: iterable of column names to select.
        table: name of the table to query.

    Returns:
        The list of row tuples returned by ``cursor.fetchall()``, with one value
        per entry of `cols`.
    """
    # Column and table names are SQL identifiers and cannot be sent as bound
    # values, so they are still interpolated: pass trusted, hard-coded names only.
    # The doubled %% leaves a literal %s placeholder in the query for the driver.
    query = "SELECT %s from %s where experiment_id = %%s" % (", ".join(cols), table)
    # exp_id is bound by psycopg2 instead of being pasted into the SQL text.
    cursor.execute(query, (exp_id,))
    return cursor.fetchall()

def get_original_q_matrix():
    """Build the original Q-matrix from the concepts evaluators voted for.

    For every solution id in the module-level ``docs_id`` list, a concept is kept
    when its vote count is at least ``MIN_THRESHOLD`` (50%) of the number of
    distinct users who voted on that solution.

    Returns:
        numpy.ndarray of shape ``(len(docs_id), n_agreed_concepts)`` holding 1 where
        the solution (row, in ``docs_id`` order) uses the concept (column) and 0
        elsewhere. Columns follow the iteration order of the set of agreed concepts.

    Raises:
        KeyError: if a solution in ``docs_id`` has no votes at all.
    """
    # Local import: `Count` is not imported by the notebook's import cell.
    from django.db.models import Count

    MIN_THRESHOLD = 0.5
    # Get voted concepts per solution
    concepts = SolutionConcept.objects.all()
    agreed_concepts = defaultdict(list)

    # Number of distinct voters per solution
    count_per_solution = dict(concepts.values_list('solution__id').annotate(count=Count('user', distinct=True)))

    # Keep only the concepts with agreement (at least 50% of the evaluators voted for it)
    for solution_id in docs_id:
        max_votes = count_per_solution[solution_id]
        concepts_per_solution = dict(concepts.filter(solution__id=solution_id).values_list('concept').annotate(
            count=Count("concept")))
        for concept, value in concepts_per_solution.items():
            if value >= (max_votes * MIN_THRESHOLD):
                agreed_concepts[solution_id].append(concept)

    # Built once after the loop: the original rebuilt it on every iteration and left
    # it undefined (NameError) when docs_id was empty.
    agreed_concepts_set = set(chain.from_iterable(agreed_concepts.values()))

    concept_idx = np.asarray(list(agreed_concepts_set))
    q_matrix = np.zeros((len(docs_id), len(concept_idx)))

    for q_idx, question_id in enumerate(docs_id):
        used_concepts = agreed_concepts[question_id]
        # Boolean mask over the columns whose concept was agreed on for this solution
        q_matrix[q_idx, np.isin(concept_idx, used_concepts)] = 1
    return q_matrix

def transform_data(q_matrix, q_matrix_hat):
    """Compare a reference Q-matrix with an estimated one through their row similarities.

    Args:
        q_matrix: reference Q-matrix, one row per question.
        q_matrix_hat: estimated Q-matrix with the same number of rows.

    Returns:
        dict with the pairwise cosine similarity and cosine distance matrices of
        both inputs (``question_*`` and ``question_hat_*``) plus three scalars
        computed from the similarity difference: ``error`` (square root of the
        summed squared differences), ``rmse`` (square root of their mean) and
        ``cmd`` (one minus the trace of the similarity product divided by the
        product of the two matrix norms).
    """
    # Pairwise similarities / distances among the rows of each matrix
    sim = cosine_similarity(q_matrix)
    dist = cosine_distances(q_matrix)
    sim_hat = cosine_similarity(q_matrix_hat)
    dist_hat = cosine_distances(q_matrix_hat)

    # Element-wise squared difference, shared by the total error and the RMSE
    squared_error = np.power(sim - sim_hat, 2)

    return {
        'question_similarity': sim,
        'question_distance': dist,
        'question_hat_similarity': sim_hat,
        'question_hat_distance': dist_hat,
        'error': np.sqrt(np.sum(squared_error)),
        'rmse': np.sqrt(np.mean(squared_error)),
        'cmd': 1-np.trace(np.dot(sim, sim_hat))/(np.linalg.norm(sim)*np.linalg.norm(sim_hat)),
    }

def calculate_anosim(item, k_values=range(2, 40), permutations=9999):
    """Run ANOSIM on the reference distances for a sweep of cluster counts.

    For every k in `k_values`, the estimated distance matrix is clustered with
    complete-linkage agglomerative clustering and the resulting labels are used
    as the grouping for ANOSIM on the reference distance matrix.

    Args:
        item: dict holding square distance matrices under 'question_distance'
            (reference) and 'question_hat_distance' (estimated).
        k_values: cluster counts to try; defaults to 2..39 as before.
        permutations: number of ANOSIM permutations per k.

    Returns:
        dict with three keys: 'Statistical significant (p < 0.1)' (bool),
        'Agg Concepts' (array of the k values with p < 0.1, or '--') and
        'R Statistic' ("<best R> at <k>" among the significant results, or '--').
    """
    dm = DistanceMatrix(item['question_distance'])
    k_values = list(k_values)
    stats_list = []
    p_values = []
    for k in k_values:
        # NOTE(review): newer scikit-learn releases rename `affinity` to `metric`;
        # confirm against the installed version before upgrading.
        model = AgglomerativeClustering(n_clusters=k,
                                        affinity='precomputed',
                                        linkage='complete').fit(item['question_hat_distance'])
        stats = anosim(dm, model.labels_, permutations=permutations)
        stats_list.append(stats['test statistic'])
        p_values.append(stats['p-value'])
    stats_list = np.asarray(stats_list)
    p_values = np.asarray(p_values)
    k_array = np.asarray(k_values)

    # Test the mask itself: np.any(np.where(mask)) looks at the *indices*, so a
    # sweep whose only significant result is the first k was reported as False.
    significant = p_values < 0.1

    row = {}
    if np.any(significant):
        row['Statistical significant (p < 0.1)'] = True
        row['Agg Concepts'] = k_array[significant]
        # Pick one best index among the significant results only; argmax returns a
        # scalar even on ties, so the %d format never receives an array.
        sig_idx = np.flatnonzero(significant)
        best = sig_idx[np.argmax(stats_list[sig_idx])]
        row['R Statistic'] = "%.2f at %d" % (stats_list[best], k_array[best])
    else:
        row['Statistical significant (p < 0.1)'] = False
        row['Agg Concepts'] = '--'
        row['R Statistic'] = '--'
    return row

### Retrieve original Q-Matrix

In [31]:
%%time
q_matrix_original = get_original_q_matrix()
q_matrix_original.shape

CPU times: user 166 ms, sys: 4.03 ms, total: 170 ms
Wall time: 7.15 s


(54, 14)

In [32]:
# Save the original Q-matrix (built from the evaluators' voted concepts),
# both as a pickle and as a comma-separated text file
with open('data/tese/q_matrix_original.pkl', 'wb') as pklfile:
    pickle.dump(q_matrix_original, pklfile)
np.savetxt("data/tese/q_matrix_original.csv", q_matrix_original, delimiter=",")

### Retrieve automated Q-Matrix

In [33]:
data = []

In [34]:
%%time
# Rebuild the features and the stored model of one experiment, then use the model
# to produce the automated Q-matrix for the loaded solutions.
exp_id = 26
cols = ["vectorizer", "min_df", "is_binary", "model"]
table = "experiments_solution"
# Only the first row returned for this experiment is used.
where_items = get_where_items(exp_id, cols, table)[0]
# SECURITY NOTE(review): eval() executes whatever string is stored in the
# `vectorizer` column, so this is only safe while the experiments table is trusted.
# Presumably the column holds a vectorizer class name - confirm before replacing
# eval with an explicit lookup.
v = eval(where_items[0])
m = where_items[1]
b = where_items[2]
vectorizer_params={'ngram_range': (1,3)}
train_data_features, vectorizer, _ = create_bag_of_words(solutions, v, binary=b, min_df=m, 
                                                         vectorizer_params=vectorizer_params)

model = where_items[3]
# SECURITY NOTE(review): the model is stored base64-encoded and unpickled here;
# unpickling can run arbitrary code, so the database content must be trusted.
model_db = pickle.loads(base64.b64decode(model))
q_matrix_automated = model_db.transform(train_data_features)
# word_topic = model_db.components_.T
# NOTE: this appends on every execution, so re-running the cell adds a duplicate
# entry to `data`; re-run the `data = []` cell first.
data.append(transform_data(q_matrix_original, q_matrix_automated))
q_matrix_automated.shape

CPU times: user 51 ms, sys: 0 ns, total: 51 ms
Wall time: 2.16 s


(54, 12)

In [40]:
# Save the training features returned by create_bag_of_words
with open('data/tese/train_data_features.pkl', 'wb') as pklfile:
    pickle.dump(train_data_features, pklfile)

In [35]:
# Save questions ids
with open('data/tese/questions_idx.pkl', 'wb') as pklfile:
    pickle.dump(questions_idx, pklfile)

In [36]:
# Save automated Q-matrix
with open('data/tese/q_matrix_automated.pkl', 'wb') as pklfile:
    pickle.dump(q_matrix_automated, pklfile)

### Retrieve refined Q-Matrix (automated after analysis)
- 1 concept per solution

In [12]:
# One-hot encode the cluster of each solution: one row per solution (same order
# as docs_id / clusters), one column per distinct cluster id.
# Column order follows the iteration order of set(clusters).
cluster_set = list(set(clusters))
q_matrix_analysis = np.zeros((len(docs_id), len(set(clusters))))
for idx, cluster in enumerate(clusters):
    # Mark the single column that corresponds to this solution's cluster
    q_matrix_analysis[idx, cluster_set.index(cluster)]= 1

# NOTE: this appends on every execution, so re-running the cell adds a duplicate
# entry to `data`.
data.append(transform_data(q_matrix_original, q_matrix_analysis))

In [13]:
# Save analyzed Q-matrix
with open('data/tese/q_matrix_refined.pkl', 'wb') as pklfile:
    pickle.dump(q_matrix_analysis, pklfile)

In [14]:
%%time
# Run the ANOSIM sweep for every Q-matrix comparison stored in `data`.
# NOTE: despite its name, `df` is a plain list of row dicts here; it is only
# turned into a DataFrame (`results`) in a later cell.
df = []
for item in data:
    row = calculate_anosim(item)
    df.append(row)

CPU times: user 1min 20s, sys: 86.9 ms, total: 1min 20s
Wall time: 1min 21s


In [15]:
results = pd.DataFrame(df)

In [16]:
results

Unnamed: 0,Statistical significant (p < 0.1),Agg Concepts,R Statistic
0,True,"[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...",0.45 at 36
1,True,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0.28 at 26


In [17]:
%%time
# Manual ANOSIM sweep for the first comparison in `data`, over k = 2..49.
# stats_list / p_values stay as globals because the following cells inspect them.
item = data[0]
dm = DistanceMatrix(item['question_distance'])
row = {}
stats_list = []
p_values = []
for i in range(2, 50):
    model = AgglomerativeClustering(n_clusters=i, 
                                    affinity='precomputed',
                                    linkage='complete').fit(item['question_hat_distance'].T)
    stats = anosim(dm, model.labels_, permutations=9999)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])
stats_list = np.asarray(stats_list)
p_values = np.asarray(p_values)

# Test the mask itself: np.any(np.where(mask)) looks at the *indices*, so a sweep
# whose only significant result is at index 0 (k = 2) was reported as False.
significant = p_values < 0.1
if np.any(significant):
    row['Statistical significant (p < 0.1)'] = True
    row['Agg Concepts'] = np.where(significant)[0]+2
    sig_stats = stats_list[significant]
    # Best index among the significant results only; argmax yields a scalar even
    # on ties, so the %d format never receives an array.
    sig_idx = np.flatnonzero(significant)
    best_idx = sig_idx[np.argmax(sig_stats)]
    row['R Statistic'] = "%.2f at %d" % (stats_list[best_idx], best_idx + 2)
else:
    row['Statistical significant (p < 0.1)'] = False
    row['Agg Concepts'] = '--'
    row['R Statistic'] = '--'

CPU times: user 56.6 s, sys: 88 ms, total: 56.7 s
Wall time: 56.7 s


In [18]:
stats_list

array([-0.09239082,  0.00778208, -0.01370905,  0.07522632,  0.09214763,
        0.11787716,  0.14372703,  0.14617127,  0.15467822,  0.18518857,
        0.18560334,  0.19081734,  0.17799965,  0.17821524,  0.19632512,
        0.18886006,  0.19639434,  0.19478641,  0.20161662,  0.2031313 ,
        0.25118278,  0.25021716,  0.28054846,  0.29420365,  0.30536793,
        0.30152154,  0.31298456,  0.3505234 ,  0.3383922 ,  0.34301988,
        0.36702849,  0.35814844,  0.34904215,  0.36677037,  0.4510502 ,
        0.44531117,  0.34797782,  0.35420731,  0.3891619 ,  0.37883089,
        0.35393448,  0.34367382,  0.29615927,  0.28457106,  0.21653765,
        0.056922  ,  0.10002006,  0.19312763])

In [19]:
p_values

array([8.332e-01, 4.000e-01, 5.777e-01, 3.920e-02, 1.950e-02, 6.100e-03,
       1.200e-03, 2.600e-03, 1.700e-03, 8.000e-04, 6.000e-04, 1.200e-03,
       1.700e-03, 3.500e-03, 2.000e-03, 2.000e-03, 1.800e-03, 3.100e-03,
       3.300e-03, 2.500e-03, 8.000e-04, 1.400e-03, 3.000e-04, 3.000e-04,
       3.000e-04, 5.000e-04, 1.000e-04, 2.000e-04, 3.000e-04, 7.000e-04,
       4.000e-04, 6.000e-04, 1.000e-03, 7.000e-04, 1.000e-04, 5.000e-04,
       2.800e-03, 6.900e-03, 3.400e-03, 5.700e-03, 9.600e-03, 1.910e-02,
       4.510e-02, 6.280e-02, 1.331e-01, 3.873e-01, 3.165e-01, 2.244e-01])

In [20]:
np.where(p_values  < 0.1)

(array([ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
        37, 38, 39, 40, 41, 42, 43]),)

In [21]:
stats_list[np.where(p_values  < 0.1)]

array([0.07522632, 0.09214763, 0.11787716, 0.14372703, 0.14617127,
       0.15467822, 0.18518857, 0.18560334, 0.19081734, 0.17799965,
       0.17821524, 0.19632512, 0.18886006, 0.19639434, 0.19478641,
       0.20161662, 0.2031313 , 0.25118278, 0.25021716, 0.28054846,
       0.29420365, 0.30536793, 0.30152154, 0.31298456, 0.3505234 ,
       0.3383922 , 0.34301988, 0.36702849, 0.35814844, 0.34904215,
       0.36677037, 0.4510502 , 0.44531117, 0.34797782, 0.35420731,
       0.3891619 , 0.37883089, 0.35393448, 0.34367382, 0.29615927,
       0.28457106])

In [22]:
# Aliases handed to the next %%R cell through its -i arguments:
# the automated Q-matrix is the estimate, the original one is the reference.
q_matrix_hat = q_matrix_automated
q_matrix = q_matrix_original
q_matrix_hat.shape

(54, 12)

In [23]:
%%R -i q_matrix -i q_matrix_hat -o anosim_data
# ANOSIM sweep: cluster the estimated Q-matrix (Ward linkage on Euclidean
# distances), cut the tree at k = 2..15 groups and test each grouping against
# the Jaccard distances of the reference Q-matrix.

# Fixed seed so the permutation p-values are reproducible between runs.
set.seed(42)

# Namespaces are spelled out because proxy masks stats::dist and dplyr masks
# stats::filter; the result no longer depends on library load order.
d <- proxy::dist(q_matrix_hat, method = "euclidean")
d_original <- proxy::dist(q_matrix, method = "jaccard")
res.hc <- hclust(d = d, method = "ward.D2")

# Preallocate the result vectors instead of growing them inside the loop.
ks <- 2:15
R_values <- numeric(length(ks))
signif_values <- numeric(length(ks))

for (i in seq_along(ks))
{
  k <- ks[i]
  print(k)
  clusterCut <- cutree(res.hc, k)
  a <- vegan::anosim(d_original, clusterCut, permutations = 10000)
  R_values[i] <- a$statistic
  signif_values[i] <- a$signif
}

# Exported to Python through the cell's -o argument.
anosim_data <- data.frame(k = ks, R = R_values, signif = signif_values)
anosim_data %>% dplyr::filter(signif < 0.01)

[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
    k         R     signif
1   5 0.1409537 0.00109989
2   6 0.1225760 0.00699930
3   7 0.1710049 0.00039996
4   8 0.1910186 0.00019998
5   9 0.1936974 0.00049995
6  10 0.1960702 0.00039996
7  11 0.1982769 0.00029997
8  12 0.2064476 0.00079992
9  13 0.2064535 0.00109989
10 14 0.2067765 0.00119988
11 15 0.1874218 0.00299970


In [24]:
anosim_automated = anosim_data
anosim_automated

Unnamed: 0,k,R,signif
1,2,0.004897,0.447755
2,3,0.032715,0.167183
3,4,-0.017161,0.632037
4,5,0.140954,0.0011
5,6,0.122576,0.006999
6,7,0.171005,0.0004
7,8,0.191019,0.0002
8,9,0.193697,0.0005
9,10,0.19607,0.0004
10,11,0.198277,0.0003


In [25]:
# Aliases handed to the next %%R cell through its -i arguments:
# the refined (one cluster per solution) Q-matrix is the estimate, the original
# one is the reference.
q_matrix_hat = q_matrix_analysis
q_matrix = q_matrix_original
q_matrix_hat.shape

(54, 5)

In [26]:
%%R -i q_matrix -i q_matrix_hat -o anosim_data
# ANOSIM sweep: cluster the estimated Q-matrix (Ward linkage on Euclidean
# distances), cut the tree at k = 2..15 groups and test each grouping against
# the Jaccard distances of the reference Q-matrix.

# Fixed seed so the permutation p-values are reproducible between runs.
set.seed(42)

# Namespaces are spelled out because proxy masks stats::dist and dplyr masks
# stats::filter; the result no longer depends on library load order.
d <- proxy::dist(q_matrix_hat, method = "euclidean")
d_original <- proxy::dist(q_matrix, method = "jaccard")
res.hc <- hclust(d = d, method = "ward.D2")

# Preallocate the result vectors instead of growing them inside the loop.
ks <- 2:15
R_values <- numeric(length(ks))
signif_values <- numeric(length(ks))

for (i in seq_along(ks))
{
  k <- ks[i]
  print(k)
  clusterCut <- cutree(res.hc, k)
  a <- vegan::anosim(d_original, clusterCut, permutations = 10000)
  R_values[i] <- a$statistic
  signif_values[i] <- a$signif
}

# Exported to Python through the cell's -o argument.
anosim_data <- data.frame(k = ks, R = R_values, signif = signif_values)
anosim_data %>% dplyr::filter(signif < 0.01)

[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13
[1] 14
[1] 15
    k         R     signif
1   3 0.1392252 0.00049995
2   4 0.1263954 0.00159984
3   5 0.1180005 0.00599940
4   6 0.1177108 0.00689931
5   7 0.1209506 0.00449955
6   8 0.1265794 0.00399960
7   9 0.1258535 0.00559944
8  10 0.1370063 0.00559944
9  11 0.1464021 0.00359964
10 12 0.1402808 0.00439956
11 13 0.1387233 0.00829917
12 14 0.1578091 0.00469953
13 15 0.1516825 0.00369963
