In [207]:
import pandas as pd
import pickle
from tokenizer import create_bag_of_words
import psycopg2
from sklearn.feature_extraction.text import CountVectorizer
import base64
from sklearn.cluster import KMeans
import time
from collections import defaultdict
from vectorizer import NCutVectorizer
import numpy as np

In [208]:
## Cleaning database
last_id = 132
# problems = Problem.objects.filter(id__gt=last_id)
# # solutions_obj = Solution.objects.filter(problem__in=problems).update(ignore=True)
# print("Problems to be ignored: %d" % problems.count())

problems = Problem.objects.filter(id__lte=last_id)
# problems = Problem.objects.all()
print("Problems to be used: %d" % problems.count())

solutions_obj = Solution.objects.filter(problem__in=problems, ignore=False).order_by('id')
# solutions_obj = Solution.objects.all().order_by('id')
print("Solutions to be used: %d" % solutions_obj.count())

docs_id = []
questions = []
solutions = []

# Fill separated structures
for sol in solutions_obj:
    docs_id.append(sol.id)
    questions.append(sol.problem.content)
    solutions.append(sol.content)

print("Got %d documents" %(solutions_obj.count()))

Problems to be used: 132
Solutions to be used: 54
Got 54 documents


In [209]:
%%time 
# Insert to DB
connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                          password = settings.DATABASES["default"]["PASSWORD"],
                          host = settings.DATABASES["default"]["HOST"],
                          port = settings.DATABASES["default"]["PORT"],
                          database = settings.DATABASES["default"]["NAME"])
connection.autocommit=True
cursor = connection.cursor()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 796 ms


In [210]:
def get_where_items(exp_id, cols, table):
    query = "SELECT %s from %s where experiment_id = %s" % (", ".join(cols), table, exp_id) 
    cursor.execute(query)
    where_items = cursor.fetchall()
    return where_items

def get_model_properties(model_pickle, data_features):
    model_db = pickle.loads(base64.b64decode(model_pickle))
    document_topic = model_db.transform(data_features)
    document_clusters = document_topic.argmax(axis=1)
    return document_clusters, model_db.components_.T

def calculate_agreement(clusters):
    cluster_list = []
    for _, row in df_75.iterrows():
        idx = np.argwhere(np.asarray(docs_id) == row['solution'])
        cluster_list.append(clusters[idx[0][0]])
    df = pd.concat([df_75, pd.Series(cluster_list)], axis=1)
    df.columns=['solution', 'concept', 'cluster']
    # Solutions per cluster
    count_df = df.groupby('cluster')['solution'].nunique()
    # Cluster per concept
    max_df = df.groupby(['cluster', 'concept']).count().max(level=0)
    return (max_df.sum()/count_df.sum()).tolist()[0]

In [211]:
concepts = SolutionConcept.objects.all()

solution_100_df = []
solution_75_df = []
# Calculate concepts for each solution
for idx_solution, sol in enumerate(solutions_obj):
        concept_solution = concepts.filter(solution=sol)
        user_count = concept_solution.values_list('solution', 'user__first_name').annotate(count=Count(
            'user', distinct=True)).count()
        count_per_problem = concept_solution.values_list('concept__label', 'solution').annotate(count=Count('concept'))
        allowed_100_concepts = count_per_problem.filter(count__gte=user_count).values_list('concept', flat=True)
        allowed_75_concepts = count_per_problem.filter(count__gte=user_count*0.75).values_list('concept', flat=True)
        for c in allowed_100_concepts:
            solution_100_df.append([sol.pk, c])
        for c in allowed_75_concepts:
            solution_75_df.append([sol.pk, c])
            
df_100 = pd.DataFrame(solution_100_df, columns=['solution', 'concept'])
df_75 = pd.DataFrame(solution_75_df, columns=['solution', 'concept'])

In [212]:
agreement = defaultdict(list)

### Agreement for LDA with augmented tokenizer

In [213]:
exp_id = 26
cols = ["vectorizer", "min_df", "is_binary", "method", "k"]
table = "experiments_solution"
where_items = get_where_items(exp_id, cols, table)[0]
v = eval(where_items[0])
m = where_items[1]
b = where_items[2]
vectorizer_params={'ngram_range': (1,3)}
train_data_features, vectorizer, _ = create_bag_of_words(solutions, v, binary=b, min_df=m, 
                                                         vectorizer_params=vectorizer_params)

In [214]:
select_query = "select model, " + ", ".join(cols) + """ coherence_med5, coherence_med10, coherence_med15 
FROM EXPERIMENTS_2020_02_12 where vectorizer = %s  and min_df = '%s' and is_binary = %s and
method = %s and k = %s and ngrams[1] = 1 and ngrams[2] = 3"""
query = cursor.mogrify(select_query, where_items)
query

b"select model, vectorizer, min_df, is_binary, method, k coherence_med5, coherence_med10, coherence_med15 \nFROM EXPERIMENTS_2020_02_12 where vectorizer = 'CountVectorizer'  and min_df = '0.05' and is_binary = true and\nmethod = 'lda' and k = 12 and ngrams[1] = 1 and ngrams[2] = 3"

In [215]:
cursor.execute(query)
response = cursor.fetchall()

key = 'lda'
for idx, row in enumerate(response):
    document_clusters, word_topic = get_model_properties(row[0], train_data_features)
    agreement_value = calculate_agreement(document_clusters)
    agreement[key].append(agreement_value)
    print("%d - %.2f (%.2f)" % (idx, np.mean(agreement[key]), np.std(agreement[key])))

0 - 0.62 (0.00)
1 - 0.67 (0.05)
2 - 0.67 (0.04)
3 - 0.67 (0.04)
4 - 0.69 (0.05)
5 - 0.69 (0.04)
6 - 0.70 (0.04)
7 - 0.70 (0.04)
8 - 0.70 (0.04)
9 - 0.70 (0.04)
10 - 0.69 (0.03)
11 - 0.69 (0.03)
12 - 0.69 (0.04)
13 - 0.69 (0.04)
14 - 0.70 (0.05)
15 - 0.69 (0.05)
16 - 0.69 (0.05)
17 - 0.69 (0.05)
18 - 0.69 (0.04)
19 - 0.69 (0.04)
20 - 0.69 (0.04)
21 - 0.69 (0.04)
22 - 0.69 (0.05)
23 - 0.69 (0.04)
24 - 0.69 (0.04)
25 - 0.69 (0.04)
26 - 0.69 (0.04)
27 - 0.69 (0.04)
28 - 0.69 (0.04)
29 - 0.69 (0.04)
30 - 0.69 (0.04)
31 - 0.69 (0.04)
32 - 0.69 (0.04)
33 - 0.69 (0.05)
34 - 0.69 (0.05)
35 - 0.69 (0.04)
36 - 0.69 (0.04)
37 - 0.68 (0.05)
38 - 0.68 (0.05)
39 - 0.68 (0.05)
40 - 0.68 (0.05)
41 - 0.68 (0.05)
42 - 0.69 (0.05)
43 - 0.69 (0.04)
44 - 0.69 (0.05)
45 - 0.69 (0.05)
46 - 0.69 (0.05)
47 - 0.69 (0.05)
48 - 0.69 (0.05)
49 - 0.69 (0.05)
50 - 0.69 (0.05)
51 - 0.69 (0.05)
52 - 0.69 (0.05)
53 - 0.69 (0.05)
54 - 0.69 (0.04)
55 - 0.69 (0.04)
56 - 0.69 (0.04)
57 - 0.69 (0.04)
58 - 0.69 (0.04)
59 - 0.

### Agreement for KMeans with augmented tokenizer

In [205]:
key = 'kmeans'
agreement[key] = []
for idx in range(100):
    start = time.time()
    model = KMeans(n_clusters=5)
    clusters_train = model.fit_predict(train_data_features)
    agreement_value = calculate_agreement(clusters_train)
    agreement[key].append(agreement_value)
    print("%d - %.2f (%.2f)" % (idx, np.mean(agreement[key]), np.std(agreement[key])))

0 - 0.72 (0.00)
1 - 0.73 (0.01)
2 - 0.74 (0.02)
3 - 0.74 (0.01)
4 - 0.74 (0.01)
5 - 0.74 (0.02)
6 - 0.74 (0.02)
7 - 0.74 (0.01)
8 - 0.74 (0.01)
9 - 0.74 (0.02)
10 - 0.74 (0.02)
11 - 0.73 (0.02)
12 - 0.74 (0.02)
13 - 0.74 (0.02)
14 - 0.73 (0.03)
15 - 0.74 (0.03)
16 - 0.74 (0.03)
17 - 0.73 (0.03)
18 - 0.73 (0.03)
19 - 0.73 (0.03)
20 - 0.74 (0.03)
21 - 0.73 (0.03)
22 - 0.74 (0.03)
23 - 0.74 (0.03)
24 - 0.74 (0.03)
25 - 0.74 (0.03)
26 - 0.74 (0.03)
27 - 0.74 (0.03)
28 - 0.74 (0.03)
29 - 0.74 (0.03)
30 - 0.74 (0.03)
31 - 0.74 (0.03)
32 - 0.74 (0.03)
33 - 0.74 (0.03)
34 - 0.74 (0.03)
35 - 0.74 (0.03)
36 - 0.74 (0.03)
37 - 0.74 (0.03)
38 - 0.74 (0.03)
39 - 0.74 (0.03)
40 - 0.74 (0.03)
41 - 0.74 (0.03)
42 - 0.74 (0.03)
43 - 0.74 (0.03)
44 - 0.74 (0.03)
45 - 0.74 (0.03)
46 - 0.74 (0.03)
47 - 0.74 (0.03)
48 - 0.74 (0.03)
49 - 0.74 (0.03)
50 - 0.74 (0.03)
51 - 0.74 (0.03)
52 - 0.74 (0.03)
53 - 0.74 (0.03)
54 - 0.74 (0.03)
55 - 0.74 (0.03)
56 - 0.74 (0.03)
57 - 0.74 (0.03)
58 - 0.74 (0.03)
59 - 0.

### Agreement for NMF with augmented tokenizer

In [186]:
exp_id = 1463
cols = ["vectorizer", "min_df", "is_binary", "method", "k"]
table = "experiments_solution"
where_items = get_where_items(exp_id, cols, table)[0]
v = eval(where_items[0])
m = where_items[1]
b = where_items[2]
vectorizer_params={'ngram_range': (1,3)}
train_data_features, vectorizer, _ = create_bag_of_words(solutions, v, binary=b, min_df=m, vectorizer_params=vectorizer_params)

In [187]:
select_query = "select model, " + ", ".join(cols) + """ coherence_med5, coherence_med10, coherence_med15 
FROM EXPERIMENTS_2020_02_12 where vectorizer = %s  and min_df = '%s' and is_binary = %s and
method = %s and k = %s and ngrams[1] = 1 and ngrams[2] = 3"""
query = cursor.mogrify(select_query, where_items)
query

b"select model, vectorizer, min_df, is_binary, method, k coherence_med5, coherence_med10, coherence_med15 \nFROM EXPERIMENTS_2020_02_12 where vectorizer = 'NCutVectorizer'  and min_df = '0.35' and is_binary = true and\nmethod = 'nmf' and k = 7 and ngrams[1] = 1 and ngrams[2] = 3"

In [188]:
cursor.execute(query)
response = cursor.fetchall()

key = 'nmf'
for idx, row in enumerate(response):
    document_clusters, word_topic = get_model_properties(row[0], train_data_features)
    agreement_value = calculate_agreement(document_clusters)
    agreement[key].append(agreement_value)
    print("%d - %.2f (%.2f)" % (idx, np.mean(agreement[key]), np.std(agreement[key])))

0 - 0.62 (0.00)
1 - 0.69 (0.07)
2 - 0.65 (0.07)
3 - 0.68 (0.08)
4 - 0.67 (0.08)
5 - 0.64 (0.10)
6 - 0.63 (0.09)
7 - 0.63 (0.09)
8 - 0.63 (0.08)
9 - 0.63 (0.08)
10 - 0.62 (0.08)
11 - 0.64 (0.09)
12 - 0.65 (0.09)
13 - 0.65 (0.09)
14 - 0.65 (0.09)
15 - 0.65 (0.09)
16 - 0.64 (0.09)
17 - 0.64 (0.08)
18 - 0.65 (0.09)
19 - 0.65 (0.09)
20 - 0.65 (0.09)
21 - 0.65 (0.09)
22 - 0.65 (0.09)
23 - 0.65 (0.09)
24 - 0.65 (0.09)
25 - 0.66 (0.09)
26 - 0.66 (0.09)
27 - 0.66 (0.09)
28 - 0.67 (0.09)
29 - 0.67 (0.09)
30 - 0.67 (0.09)
31 - 0.67 (0.09)
32 - 0.67 (0.09)
33 - 0.67 (0.09)
34 - 0.66 (0.09)
35 - 0.67 (0.09)
36 - 0.66 (0.09)
37 - 0.66 (0.09)
38 - 0.66 (0.09)
39 - 0.66 (0.09)
40 - 0.66 (0.09)
41 - 0.66 (0.08)
42 - 0.66 (0.08)
43 - 0.65 (0.09)
44 - 0.65 (0.09)
45 - 0.65 (0.09)
46 - 0.65 (0.09)
47 - 0.65 (0.08)
48 - 0.65 (0.08)
49 - 0.65 (0.08)
50 - 0.65 (0.08)
51 - 0.65 (0.08)
52 - 0.65 (0.08)
53 - 0.65 (0.08)
54 - 0.65 (0.08)
55 - 0.65 (0.08)
56 - 0.65 (0.08)
57 - 0.65 (0.08)
58 - 0.65 (0.08)
59 - 0.

### Agreement for regular tokenizer

In [189]:
def create_regular_bag_of_words(docs, vectorizer_method, binary=False, min_df=0.2, vectorizer_params=None):
    vectorizer = vectorizer_method(analyzer = 'word',
                                   binary=binary,
                                   min_df=min_df,
                                   **vectorizer_params)
    train_data_features = vectorizer.fit_transform(docs)
    try:
        train_data_features = train_data_features.toarray()
    # It's already an array
    except AttributeError:
        pass
    return train_data_features, vectorizer, vectorizer.get_feature_names()

In [191]:
agreement

defaultdict(list,
            {'lda': [0.6226415094339622,
              0.7169811320754716,
              0.660377358490566,
              0.6981132075471698,
              0.7547169811320755,
              0.6981132075471698,
              0.7358490566037735,
              0.6792452830188679,
              0.7169811320754716,
              0.6792452830188679,
              0.6792452830188679,
              0.6981132075471698,
              0.6037735849056604,
              0.7735849056603774,
              0.7547169811320755,
              0.6226415094339622,
              0.6792452830188679,
              0.6981132075471698,
              0.6792452830188679,
              0.6792452830188679,
              0.7547169811320755,
              0.6415094339622641,
              0.6226415094339622,
              0.6981132075471698,
              0.6415094339622641,
              0.7169811320754716,
              0.6981132075471698,
              0.6415094339622641,
              0.73584905

In [206]:
regular_query = """select vectorizer, min_df, is_binary, ngrams, method, k FROM EXPERIMENTS_REGULAR_2020_02_18 
group by  vectorizer, min_df, is_binary, ngrams, method, k
having avg(coherence_med5) <> 'NaN'
order by avg(coherence_med5) desc """

cursor.execute(regular_query)
where_items = cursor.fetchall()

i = 0
item = where_items[i]
agreement['regular_%d' % i] = []
#for i, item in enumerate(where_items):
print(i, item)
v = eval(item[0])
m = item[1]
b = item[2]
vectorizer_params={'ngram_range': item[3]}
#k = item[5]
k = 5
train_data_features, vectorizer, _ = create_regular_bag_of_words(solutions, v, binary=b, min_df=m,
                                                                 vectorizer_params=vectorizer_params)

select_query = "select model, " + ", ".join(cols) + """, coherence_med5, coherence_med10, coherence_med15 
FROM EXPERIMENTS_REGULAR_2020_02_04 where vectorizer = %s  and min_df = '%s' and is_binary = %s and
method = %s and k = %s and ngrams[1] = %s and ngrams[2] = %s"""
query = cursor.mogrify(select_query, (item[0], m, b, item[4], k, item[3][0], item[3][1]))
cursor.execute(query)
response = cursor.fetchall()

key = 'regular_%d' % i

for idx, row in enumerate(response):
    document_clusters, word_topic = get_model_properties(row[0], train_data_features)
    agreement_value = calculate_agreement(document_clusters)
    agreement[key].append(agreement_value)
    print("%d - %.2f (%.2f)" % (idx, np.mean(agreement[key]), np.std(agreement[key])))

0 ('CountVectorizer', 0.1, True, [1, 3], 'lda', 15)
0 - 0.66 (0.00)
1 - 0.62 (0.04)
2 - 0.62 (0.03)
3 - 0.64 (0.05)
4 - 0.63 (0.05)
5 - 0.63 (0.04)
6 - 0.63 (0.04)
7 - 0.64 (0.05)
8 - 0.64 (0.05)
9 - 0.64 (0.05)


In [200]:
from scipy import stats
result = stats.mannwhitneyu(agreement["lda"], agreement["regular_0"])
result

MannwhitneyuResult(statistic=886.0, pvalue=2.422635480303876e-24)

In [201]:
from scipy import stats
result = stats.mannwhitneyu(agreement["lda"], agreement["nmf"])
result

MannwhitneyuResult(statistic=3522.0, pvalue=0.00014537307237890104)