In [294]:
# Helpers
import pickle
import numpy as np
import pandas as pd

# DB 
import psycopg2
from django.conf import settings

# Learning
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from skbio.stats.distance import anosim
from skbio import DistanceMatrix

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

In [295]:
# Connect to the PostgreSQL database described by Django's "default" entry;
# each psycopg2 keyword is read from the matching upper-case settings key.
connection = psycopg2.connect(**{
    keyword: settings.DATABASES["default"][setting_key]
    for keyword, setting_key in (
        ("user", "USER"),
        ("password", "PASSWORD"),
        ("host", "HOST"),
        ("port", "PORT"),
        ("database", "NAME"),
    )
})
# Autocommit is switched on for this connection.
connection.autocommit = True
# Module-level cursor; the query helpers in this notebook read it as a global.
cursor = connection.cursor()

In [304]:
def get_fdtf_exps(concepts_min, concepts_max, att):
    """Fetch stored FDTF experiment results for a range of concept counts.

    Two queries are issued through the module-level ``cursor``. The first
    selects the ids of ``method='fdtf'`` experiments (``mu = '0.1'``,
    ``dataset = 'run_all'``) whose ``concepts`` lies in
    ``[concepts_min, concepts_max]`` and whose ``attempts_train`` equals
    ``att``. The second loads the result columns for exactly those ids.

    Parameters
    ----------
    concepts_min, concepts_max
        Inclusive bounds applied to the ``concepts`` column.
    att
        Value matched against the ``attempts_train`` column.

    Returns
    -------
    list of tuple
        One ``(X, q_matrix, q_matrix_hat, concepts, method, experiment_id,
        attempts_train, train_rmse, test_rmse)`` row per matching experiment,
        or an empty list when no experiment matches.
    """
    # Get experiment ids based on condition
    id_query = """select experiment_id from edm2020_2020_02_19 
where method='fdtf' and concepts >= %s and concepts <= %s and mu = '0.1' and attempts_train = %s
and dataset = 'run_all'  order by concepts"""
    cursor.execute(id_query, (concepts_min, concepts_max, att))
    experiment_ids = tuple(row[0] for row in cursor.fetchall())

    # psycopg2 renders an empty tuple as "()", and "IN ()" is a syntax error in
    # PostgreSQL, so there is nothing valid to run when no id matched.
    if not experiment_ids:
        return []

    # Get experiments results
    results_query = """select X, q_matrix, q_matrix_hat, concepts, method, experiment_id, attempts_train,
    train_rmse, test_rmse
    from EDM2020_2020_02_19 where experiment_id in %s """
    cursor.execute(results_query, (experiment_ids,))
    return cursor.fetchall()

def get_nmf_exps(concepts, alpha_min, alpha_max, att):
    """Fetch stored NMF experiment results for one concept count and an alpha range.

    Two queries are issued through the module-level ``cursor``. The first
    selects the ids of ``method='nmf2'`` experiments (``l1_ratio = 0``,
    ``dataset = 'run_all'``) with ``concepts`` equal to ``concepts``,
    ``alpha`` in ``[alpha_min, alpha_max]`` and ``attempts_train`` equal to
    ``att``. The second loads the result columns for exactly those ids.

    Parameters
    ----------
    concepts
        Value matched against the ``concepts`` column.
    alpha_min, alpha_max
        Inclusive bounds applied to the ``alpha`` column.
    att
        Value matched against the ``attempts_train`` column.

    Returns
    -------
    list of tuple
        One ``(sp, q_matrix, q_matrix_hat, concepts, method, experiment_id,
        attempts_train, train_rmse, test_rmse)`` row per matching experiment,
        or an empty list when no experiment matches.
    """
    # Get experiment ids based on condition
    id_query = """select experiment_id from edm2020_2020_02_19 
where method='nmf2' and concepts = %s and alpha >= %s and alpha <= %s and l1_ratio = 0 and 
attempts_train = %s and dataset = 'run_all' order by concepts"""
    cursor.execute(id_query, (concepts, alpha_min, alpha_max, att))
    experiment_ids = tuple(row[0] for row in cursor.fetchall())

    # psycopg2 renders an empty tuple as "()", and "IN ()" is a syntax error in
    # PostgreSQL, so there is nothing valid to run when no id matched.
    if not experiment_ids:
        return []

    # Get experiments results
    results_query = """select sp, q_matrix, q_matrix_hat, concepts, method, experiment_id, attempts_train,
    train_rmse, test_rmse
    from EDM2020_2020_02_19 where experiment_id in %s """
    cursor.execute(results_query, (experiment_ids,))
    return cursor.fetchall()

def _question_similarity_record(row, find_sparse_questions):
    """Build the Q-matrix comparison record for one experiment row.

    Parameters
    ----------
    row
        9-tuple ``(activity, q_matrix, q_matrix_hat, concepts, method,
        experiment_id, attempts, train_rmse, test_rmse)``.
    find_sparse_questions
        Callable that receives ``activity`` as an ndarray and returns the
        question indices (columns of ``q_matrix``) to drop.

    Returns
    -------
    dict
        The experiment metadata, both Q-matrices, the question-by-question
        cosine similarity/distance matrices of each, and three scalar
        summaries of how far the two similarity matrices are apart
        (``error``, ``rmse``, ``cmd``).
    """
    (activity, q_matrix, q_matrix_hat, concepts, method, experiment_id,
     attempts, train_rmse, test_rmse) = row
    activity = np.asarray(activity)
    q_matrix_hat = np.asarray(q_matrix_hat)

    # Fix original Q-Matrix to have the same number of questions: drop the
    # flagged question columns, then standardise with sklearn's scale().
    q_matrix = np.delete(q_matrix, find_sparse_questions(activity), axis=1)
    q_matrix = preprocessing.scale(q_matrix)

    # Questions are columns, so transpose to compare question against question.
    qs = cosine_similarity(q_matrix.T)
    qs_hat = cosine_similarity(q_matrix_hat.T)
    error = qs - qs_hat

    return {
        'exp_id': experiment_id,
        'method': method,
        'concepts': concepts,
        'attempts': attempts,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'q_matrix': np.asarray(q_matrix),
        'q_matrix_hat': q_matrix_hat,
        'question_similarity': qs,
        'question_distance': cosine_distances(q_matrix.T),
        'question_hat_similarity': qs_hat,
        'question_hat_distance': cosine_distances(q_matrix_hat.T),
        # Total error (root of the summed squared differences) and its RMSE.
        'error': np.sqrt(np.sum(np.power(error, 2))),
        'rmse': np.sqrt(np.mean(np.power(error, 2))),
        # CMD: 1 - tr(A.B) / (||A|| * ||B||) on the two similarity matrices
        # (presumably the "correlation matrix distance" - confirm).
        'cmd': 1 - np.trace(np.dot(qs, qs_hat)) / (np.linalg.norm(qs) * np.linalg.norm(qs_hat)),
    }


def transform_fdtf_data(exps):
    """Turn FDTF result rows into per-experiment comparison records.

    Each row is ``(X, q_matrix, q_matrix_hat, concepts, method,
    experiment_id, attempts, train_rmse, test_rmse)``. ``X`` is treated as a
    3-D array whose second axis indexes questions (assumed layout - confirm
    the meaning of the other two axes against the producer). Questions whose
    total over the other two axes is below 3 are removed from ``q_matrix``
    before it is compared with ``q_matrix_hat``.

    Returns a list with one dict per row, in input order.
    """
    def questions_with_few_attempts(X):
        attempts_per_question = X.sum(axis=0).sum(axis=1)
        return [question for question, total in enumerate(attempts_per_question) if total < 3]

    return [_question_similarity_record(row, questions_with_few_attempts) for row in exps]


def transform_nmf_data(exps):
    """Turn NMF result rows into per-experiment comparison records.

    Each row is ``(sp, q_matrix, q_matrix_hat, concepts, method,
    experiment_id, attempts, train_rmse, test_rmse)``. ``sp`` is summed over
    its first axis to get one total per question (assumed to be a
    students x questions matrix - confirm). Questions whose total is exactly
    0 are removed from ``q_matrix`` before it is compared with
    ``q_matrix_hat``.

    Returns a list with one dict per row, in input order.
    """
    def questions_without_activity(student_performance):
        attempts_per_question = student_performance.sum(axis=0)
        return [question for question, total in enumerate(attempts_per_question) if total == 0]

    return [_question_similarity_record(row, questions_without_activity) for row in exps]

def calculate_anosim(item):
    """Run ANOSIM on the reference question distances for several clusterings.

    For every cluster count from 2 to 19 the estimated question distances
    (``item['question_hat_distance']``) are clustered with complete-linkage
    agglomerative clustering, and the resulting labels are tested with ANOSIM
    against the reference distances (``item['question_distance']``).

    Parameters
    ----------
    item : dict
        Must provide ``exp_id``, ``method``, ``attempts``, ``concepts``,
        ``question_distance`` and ``question_hat_distance``.

    Returns
    -------
    dict
        Keys ``Experiment ID``, ``Method``, ``Concept``,
        ``Statistical significant (p < 0.1)``, ``Agg Concepts`` (array of the
        cluster counts with p < 0.1, or ``'--'``) and ``R Statistic``
        (``"<best R> at <cluster count>"`` among the significant results, or
        ``'--'``).

    Notes
    -----
    The p-values come from a permutation test (9999 permutations), so they
    may vary between runs unless the random state is fixed by the caller.
    NOTE(review): newer scikit-learn releases rename the ``affinity``
    argument to ``metric`` - confirm against the installed version.
    """
    min_clusters, max_clusters = 2, 19
    significance_level = 0.1

    row = {}
    row['Experiment ID'] = item['exp_id']
    row['Method'] = "%s (%d attempts)" % (item['method'].upper(), item['attempts'])
    row['Concept'] = item['concepts']

    dm = DistanceMatrix(item['question_distance'])
    stats_list = []
    p_values = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        model = AgglomerativeClustering(n_clusters=n_clusters,
                                        affinity='precomputed',
                                        linkage='complete').fit(item['question_hat_distance'])
        stats = anosim(dm, model.labels_, permutations=9999)
        stats_list.append(stats['test statistic'])
        p_values.append(stats['p-value'])
    stats_list = np.asarray(stats_list)
    p_values = np.asarray(p_values)

    # Test the boolean mask itself: np.any(np.where(mask)) looks at the index
    # values, so a lone hit at position 0 (2 clusters) would read as "no hit".
    significant = p_values < significance_level
    if significant.any():
        sig_positions = np.flatnonzero(significant)
        # Best R among the significant results only; argmax yields a single
        # position even when the maximum is tied.
        best = sig_positions[np.argmax(stats_list[sig_positions])]
        row['Statistical significant (p < 0.1)'] = True
        row['Agg Concepts'] = sig_positions + min_clusters
        row['R Statistic'] = "%.2f at %d" % (stats_list[best], best + min_clusters)
    else:
        row['Statistical significant (p < 0.1)'] = False
        row['Agg Concepts'] = '--'
        row['R Statistic'] = '--'
    return row

In [305]:
%%time
# Load three batches of FDTF experiments, each given as
# (concepts_min, concepts_max, attempts_train), and concatenate them in order.
exps_fdtf = [
    result_row
    for concepts_min, concepts_max, att in ((3, 5, 20), (5, 13, 50), (11, 16, 150))
    for result_row in get_fdtf_exps(concepts_min, concepts_max, att)
]
# Materialise the per-experiment comparison records as a list.
data = list(transform_fdtf_data(exps_fdtf))

CPU times: user 4.1 s, sys: 272 ms, total: 4.37 s
Wall time: 11.9 s


In [306]:
%%time
# Load three batches of NMF experiments, each given as
# (concepts, alpha_min, alpha_max, attempts_train), and concatenate them in order.
exps = [
    result_row
    for concepts, alpha_min, alpha_max, att in ((3, 0, 0.2, 0), (7, 0.2, 0.4, 1), (3, 0, 0.1, 1))
    for result_row in get_nmf_exps(concepts, alpha_min, alpha_max, att)
]
# Appends to the existing `data` list in place, so re-running this cell
# without re-running the FDTF cell adds the NMF records a second time.
data.extend(transform_nmf_data(exps))

CPU times: user 216 ms, sys: 16 ms, total: 232 ms
Wall time: 1.39 s


In [307]:
# Tabulate the records by experiment id and keep only the scalar summary columns.
df = (
    pd.DataFrame(data)
    .set_index('exp_id')
    [['method', 'concepts', 'attempts', 'train_rmse', 'cmd']]
)

In [308]:
# Print the summary with 3 decimal places. The fully qualified option name is
# used because the bare 'precision' pattern can match more than one option in
# newer pandas releases, which raises an OptionError.
with pd.option_context('display.precision', 3):
    print(df)

       method  concepts  attempts  train_rmse    cmd
exp_id                                              
86860    fdtf         3        20       0.436  0.568
86863    fdtf         4        20       0.438  0.570
86864    fdtf         5        20       0.438  0.612
86866    fdtf        10        50       0.439  0.530
86868    fdtf        11        50       0.440  0.536
86870    fdtf        12        50       0.442  0.523
86871    fdtf         5        50       0.434  0.545
86872    fdtf        13        50       0.443  0.512
86873    fdtf         6        50       0.435  0.562
86874    fdtf         7        50       0.436  0.553
86875    fdtf         8        50       0.438  0.562
86877    fdtf         9        50       0.437  0.536
86781    fdtf        11       150       0.437  0.503
86855    fdtf        12       150       0.436  0.517
86859    fdtf        13       150       0.438  0.527
86869    fdtf        14       150       0.435  0.540
86876    fdtf        15       150       0.437 

In [301]:
%%time
# One ANOSIM summary row per experiment record; note that this rebinds `df`
# from the summary DataFrame to a plain list of dicts.
df = [calculate_anosim(item) for item in data]

CPU times: user 7min 21s, sys: 1.33 s, total: 7min 23s
Wall time: 7min 23s


In [302]:
# Collect the per-experiment ANOSIM rows (the list currently bound to `df`) into one table.
results = pd.DataFrame(df)

In [303]:
# Display the ANOSIM summary table (one row per experiment).
results

Unnamed: 0,Experiment ID,Method,Concept,Statistical significant (p < 0.1),Agg Concepts,R Statistic
0,86860,FDTF (20 attempts),3,False,--,--
1,86863,FDTF (20 attempts),4,False,--,--
2,86864,FDTF (20 attempts),5,False,--,--
3,86866,FDTF (50 attempts),10,False,--,--
4,86868,FDTF (50 attempts),11,False,--,--
5,86870,FDTF (50 attempts),12,False,--,--
6,86871,FDTF (50 attempts),5,True,[3],0.06 at 3
7,86872,FDTF (50 attempts),13,True,"[15, 16, 19]",0.13 at 19
8,86873,FDTF (50 attempts),6,False,--,--
9,86874,FDTF (50 attempts),7,False,--,--


In [244]:
# NOTE(review): this cell's execution count (244) is lower than the preceding
# display (303), and the Method labels in its output use a different format;
# it appears to show an earlier run - confirm before relying on it.
results

Unnamed: 0,Experiment ID,Method,Concept,Statistical significant (p < 0.1),Agg Concepts,R Statistic
0,86860,Fdtf (20 attemts),3,False,--,--
1,86863,Fdtf (20 attemts),4,True,[5],0.07 at 5
2,86864,Fdtf (20 attemts),5,False,--,--
3,86866,Fdtf (50 attemts),10,False,--,--
4,86868,Fdtf (50 attemts),11,False,--,--
5,86870,Fdtf (50 attemts),12,True,[3],0.07 at 3
6,86871,Fdtf (50 attemts),5,True,"[3, 4]",0.08 at 4
7,86872,Fdtf (50 attemts),13,True,"[14, 15, 17, 18, 19]",0.18 at 19
8,86873,Fdtf (50 attemts),6,False,--,--
9,86874,Fdtf (50 attemts),7,False,--,--


In [275]:
# Pick a single record (position 7 in `data`) to step through the ANOSIM
# procedure by hand; in the recorded outputs this is experiment 86872.
item = data[7]

In [276]:
# Step-by-step ANOSIM for the single record in `item`, keeping `stats_list`
# and `p_values` in the notebook namespace so they can be inspected afterwards.
row = {}
row['Experiment ID'] = item['exp_id']
row['Method'] = "%s (%d attempts)" % (item['method'].capitalize(), item['attempts'])
row['Concept'] = item['concepts']
dm = DistanceMatrix(item['question_distance'])
stats_list = []
p_values = []
# Position k in stats_list / p_values corresponds to n_clusters = k + 2.
for i in range(2, 20):
    model = AgglomerativeClustering(n_clusters=i,
                                    affinity='precomputed',
                                    linkage='complete').fit(item['question_hat_distance'].T)
    stats = anosim(dm, model.labels_, permutations=9999)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])
stats_list = np.asarray(stats_list)
p_values = np.asarray(p_values)
# Test the boolean mask itself: np.any(np.where(mask)) looks at the index
# values, so a lone hit at position 0 (2 clusters) would read as "no hit".
significant = p_values < 0.1
if significant.any():
    sig_positions = np.flatnonzero(significant)
    sig_stats = stats_list[sig_positions]
    # argmax yields a single position even when the best statistic is tied,
    # and only significant results are considered.
    best_position = sig_positions[np.argmax(sig_stats)]
    row['Statistical significant (p < 0.1)'] = True
    row['Agg Concepts'] = sig_positions + 2
    row['R Statistic'] = "%.2f at %d" % (stats_list[best_position], best_position + 2)
else:
    row['Statistical significant (p < 0.1)'] = False
    row['Agg Concepts'] = '--'
    row['R Statistic'] = '--'

In [277]:
# ANOSIM test statistic per clustering; position k corresponds to n_clusters = k + 2.
stats_list

array([ 0.03033212,  0.00518083, -0.03990993, -0.03634074,  0.01508722,
        0.01575805, -0.02873211,  0.00262672,  0.0110309 ,  0.02755572,
        0.06251338,  0.06699156,  0.10405077,  0.1170319 ,  0.08852335,
        0.12886827,  0.15677083,  0.18068392])

In [278]:
# ANOSIM p-value per clustering; position k corresponds to n_clusters = k + 2.
p_values

array([0.2114, 0.396 , 0.8493, 0.8043, 0.3476, 0.3514, 0.6885, 0.4539,
       0.4005, 0.3051, 0.1548, 0.1551, 0.074 , 0.0588, 0.1223, 0.059 ,
       0.0321, 0.0206])

In [282]:
# Test statistics of the clusterings whose p-value is below 0.1.
stats_list[p_values < 0.1]

array([0.10405077, 0.1170319 , 0.12886827, 0.15677083, 0.18068392])

In [280]:
# Summary row assembled for the inspected experiment.
row

{'Experiment ID': 86872,
 'Method': 'Fdtf (50 attemts)',
 'Concept': 13,
 'Statistical significant (p < 0.1)': True,
 'Agg Concepts': array([14, 15, 17, 18, 19]),
 'R Statistic': '0.18 at 19'}