In [83]:
# Helpers
import pickle
import numpy as np
import time

# DB 
import psycopg2
from django.conf import settings

# Learning
from fdtf import feedback_driven_tensor_factorization
from sklearn.cluster import AgglomerativeClustering

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from skbio.stats.distance import anosim
from skbio import DistanceMatrix

In [9]:
connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                                  password = settings.DATABASES["default"]["PASSWORD"],
                                  host = settings.DATABASES["default"]["HOST"],
                                  port = settings.DATABASES["default"]["PORT"],
                                  database = settings.DATABASES["default"]["NAME"])
connection.autocommit=True
cursor = connection.cursor()

In [84]:
experiments = (189, 277, 37977)

In [85]:
query = "select q_matrix, q_matrix_hat from EDM2020_2020_02_19 where experiment_id in %s"
# query = """select q_matrix, q_matrix_hat, q_matrix_rmse from edm2020_2020_02_19 
# where method='fdtf' order by q_matrix_rmse"""

query = cursor.mogrify(query, (experiments,))
cursor.execute(query)
response = cursor.fetchall()

In [86]:
data = {}
for idx, row in enumerate(response):
    data[idx] = {}
    data[idx]['q_matrix'] = np.asarray(row[0])
    q_matrix = data[idx]['q_matrix']
#     row_sums = data[idx]['q_matrix'].sum(axis=0)
#     data[idx]['q_matrix_norm'] = data[idx]['q_matrix'] / row_sums
    data[idx]['q_matrix_hat'] = np.asarray(row[1])
    q_matrix_hat = data[idx]['q_matrix_hat']
#     row_sums = data[idx]['q_matrix_hat'].sum(axis=0)
#     data[idx]['q_matrix_norm_hat'] = data[idx]['q_matrix_hat'] / row_sums
    data[idx]['question_similarity'] = cosine_similarity(q_matrix.T)
    data[idx]['question_distance'] = cosine_distances(q_matrix.T)
    data[idx]['question_hat_similarity'] = cosine_similarity(q_matrix_hat.T)
    data[idx]['question_hat_distance'] = cosine_distances(q_matrix_hat.T)
    error = data[idx]['question_similarity'] - data[idx]['question_hat_similarity']
    data[idx]['error'] = np.sqrt(np.sum(np.power(error,2)))
    data[idx]['rmse'] = np.sqrt(np.mean(np.power(error,2)))
    qs = data[idx]['question_similarity']
    qs_hat = data[idx]['question_hat_similarity']
    data[idx]['cmd'] = 1-np.trace(np.dot(qs,
                                         qs_hat))/(np.linalg.norm(qs)*np.linalg.norm(qs_hat))
    print(data[idx]['cmd'])

0.2695369725247976
0.2793747805528337
0.24075690645224213


In [87]:
from sklearn.cluster import AgglomerativeClustering

In [88]:
idx = 0
models = []
dm = DistanceMatrix(data[idx]['question_distance'])
stats_list = []
p_values = []
for i in range(2, 20):
    model = AgglomerativeClustering(n_clusters=i, 
                                    affinity='cosine',
                                    linkage='single').fit(data[idx]['q_matrix_hat'].T)
#     model = SpectralClustering(n_clusters=i, n_init=100, affinity='precomputed').fit(
#         data[idx]['question_hat_similarity'].T)
#     model = SpectralClustering(n_clusters=i, n_init=100, affinity='rbf').fit(
#         data[idx]['q_matrix_hat'].T)
#     model = KMeans(n_clusters=i, n_init=100).fit(data[idx]['q_matrix_hat'].T)
    models.append(model)
    stats = anosim(dm, model.labels_, permutations=9999)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])
print(np.where(np.asarray(p_values) < 0.1)[0]+1)
print(np.asarray(stats_list)[np.where(np.asarray(p_values) < 0.1)])

[ 8  9 10 11 12 13 14 18]
[0.10027299 0.12299792 0.14646025 0.15043686 0.1174806  0.11575566
 0.12521221 0.11006786]


In [89]:
idx = 2
models = []
dm = DistanceMatrix(data[idx]['question_distance'])
stats_list = []
p_values = []
for i in range(2, 20):
    model = AgglomerativeClustering(n_clusters=i, 
                                    affinity='cosine',
                                    linkage='single').fit(data[idx]['q_matrix_hat'].T)
#     model = SpectralClustering(n_clusters=i, n_init=100, affinity='precomputed').fit(
#         data[idx]['question_hat_similarity'].T)
#     model = SpectralClustering(n_clusters=i, n_init=100, affinity='rbf').fit(
#         data[idx]['q_matrix_hat'].T)
#     model = KMeans(n_clusters=i, n_init=100).fit(data[idx]['q_matrix_hat'].T)
    models.append(model)
    stats = anosim(dm, model.labels_, permutations=9999)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])
print(np.where(np.asarray(p_values) < 0.1)[0]+1)
print(np.asarray(stats_list)[np.where(np.asarray(p_values) < 0.1)])

[]
[]


In [90]:
stats_list

[0.18769067254512173,
 0.08505866216128705,
 0.036692321460533514,
 -0.004287668350168297,
 -0.021524610083062713,
 0.009258333796065065,
 -0.02640568996415773,
 -0.020257531109031744,
 0.015747852121843292,
 0.05151975296039196,
 -0.008258513153261886,
 -0.0008737977394821186,
 -0.04519212909437583,
 0.030133493856449867,
 0.02377321298889937,
 0.05850471866096875,
 0.0646559736171488,
 0.05830777868958651]

In [91]:
p_values

[0.1653,
 0.1869,
 0.3269,
 0.4937,
 0.5853,
 0.4398,
 0.623,
 0.5869,
 0.4,
 0.2311,
 0.5365,
 0.4886,
 0.7496,
 0.3026,
 0.3493,
 0.1872,
 0.1747,
 0.1972]

In [464]:
idx = 2
models = []
dm = DistanceMatrix(data[idx]['question_distance'])
stats_list = []
p_values = []
for i in range(2, 20):
    model = AgglomerativeClustering(n_clusters=i, 
                                    affinity='cosine',
                                    linkage='average').fit(data[idx]['q_matrix_hat'].T)
#     model = SpectralClustering(n_clusters=i, n_init=100, affinity='precomputed').fit(
#         data[idx]['question_hat_similarity'].T)
#     model = SpectralClustering(n_clusters=i, n_init=100, affinity='rbf').fit(
#         data[idx]['q_matrix_hat'].T)
    models.append(model)
    stats = anosim(dm, model.labels_, permutations=91)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])
print(np.where(np.asarray(p_values) < 0.05)[0]+1)
print(np.asarray(stats_list)[np.where(np.asarray(p_values) < 0.05)])

[]
[]


In [418]:
stats_list

[0.17010577313311695,
 0.1426028378026554,
 0.13331563899518548,
 0.11522183076635278,
 0.07442137185940223,
 0.11257716830280326,
 0.09026458794977302,
 0.08995071193866362,
 0.08655704464087711,
 0.042275406322557545,
 0.04635984023238917,
 0.05368276742788458,
 0.05990725654843966,
 0.05960116731517505,
 0.057459877154745265,
 0.051156419050666535,
 0.0042554679848902125,
 0.03593536942959545]

In [373]:
p_values

[0.796,
 0.909,
 0.834,
 0.905,
 0.831,
 0.414,
 0.393,
 0.352,
 0.403,
 0.524,
 0.859,
 0.903,
 0.847,
 0.624,
 0.507,
 0.394,
 0.555,
 0.606]

In [377]:
models = []
dm = DistanceMatrix(data[2]['question_distance'])
stats_list = []
p_values = []
for i in range(2, 20):
    model = AgglomerativeClustering(n_clusters=i, 
                                    affinity='cosine',
                                    linkage='average').fit(data[2]['q_matrix_hat'].T)
    models.append(model)
    stats = anosim(dm, model.labels_, permutations=99)
    stats_list.append(stats['test statistic'])
    p_values.append(stats['p-value'])

In [378]:
stats_list

[0.031143933460146832,
 -0.07877606756022211,
 -0.04087854087854093,
 -0.03789377766206707,
 -0.04706930746140639,
 0.039631190169772366,
 0.04229587095851908,
 0.0758107956458309,
 0.07643988701952605,
 0.04786588582884877,
 0.038074348649297304,
 0.05036541643684499,
 0.05218367418528789,
 0.019998600097993036,
 0.038588804980688216,
 0.035056068484912876,
 0.026571418228330694,
 0.005022954361328564]

In [379]:
np.asarray(stats_list)[np.where(np.asarray(p_values) < 0.05)]

array([], dtype=float64)

In [351]:
p_values

[0.12,
 0.42,
 0.28,
 0.56,
 0.73,
 0.42,
 0.63,
 0.59,
 0.42,
 0.72,
 0.74,
 0.86,
 0.59,
 0.6,
 0.67,
 0.68,
 0.52,
 0.51]

In [274]:
from skbio.stats.distance import anosim, permanova
from skbio import DistanceMatrix

In [275]:
dm = DistanceMatrix(data[0]['question_distance'])

In [337]:
s = anosim(dm, clustering.labels_, permutations=999)
s['test statistic']

0.13919760575912002

In [336]:
s

method name                 ANOSIM
test statistic name              R
sample size                     47
number of groups                10
test statistic            0.139198
p-value                      0.012
number of permutations         999
Name: ANOSIM results, dtype: object

### Organizing data

In [3]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    for item in data:
        s_idx, q_idx, a_idx, outcome = item

        # If attempt is over max value, ignore it
        if filter_attempt and a_idx >= filter_attempt:
            continue

        tensor[s_idx, q_idx, a_idx] = outcome
        X[s_idx, q_idx, a_idx] = 1
        
    tensor[np.where(X[:,:] == 0)] = None
    return tensor, X
    
def transform_data(data, n_attempts):
    N_STUDENTS = len(data['users_idx'])
    N_QUESTIONS = len(data['questions_idx'])
    
    student_performance = np.zeros((N_STUDENTS, N_QUESTIONS, n_attempts))
    X = np.zeros(student_performance.shape)

    student_performance, X = add_to_tensor(data['train_set'], 
                                           student_performance, 
                                           X, filter_attempt=n_attempts)
    
    
    max_attempt = np.max(data['test_set'][:,2])+1
    student_performance_test = np.zeros((N_STUDENTS, N_QUESTIONS, max_attempt))
    X_test = np.zeros(student_performance_test.shape)
    student_performance_test, X_test = add_to_tensor(data['test_set'], 
                                                     student_performance_test, 
                                                     X_test)
    return student_performance, X, student_performance_test, X_test

Do grid search to explore parameters

In [5]:
# %%time
# mu = [0.1, 0.5, 1, 3, 10]
mu = [3]
# n_concepts = range(2, 20)
n_concepts = [14]
# attempts = [10, 20, 50, 100, 150, 200]
attempts = [10]

In [24]:
search = 1 * N_RUNS * len(attempts) * len(mu) * len(n_concepts)
search

2700

In [18]:
def run(*args):
    dataset, data, att, student_performance, X, student_performance_test, X_test, del_questions, m, concept = args
    
    # Run FDTF
    start = time.time()
    (best_student_performance_pred, best_student_knowledge, 
     best_q_matrix, best_error) = feedback_driven_tensor_factorization(
        student_performance, n_concepts=concept, init=10, max_iter=1000, mu=m)
#                     student_performance, n_concepts=concept, init=1, max_iter=3, mu=m)
    end = time.time()

    if concept == 14:
        q_matrix = np.delete(data['q_matrix'], del_questions, axis=1)
        q_matrix_error = np.sqrt(np.sum(np.power(q_matrix-best_q_matrix, 2)))
    else:
        q_matrix = data['q_matrix']
        q_matrix_error = -1

    row = {
        "dataset": "run_%d" % dataset,
        "q_matrix": q_matrix,
        "X": X,
        "sp": student_performance,
        "X_test": X_test,
        "sp_test": student_performance_test,
        "sp_hat": best_student_performance_pred,
        "sk_hat": best_student_knowledge,
        "q_matrix_hat": best_q_matrix,
        "mu": m,
        "concepts": concept,
        "attempts_train": att,
        "method": "fdtf",
        "q_matrix_error": q_matrix_error,
        "reconstruction_error": best_error,
        "train_error": 0,
        "test_error": 0,
        "seconds": end-start
    }
    
    return row

In [19]:
args = []
for restart in range(1):
    for dataset in range(N_RUNS):
        with open("%s/run_%d.pkl" % (folder, dataset), "rb") as pklfile:
            data = pickle.load(pklfile)
        for att in attempts:
            # Get train and test data in FDTF format
            student_performance, X, student_performance_test, X_test = transform_data(data, att)
            # Check if questions have minimum number of attempts
            attempts_per_question = X.sum(axis=0).sum(axis=1)
            del_questions = [idx for idx,value in enumerate(attempts_per_question) if value < 3]
            student_performance = np.delete(student_performance, del_questions, axis=1)
            
            for m in mu:
                for concept in n_concepts:                
                    args.append([dataset, 
                                data, 
                                att, 
                                student_performance, 
                                X, 
                                student_performance_test, 
                                X_test, 
                                del_questions,
                                m,
                                concept])

In [20]:
len(args)

5

In [21]:
%%time
row = run(*args[0])

CPU times: user 2min 9s, sys: 3.93 s, total: 2min 13s
Wall time: 1min 7s


In [25]:
rmse = np.sqrt(np.mean(np.power(row['q_matrix']-row['q_matrix_hat'], 2)))
rmse

0.4616764283646856