In [5]:
# Helpers
import pickle
import numpy as np
import time

# DB 
import psycopg2
from django.conf import settings

# Learning
from fdtf import feedback_driven_tensor_factorization
from sklearn.cluster import AgglomerativeClustering

# Evaluation
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from skbio.stats.distance import anosim
from skbio import DistanceMatrix

In [17]:
connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                                  password = settings.DATABASES["default"]["PASSWORD"],
                                  host = settings.DATABASES["default"]["HOST"],
                                  port = settings.DATABASES["default"]["PORT"],
                                  database = settings.DATABASES["default"]["NAME"])
connection.autocommit=True
cursor = connection.cursor()

In [55]:
experiments_train = (
    # Best FDTF train_rmse 
    189, 
    277, 
    # Best NMF train_rmse with 1 attempt (so we can calculate accuracy) and overall
    52473,
    52126)
experiments_q_matrix = (
    # Best FDTF Q-Matrix rmse
    274,
    17805,
    # Best NMF Q-Matrix rmse,
    37977,
    38013)

In [69]:
query = """select q_matrix, q_matrix_hat, concepts, method, experiment_id 
from EDM2020_2020_02_19 where experiment_id in %s or experiment_id in %s"""
# query = """select q_matrix, q_matrix_hat, q_matrix_rmse from edm2020_2020_02_19 
# where method='fdtf' order by q_matrix_rmse"""

query = cursor.mogrify(query, (experiments_train, experiments_q_matrix))
cursor.execute(query)
response = cursor.fetchall()

In [70]:
data = {}
for idx, row in enumerate(response):
    data[idx] = {}
    data[idx]['exp_id'] = row[-1]
    data[idx]['method'] = row[-2]
    if data[idx]['exp_id'] in experiments_train:
        data[idx]['type'] = 'train_rmse'
    else:
        data[idx]['type'] = 'q_matrix_rmse'
    data[idx]['concepts'] = row[2]
    print("%d - %s - %s" % (data[idx]['exp_id'], data[idx]['method'], data[idx]['type']))
    data[idx]['q_matrix'] = np.asarray(row[0])
    q_matrix = data[idx]['q_matrix']
#     row_sums = data[idx]['q_matrix'].sum(axis=0)
#     data[idx]['q_matrix_norm'] = data[idx]['q_matrix'] / row_sums
    data[idx]['q_matrix_hat'] = np.asarray(row[1])
    q_matrix_hat = data[idx]['q_matrix_hat']
#     row_sums = data[idx]['q_matrix_hat'].sum(axis=0)
#     data[idx]['q_matrix_norm_hat'] = data[idx]['q_matrix_hat'] / row_sums
    data[idx]['question_similarity'] = cosine_similarity(q_matrix.T)
    data[idx]['question_distance'] = cosine_distances(q_matrix.T)
    data[idx]['question_hat_similarity'] = cosine_similarity(q_matrix_hat.T)
    data[idx]['question_hat_distance'] = cosine_distances(q_matrix_hat.T)
    error = data[idx]['question_similarity'] - data[idx]['question_hat_similarity']
    data[idx]['error'] = np.sqrt(np.sum(np.power(error,2)))
    data[idx]['rmse'] = np.sqrt(np.mean(np.power(error,2)))
    qs = data[idx]['question_similarity']
    qs_hat = data[idx]['question_hat_similarity']
    data[idx]['cmd'] = 1-np.trace(np.dot(qs,
                                         qs_hat))/(np.linalg.norm(qs)*np.linalg.norm(qs_hat))
    print(data[idx]['cmd'])

189 - fdtf - train_rmse
0.2695369725247976
277 - fdtf - train_rmse
0.2793747805528337
38013 - nmf - q_matrix_rmse
0.24081451825830236
52126 - nmf2 - train_rmse
0.47994553793538286
52473 - nmf2 - train_rmse
0.4843123088472069
37977 - nmf - q_matrix_rmse
0.24075690645224213
274 - fdtf - q_matrix_rmse
0.21209076317635522
17805 - fdtf - q_matrix_rmse
0.22043665170266746


In [58]:
import pandas as pd

In [71]:
df = pd.DataFrame.from_dict(data, orient='index').set_index('exp_id')
df = df[['method', 'concepts', 'type', 'error', 'rmse', 'cmd']]

In [72]:
df

Unnamed: 0_level_0,method,concepts,type,error,rmse,cmd
exp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
189,fdtf,6,train_rmse,17.723227,0.369234,0.269537
277,fdtf,4,train_rmse,21.516879,0.448268,0.279375
38013,nmf,19,q_matrix_rmse,14.42163,0.300451,0.240815
52126,nmf2,19,train_rmse,18.980277,0.395422,0.479946
52473,nmf2,19,train_rmse,19.116635,0.398263,0.484312
37977,nmf,19,q_matrix_rmse,14.420258,0.300422,0.240757
274,fdtf,19,q_matrix_rmse,13.892834,0.289434,0.212091
17805,fdtf,19,q_matrix_rmse,13.913421,0.289863,0.220437


In [65]:
from sklearn.cluster import AgglomerativeClustering

In [73]:
for idx in range(len(data)):
    print("%d - %d - %s - %s - %d" % (idx, data[idx]['exp_id'], data[idx]['method'], 
                                      data[idx]['type'], data[idx]['concepts']))
    print(data[idx]['cmd'])
    models = []
    dm = DistanceMatrix(data[idx]['question_distance'])
    stats_list = []
    p_values = []
    for i in range(2, 20):
        model = AgglomerativeClustering(n_clusters=i, 
                                        affinity='cosine',
                                        linkage='single').fit(data[idx]['q_matrix_hat'].T)
    #     model = SpectralClustering(n_clusters=i, n_init=100, affinity='precomputed').fit(
    #         data[idx]['question_hat_similarity'].T)
    #     model = SpectralClustering(n_clusters=i, n_init=100, affinity='rbf').fit(
    #         data[idx]['q_matrix_hat'].T)
    #     model = KMeans(n_clusters=i, n_init=100).fit(data[idx]['q_matrix_hat'].T)
        models.append(model)
        stats = anosim(dm, model.labels_, permutations=9999)
        stats_list.append(stats['test statistic'])
        p_values.append(stats['p-value'])
    print(np.where(np.asarray(p_values) < 0.1)[0]+1)
    print(np.asarray(stats_list)[np.where(np.asarray(p_values) < 0.1)])
    print()

0 - 189 - fdtf - train_rmse - 6
0.2695369725247976
[ 8  9 10 11 12 13 14 15 18]
[0.10027299 0.12299792 0.14646025 0.15043686 0.1174806  0.11575566
 0.12521221 0.08632037 0.11006786]

1 - 277 - fdtf - train_rmse - 4
0.2793747805528337
[10 13 16 17 18]
[0.0855395  0.08336955 0.08954729 0.09763461 0.11275464]

2 - 38013 - nmf - q_matrix_rmse - 19
0.24081451825830236
[]
[]

3 - 52126 - nmf2 - train_rmse - 19
0.47994553793538286
[]
[]

4 - 52473 - nmf2 - train_rmse - 19
0.4843123088472069
[2 3]
[0.19853514 0.16394379]

5 - 37977 - nmf - q_matrix_rmse - 19
0.24075690645224213
[]
[]

6 - 274 - fdtf - q_matrix_rmse - 19
0.21209076317635522
[]
[]

7 - 17805 - fdtf - q_matrix_rmse - 19
0.22043665170266746
[11 12 13 14 15 16 17 18]
[0.16499201 0.18144813 0.17765664 0.15394678 0.15511072 0.12896669
 0.17364735 0.17354688]



In [378]:
stats_list

[0.031143933460146832,
 -0.07877606756022211,
 -0.04087854087854093,
 -0.03789377766206707,
 -0.04706930746140639,
 0.039631190169772366,
 0.04229587095851908,
 0.0758107956458309,
 0.07643988701952605,
 0.04786588582884877,
 0.038074348649297304,
 0.05036541643684499,
 0.05218367418528789,
 0.019998600097993036,
 0.038588804980688216,
 0.035056068484912876,
 0.026571418228330694,
 0.005022954361328564]

In [351]:
p_values

[0.12,
 0.42,
 0.28,
 0.56,
 0.73,
 0.42,
 0.63,
 0.59,
 0.42,
 0.72,
 0.74,
 0.86,
 0.59,
 0.6,
 0.67,
 0.68,
 0.52,
 0.51]

### Organizing data

In [3]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    for item in data:
        s_idx, q_idx, a_idx, outcome = item

        # If attempt is over max value, ignore it
        if filter_attempt and a_idx >= filter_attempt:
            continue

        tensor[s_idx, q_idx, a_idx] = outcome
        X[s_idx, q_idx, a_idx] = 1
        
    tensor[np.where(X[:,:] == 0)] = None
    return tensor, X
    
def transform_data(data, n_attempts):
    N_STUDENTS = len(data['users_idx'])
    N_QUESTIONS = len(data['questions_idx'])
    
    student_performance = np.zeros((N_STUDENTS, N_QUESTIONS, n_attempts))
    X = np.zeros(student_performance.shape)

    student_performance, X = add_to_tensor(data['train_set'], 
                                           student_performance, 
                                           X, filter_attempt=n_attempts)
    
    
    max_attempt = np.max(data['test_set'][:,2])+1
    student_performance_test = np.zeros((N_STUDENTS, N_QUESTIONS, max_attempt))
    X_test = np.zeros(student_performance_test.shape)
    student_performance_test, X_test = add_to_tensor(data['test_set'], 
                                                     student_performance_test, 
                                                     X_test)
    return student_performance, X, student_performance_test, X_test

Do grid search to explore parameters

In [5]:
# %%time
# mu = [0.1, 0.5, 1, 3, 10]
mu = [3]
# n_concepts = range(2, 20)
n_concepts = [14]
# attempts = [10, 20, 50, 100, 150, 200]
attempts = [10]

In [24]:
search = 1 * N_RUNS * len(attempts) * len(mu) * len(n_concepts)
search

2700

In [18]:
def run(*args):
    dataset, data, att, student_performance, X, student_performance_test, X_test, del_questions, m, concept = args
    
    # Run FDTF
    start = time.time()
    (best_student_performance_pred, best_student_knowledge, 
     best_q_matrix, best_error) = feedback_driven_tensor_factorization(
        student_performance, n_concepts=concept, init=10, max_iter=1000, mu=m)
#                     student_performance, n_concepts=concept, init=1, max_iter=3, mu=m)
    end = time.time()

    if concept == 14:
        q_matrix = np.delete(data['q_matrix'], del_questions, axis=1)
        q_matrix_error = np.sqrt(np.sum(np.power(q_matrix-best_q_matrix, 2)))
    else:
        q_matrix = data['q_matrix']
        q_matrix_error = -1

    row = {
        "dataset": "run_%d" % dataset,
        "q_matrix": q_matrix,
        "X": X,
        "sp": student_performance,
        "X_test": X_test,
        "sp_test": student_performance_test,
        "sp_hat": best_student_performance_pred,
        "sk_hat": best_student_knowledge,
        "q_matrix_hat": best_q_matrix,
        "mu": m,
        "concepts": concept,
        "attempts_train": att,
        "method": "fdtf",
        "q_matrix_error": q_matrix_error,
        "reconstruction_error": best_error,
        "train_error": 0,
        "test_error": 0,
        "seconds": end-start
    }
    
    return row

In [19]:
args = []
for restart in range(1):
    for dataset in range(N_RUNS):
        with open("%s/run_%d.pkl" % (folder, dataset), "rb") as pklfile:
            data = pickle.load(pklfile)
        for att in attempts:
            # Get train and test data in FDTF format
            student_performance, X, student_performance_test, X_test = transform_data(data, att)
            # Check if questions have minimum number of attempts
            attempts_per_question = X.sum(axis=0).sum(axis=1)
            del_questions = [idx for idx,value in enumerate(attempts_per_question) if value < 3]
            student_performance = np.delete(student_performance, del_questions, axis=1)
            
            for m in mu:
                for concept in n_concepts:                
                    args.append([dataset, 
                                data, 
                                att, 
                                student_performance, 
                                X, 
                                student_performance_test, 
                                X_test, 
                                del_questions,
                                m,
                                concept])

In [20]:
len(args)

5

In [21]:
%%time
row = run(*args[0])

CPU times: user 2min 9s, sys: 3.93 s, total: 2min 13s
Wall time: 1min 7s


In [25]:
rmse = np.sqrt(np.mean(np.power(row['q_matrix']-row['q_matrix_hat'], 2)))
rmse

0.4616764283646856