In [28]:
# Helpers
import pickle
import numpy as np
import time

# DB 
import psycopg2
from django.conf import settings

# Learning
from fdtf import feedback_driven_tensor_factorization

In [29]:
folder = "data/edm2020"
N_RUNS = 10

### Organizing data

In [30]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    for item in data:
        s_idx, q_idx, a_idx, outcome = item

        # If attempt is over max value, ignore it
        if filter_attempt and a_idx >= filter_attempt:
            continue

        tensor[s_idx, q_idx, a_idx] = outcome
        X[s_idx, q_idx, a_idx] = 1
        
    tensor[np.where(X[:,:] == 0)] = None
    return tensor, X
    
def transform_data(data, n_attempts):
    N_STUDENTS = len(data['users_idx'])
    N_QUESTIONS = len(data['questions_idx'])
    
    student_performance = np.zeros((N_STUDENTS, N_QUESTIONS, n_attempts))
    X = np.zeros(student_performance.shape)

    student_performance, X = add_to_tensor(data['train_set'], 
                                           student_performance, 
                                           X, filter_attempt=n_attempts)
    
    
    max_attempt = np.max(data['test_set'][:,2])+1
    student_performance_test = np.zeros((N_STUDENTS, N_QUESTIONS, max_attempt))
    X_test = np.zeros(student_performance_test.shape)
    student_performance_test, X_test = add_to_tensor(data['test_set'], 
                                                     student_performance_test, 
                                                     X_test)
    return student_performance, X, student_performance_test, X_test

Do grid search to explore parameters

In [31]:
# %%time
mu = [0.1]
n_concepts = range(11, 16)
#attempts = [10, 20, 50, 100, 150, 200]
attempts = [150]
#00, 150]

In [32]:
search = 1 * N_RUNS * len(attempts) * len(mu) * len(n_concepts)
search

50

In [33]:
hours = (800*search)/(60*60)
hours

11.11111111111111

In [17]:
def run(*args):
    dataset, data, att, student_performance, X, student_performance_test, X_test, del_questions, m, concept = args
    
    # Run FDTF
    start = time.time()
    (best_student_performance_pred, best_student_knowledge, 
     best_q_matrix, best_error) = feedback_driven_tensor_factorization(
        student_performance, n_concepts=concept, init=10, max_iter=1000, mu=m)
#                     student_performance, n_concepts=concept, init=1, max_iter=3, mu=m)
    end = time.time()
    q_matrix = np.delete(data['q_matrix'], del_questions, axis=1)

    row = {
        "dataset": "2020_06_08_run_%d" % dataset,
        "q_matrix": q_matrix,
        "X": X,
        "sp": student_performance,
        "X_test": X_test,
        "sp_test": student_performance_test,
        "sp_hat": best_student_performance_pred,
        "sk_hat": best_student_knowledge,
        "q_matrix_hat": best_q_matrix,
        "mu": m,
        "concepts": concept,
        "attempts_train": att,
        "method": "fdtf",
        "seconds": end-start,
        "del_questions": del_questions,
        "reconstruction_error": best_error
    }
    
    # Write PSQL query
    insert_query_base = "INSERT INTO EDM2020_2020_06_05 "
    column_value = []
    insert_format = []
    query_values = []
    for col in row.keys():
        if isinstance(row[col], np.ndarray):
            query_values.append(row[col].tolist())
        else:
            query_values.append(row[col])
        column_value.append(col)
        insert_format.append("%s")

    insert_query = insert_query_base + "(" + ", ".join(column_value) + ") VALUES "
    insert_query += "(" + ", ".join(insert_format) + ")"
    query_values = tuple(query_values)
    query = cursor.mogrify(insert_query, query_values)
    cursor.execute(query)

In [36]:
args = []
for restart in range(1):
    #for dataset in range(0,1):
    #for dataset in range(N_RUNS):
    for dataset in [7]:
        with open("%s/2020_06_08_run_%d.pkl" % (folder, dataset), "rb") as pklfile:
            data = pickle.load(pklfile)
        for att in attempts:
            # Get train and test data in FDTF format
            student_performance, X, student_performance_test, X_test = transform_data(data, att)
            # Check if questions have minimum number of attempts
            attempts_per_question = X.sum(axis=0).sum(axis=1)
            del_questions = [idx for idx,value in enumerate(attempts_per_question) if value < 3]
            X = np.delete(X, del_questions, axis=1)
            student_performance = np.delete(student_performance, del_questions, axis=1)
            X_test = np.delete(X_test, del_questions, axis=1)
            student_performance_test = np.delete(student_performance_test, del_questions, axis=1)
            
            for m in mu:
                for concept in n_concepts:                
                    args.append([dataset, 
                                data, 
                                att, 
                                student_performance, 
                                X, 
                                student_performance_test, 
                                X_test, 
                                del_questions,
                                m,
                                concept])

In [37]:
len(args)

5

In [27]:
args[30]

[6, {'train_set': array([[ 59,  38,   0,   0],
         [ 59,  38,   1,   1],
         [ 21,  44,   0,   1],
         ...,
         [173,  38,   0,   0],
         [173,  38,   1,   0],
         [173,  38,   2,   0]]), 'test_set': array([[138,  19,   3,   0],
         [138,  19,   4,   0],
         [138,  19,   5,   0],
         ...,
         [173,  38,   4,   0],
         [173,  38,   5,   0],
         [173,  38,   6,   0]]), 'concept_idx': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]), 'questions_idx': [3,
   4,
   5,
   6,
   7,
   11,
   12,
   13,
   14,
   15,
   20,
   28,
   38,
   40,
   41,
   49,
   62,
   63,
   64,
   68,
   69,
   74,
   75,
   76,
   77,
   78,
   84,
   104,
   105,
   106,
   109,
   110,
   111,
   113,
   114,
   115,
   116,
   117,
   118,
   120,
   121,
   124,
   125,
   126,
   129,
   130,
   131,
   132], 'users_idx': [119,
   222,
   385,
   512,
   276,
   268,
   257,
   210,
   198,
   244,
   104,
   113,
   121,
   50,


In [38]:
next_id = 0

In [39]:
%%time
connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                                  password = settings.DATABASES["default"]["PASSWORD"],
                                  host = settings.DATABASES["default"]["HOST"],
                                  port = settings.DATABASES["default"]["PORT"],
                                  database = settings.DATABASES["default"]["NAME"])
connection.autocommit=True
cursor = connection.cursor()

for item in args[next_id:]:
    run(*item)

CPU times: user 1d 2h 18min 21s, sys: 1h 48min 25s, total: 1d 4h 6min 47s
Wall time: 1h 27min 56s
