In [1]:
# Helpers
import pickle
import numpy as np
import time

# DB 
import psycopg2
from django.conf import settings

# Learning
from fdtf import feedback_driven_tensor_factorization

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# Directory that holds the pickled per-run datasets read when the grid is built.
folder = "data/lak2020"
# Number of dataset runs; one file "2020_08_11_run_<i>.pkl" is read for each i in range(N_RUNS).
N_RUNS = 10

### Organizing data

In [3]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    """Write observed outcomes into ``tensor`` and flag them in the mask ``X``.

    Both arrays are modified in place and are also returned.

    Parameters
    ----------
    data : iterable
        Records that each unpack to ``(s_idx, q_idx, a_idx, outcome)``.
    tensor : numpy.ndarray
        Array indexed as ``tensor[s_idx, q_idx, a_idx]`` that receives the outcomes.
    X : numpy.ndarray
        Array indexed the same way; set to 1 wherever an outcome was written.
    filter_attempt : int or False, optional
        When truthy, records whose ``a_idx`` is greater than or equal to this
        value are skipped. A falsy value (the default) disables the filter.

    Returns
    -------
    tuple
        ``(tensor, X)``; every position where ``X`` is 0 holds NaN in ``tensor``.
    """
    for s_idx, q_idx, a_idx, outcome in data:
        # Keep the record unless a cap is set and the attempt index reaches it.
        within_limit = (not filter_attempt) or a_idx < filter_attempt
        if within_limit:
            tensor[s_idx, q_idx, a_idx] = outcome
            X[s_idx, q_idx, a_idx] = 1

    # Positions that never received an outcome are blanked out (None is stored as NaN).
    unobserved = np.where(X == 0)
    tensor[unobserved] = None
    return tensor, X
    
def transform_data(data, n_attempts):
    """Build the train and test performance tensors, with masks, for one dataset.

    Parameters
    ----------
    data : dict
        Must provide ``'users_idx'`` and ``'questions_idx'`` (their lengths give
        the first two tensor dimensions), plus ``'train_set'`` and ``'test_set'``.
        ``'test_set'`` is indexed as ``[:, 2]`` to read the attempt indices.
    n_attempts : int
        Depth of the training tensor; training records with an attempt index
        at or above this value are dropped.

    Returns
    -------
    tuple
        ``(student_performance, X, student_performance_test, X_test)``.
    """
    n_students = len(data['users_idx'])
    n_questions = len(data['questions_idx'])

    # Training tensor: depth capped at n_attempts, later attempts filtered out.
    train_shape = (n_students, n_questions, n_attempts)
    student_performance, X = add_to_tensor(
        data['train_set'],
        np.zeros(train_shape),
        np.zeros(train_shape),
        filter_attempt=n_attempts,
    )

    # Test tensor: deep enough to hold the largest attempt index in the test set.
    test_depth = np.max(data['test_set'][:, 2]) + 1
    test_shape = (n_students, n_questions, test_depth)
    student_performance_test, X_test = add_to_tensor(
        data['test_set'],
        np.zeros(test_shape),
        np.zeros(test_shape),
    )
    return student_performance, X, student_performance_test, X_test

### Grid search over the hyperparameters (`mu`, number of concepts, number of training attempts)

In [4]:
# %%time
# Hyperparameter grid explored by the search.
# mu: values passed as `mu=` to feedback_driven_tensor_factorization. The entries
# produced by np.arange are NumPy floats rather than plain Python floats.
mu = [0.01] + list(np.arange(0.05, 0.51, 0.05))
# Candidate numbers of concepts: 2 through 25 inclusive.
n_concepts = range(2, 26)
# Candidate caps on the number of training attempts: 10, 20, 30, 40, 50.
attempts = range(10, 51, 10)

In [5]:
# Total number of grid configurations: dataset runs x attempt caps x mu values x
# concept counts. The leading 1 presumably stands for the single restart - TODO confirm.
search = 1 * N_RUNS * len(attempts) * len(mu) * len(n_concepts)
search

13200

In [6]:
# Rough wall-clock estimate for the whole grid, in hours. 184 is presumably the
# observed number of seconds per configuration - TODO confirm where it came from.
hours = (184*search)/(60*60)
# The displayed value is that estimate converted from hours to days.
hours/24

28.11111111111111

In [7]:
def run(*args):
    """Fit FDTF for one grid configuration and insert the result into PostgreSQL.

    Expects exactly ten positional arguments, in this order:
    ``dataset, data, att, student_performance, X, student_performance_test,
    X_test, del_questions, m, concept``.

    ``student_performance`` is factorized with
    ``feedback_driven_tensor_factorization`` using ``n_concepts=concept``,
    ``mu=m``, ``init=10`` and ``max_iter=1000``. The inputs, the fitted outputs,
    the elapsed seconds and ``data['q_matrix']`` with the ``del_questions``
    columns removed are written as one row to table ``LAK2020_2020_08_11``.
    Column names are the keys of the ``row`` dict below.

    Requires a module-level ``cursor`` (an open psycopg2 cursor) to exist
    before the call. Returns ``None``.

    Raises
    ------
    ValueError
        If the number of positional arguments is not ten (from the unpacking).
    """
    (dataset, data, att, student_performance, X,
     student_performance_test, X_test, del_questions, m, concept) = args

    def to_native(value):
        # psycopg2 only adapts built-in Python types reliably: arrays become
        # nested lists, and NumPy scalars (e.g. mu values coming from np.arange)
        # become plain int/float. Without this, np.int64/np.float32 cannot be
        # adapted and, with NumPy >= 2, np.float64 is rendered via its repr.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        return value

    # Run FDTF and time the fit.
    start = time.time()
    (best_student_performance_pred, best_student_knowledge,
     best_q_matrix, best_error) = feedback_driven_tensor_factorization(
        student_performance, n_concepts=concept, init=10, max_iter=1000, mu=m)
    end = time.time()

    # Drop the same question columns from the reference Q-matrix that the
    # caller removed from the tensors.
    q_matrix = np.delete(data['q_matrix'], del_questions, axis=1)

    row = {
        "dataset": "2020_08_11_run_%d" % dataset,
        "q_matrix": q_matrix,
        "X": X,
        "sp": student_performance,
        "X_test": X_test,
        "sp_test": student_performance_test,
        "sp_hat": best_student_performance_pred,
        "sk_hat": best_student_knowledge,
        "q_matrix_hat": best_q_matrix,
        "mu": m,
        "concepts": concept,
        "attempts_train": att,
        "method": "fdtf",
        "seconds": end-start,
        "del_questions": del_questions,
        "reconstruction_error": best_error
    }

    # Build a parameterized INSERT; columns and values share one iteration order.
    columns = list(row)
    query_values = tuple(to_native(row[col]) for col in columns)
    placeholders = ", ".join(["%s"] * len(columns))
    insert_query = ("INSERT INTO LAK2020_2020_08_11 "
                    + "(" + ", ".join(columns) + ") VALUES "
                    + "(" + placeholders + ")")

    # Let the driver bind the values; no need to pre-render with mogrify().
    cursor.execute(insert_query, query_values)

In [8]:
# Build one argument list per grid configuration; each entry is later unpacked by run().
args = []
# Single pass: the loop variable `restart` is not used in the body.
for restart in range(1):
    for dataset in range(N_RUNS):
        # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
        with open("%s/2020_08_11_run_%d.pkl" % (folder, dataset), "rb") as pklfile:
            data = pickle.load(pklfile)
        for att in attempts:
            # Get train and test data in FDTF format
            student_performance, X, student_performance_test, X_test = transform_data(data, att)
            # Check if questions have minimum number of attempts
            # (mask summed over the first and last axes gives one count per question).
            attempts_per_question = X.sum(axis=0).sum(axis=1)
            # Questions with fewer than 3 recorded training attempts are removed
            # from all four arrays below.
            del_questions = [idx for idx,value in enumerate(attempts_per_question) if value < 3]
            X = np.delete(X, del_questions, axis=1)
            student_performance = np.delete(student_performance, del_questions, axis=1)
            X_test = np.delete(X_test, del_questions, axis=1)
            student_performance_test = np.delete(student_performance_test, del_questions, axis=1)
            
            for m in mu:
                for concept in n_concepts:                
                    # Element order must match the unpacking at the top of run().
                    # Entries hold references to `data` and the arrays, not copies.
                    args.append([dataset, 
                                data, 
                                att, 
                                student_performance, 
                                X, 
                                student_performance_test, 
                                X_test, 
                                del_questions,
                                m,
                                concept])

In [9]:
# Number of queued configurations; the displayed value matches `search` computed earlier.
len(args)

13200

In [12]:
# Inspect the configuration at index 1166 (the same index assigned to next_id below).
# NOTE(review): this echoes the whole dataset dict and arrays, producing a very large output.
args[1166]

[0, {'train_set': array([[119,  38,   0,   0],
         [119,  38,   1,   0],
         [ 10,  38,   0,   0],
         ...,
         [173,  38,   1,   0],
         [173,  38,   2,   0],
         [173,  38,   3,   0]]), 'test_set': array([[138,  39,   4,   0],
         [138,  39,   5,   0],
         [138,  39,   6,   0],
         ...,
         [173,  38,  11,   0],
         [173,  38,  12,   0],
         [173,  38,  13,   1]]), 'concept_idx': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 11, 12, 13, 14]), 'questions_idx': [3,
   4,
   5,
   6,
   7,
   11,
   12,
   13,
   14,
   15,
   20,
   28,
   38,
   40,
   41,
   49,
   62,
   63,
   64,
   68,
   69,
   74,
   75,
   76,
   77,
   78,
   84,
   104,
   105,
   106,
   109,
   110,
   111,
   113,
   114,
   115,
   116,
   117,
   118,
   120,
   121,
   124,
   125,
   126,
   129,
   130,
   131,
   132], 'users_idx': [385,
   229,
   276,
   74,
   92,
   399,
   213,
   402,
   250,
   407,
   48,
   208,
   162,
   71,
   512,

In [13]:
# Index into `args` at which the insertion loop starts. Presumably the entries
# before it were already processed in an earlier session - TODO confirm.
next_id = 1166

In [None]:
%%time
connection = psycopg2.connect(user = settings.DATABASES["default"]["USER"],
                                  password = settings.DATABASES["default"]["PASSWORD"],
                                  host = settings.DATABASES["default"]["HOST"],
                                  port = settings.DATABASES["default"]["PORT"],
                                  database = settings.DATABASES["default"]["NAME"])
connection.autocommit=True
cursor = connection.cursor()

for item in args[next_id:]:
    run(*item)