In [21]:
# Helpers
import pickle
import numpy as np
import time

# DB 
import psycopg2
from django.conf import settings

# Learning
from fdtf import feedback_driven_tensor_factorization

In [23]:
# Directory that holds the pickled dataset; "all_data.pkl" is opened from here
# when the list of runs is built.
folder = "data/edm2020"
# N_RUNS = 5  (disabled; the run-building loop currently uses range(1))

### Organizing data

In [30]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    """Write observed outcomes into ``tensor`` and flag them in the mask ``X``.

    Both arrays are modified in place and are also returned.

    Parameters
    ----------
    data : iterable of 4-item records
        Each record unpacks to ``(s_idx, q_idx, a_idx, outcome)``; the first
        three values are used as an index into ``tensor`` and ``X``.
    tensor : numpy.ndarray
        Receives ``outcome`` at ``[s_idx, q_idx, a_idx]``.
    X : numpy.ndarray
        Set to 1 at every position that was written.
    filter_attempt : int or False, optional
        When truthy, records with ``a_idx >= filter_attempt`` are skipped.
        A falsy value (``False`` or ``0``) disables the filter.

    Returns
    -------
    tuple
        ``(tensor, X)`` -- the same objects that were passed in.
    """
    for s_idx, q_idx, a_idx, outcome in data:
        # Keep the record unless a cutoff is set and the attempt reaches it.
        if not filter_attempt or a_idx < filter_attempt:
            position = (s_idx, q_idx, a_idx)
            tensor[position] = outcome
            X[position] = 1

    # Every cell the mask still reports as empty is blanked out with None;
    # in a float array NumPy stores that as NaN.
    unobserved = np.nonzero(X[:, :] == 0)
    tensor[unobserved] = None
    return tensor, X
    
def transform_data(data, n_attempts):
    """Build the training performance tensor and its observation mask.

    Parameters
    ----------
    data : mapping
        Must provide ``'users_idx'`` and ``'questions_idx'`` (their lengths
        give the first two tensor dimensions) and ``'train_set'`` (the
        records forwarded to ``add_to_tensor``).
    n_attempts : int
        Size of the third tensor dimension; also forwarded to
        ``add_to_tensor`` as ``filter_attempt``.

    Returns
    -------
    tuple
        ``(student_performance, X)`` as returned by ``add_to_tensor``.
    """
    shape = (len(data['users_idx']), len(data['questions_idx']), n_attempts)
    student_performance = np.zeros(shape)
    X = np.zeros(shape)

    # NOTE: data['test_set'] is not turned into a tensor here; only the
    # training tensor and its mask are produced.
    return add_to_tensor(data['train_set'],
                         student_performance,
                         X,
                         filter_attempt=n_attempts)

### Get best parameters

In [46]:
# %%time
# Hyper-parameter grid: every (attempts, mu, n_concepts) combination becomes
# one entry in the list of runs.
# Wider grid kept for reference: mu = [0.1, 0.5, 1, 3, 10]
mu = [0.1]  # values forwarded to FDTF as `mu`
n_concepts = range(5, 10)  # candidate numbers of concepts: 5, 6, 7, 8, 9
# Wider grid kept for reference: attempts = [10, 20, 50, 100, 150, 200]
attempts = [50]  # attempt cutoffs passed to transform_data as n_attempts

In [32]:
# Number of combinations in the grid (attempts x mu x n_concepts).
# NOTE(review): the leading `1 * 1` factors look like placeholders for extra
# grid dimensions (e.g. restarts / datasets) -- confirm.
search = 1 * 1 * len(attempts) * len(mu) * len(n_concepts)
search

5

In [33]:
# Rough estimate of the total runtime of the grid, in hours.
# NOTE(review): 1000 is presumably an assumed number of seconds per run
# (the result is divided by 60*60) -- confirm.
hours = (1000*search)/(60*60)
hours

1.3888888888888888

In [39]:
def run(*args):
    """Fit FDTF for one configuration and insert the result row into PostgreSQL.

    ``args`` must hold exactly eight positional values, in this order:

    dataset
        Label; stored in the ``dataset`` column as ``"run_<dataset>"``.
    data
        Mapping that provides ``'q_matrix'``.
    att
        Stored unchanged in the ``attempts_train`` column.
    student_performance
        Tensor handed to ``feedback_driven_tensor_factorization``.
    X
        Stored unchanged in the ``X`` column.
    del_questions
        Indices removed along axis 1 of ``data['q_matrix']`` before it is
        compared with the estimated Q-matrix.
    m
        Forwarded to FDTF as ``mu``.
    concept
        Forwarded to FDTF as ``n_concepts``.

    Returns ``None``. Database errors raised by psycopg2 propagate to the
    caller; the connection is closed in every case.
    """
    dataset, data, att, student_performance, X, del_questions, m, concept = args

    # Run FDTF and time it.
    # (For a quick smoke test, use init=1 and max_iter=3 instead.)
    start = time.time()
    (best_student_performance_pred, best_student_knowledge,
     best_q_matrix, best_error) = feedback_driven_tensor_factorization(
        student_performance, n_concepts=concept, init=10, max_iter=1000, mu=m)
    end = time.time()

    # The Q-matrix error (root of the summed squared differences) is only
    # computed for concept == 14; every other setting stores the sentinel -1.
    # NOTE(review): 14 is presumably the number of concepts in
    # data['q_matrix'], i.e. the only case where shapes match -- confirm.
    if concept == 14:
        q_matrix = np.delete(data['q_matrix'], del_questions, axis=1)
        q_matrix_error = np.sqrt(np.sum(np.power(q_matrix - best_q_matrix, 2)))
    else:
        q_matrix = data['q_matrix']
        q_matrix_error = -1

    row = {
        "dataset": "run_%s" % dataset,
        "q_matrix": q_matrix,
        "X": X,
        "sp": student_performance,
        "X_test": [0],
        "sp_test": [0],
        "sp_hat": best_student_performance_pred,
        "sk_hat": best_student_knowledge,
        "q_matrix_hat": best_q_matrix,
        "mu": m,
        "concepts": concept,
        "attempts_train": att,
        "method": "fdtf",
        "q_matrix_error": q_matrix_error,
        "reconstruction_error": best_error,
        "train_error": 0,
        "test_error": 0,
        "seconds": end - start
    }

    # Build the parameterised INSERT. Column names come from the literal keys
    # above; values are passed separately so psycopg2 does the quoting.
    columns = list(row)
    query_values = tuple(
        value.tolist() if isinstance(value, np.ndarray) else value
        for value in row.values()
    )
    insert_query = "INSERT INTO EDM2020_2020_02_19 ({}) VALUES ({})".format(
        ", ".join(columns), ", ".join(["%s"] * len(columns)))

    # Connect to DB
    db = settings.DATABASES["default"]
    connection = psycopg2.connect(user=db["USER"],
                                  password=db["PASSWORD"],
                                  host=db["HOST"],
                                  port=db["PORT"],
                                  database=db["NAME"])
    try:
        connection.autocommit = True
        # The cursor context manager closes the cursor on exit.
        with connection.cursor() as cursor:
            cursor.execute(insert_query, query_values)
    finally:
        # Close the connection even when the insert fails, so repeated calls
        # do not accumulate open connections.
        connection.close()

In [40]:
# Build the list of argument lists consumed by run(), one per grid combination.
args = []
for restart in range(1):
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open("%s/all_data.pkl" % (folder), "rb") as pklfile:
        data = pickle.load(pklfile)

    for att in attempts:
        # Training tensor and observation mask in FDTF format.
        student_performance, X = transform_data(data, att)

        # Questions with fewer than 3 recorded attempts are removed from the
        # performance tensor (axis 1).
        attempts_per_question = X.sum(axis=(0, 2))
        del_questions = np.flatnonzero(attempts_per_question < 3).tolist()
        student_performance = np.delete(student_performance, del_questions, axis=1)
        # NOTE(review): X is passed on without this removal, so it keeps the
        # original question axis while student_performance does not --
        # confirm this is intended.

        for m in mu:
            for concept in n_concepts:
                args.append(['all', data, att, student_performance,
                             X, del_questions, m, concept])

In [44]:
# Position in `args` at which execution starts: the run loop iterates over
# args[next_id:], so a larger value skips the earlier entries.
next_id = 0

In [45]:
# %%time
# Execute every queued configuration from position next_id onward; each call
# fits FDTF and inserts one row into the database.
for item in args[next_id:]:
    run(*item)

OperationalError: SSL SYSCALL error: EOF detected
