In [1]:
# Helpers
import pickle
import numpy as np
import time

# DB 
import psycopg2
from django.conf import settings

# Learning
from fdtf import feedback_driven_tensor_factorization

In [2]:
# Directory containing the pickled dataset runs; files are opened as "<folder>/run_<k>.pkl".
folder = "data/edm2020"
# NOTE(review): presumably the number of dataset runs available; it is not
# referenced in the visible cells -- confirm where it is used.
N_RUNS = 5

In [5]:
# Attempt caps to sweep over: each value is passed to transform_data as
# n_attempts and also identifies the row to update (attempts_train).
attempts = [10, 20, 50, 100, 150, 200]

In [9]:
# Reuse the credentials of Django's "default" database rather than hard-coding them.
db_config = settings.DATABASES["default"]
connection = psycopg2.connect(
    user=db_config["USER"],
    password=db_config["PASSWORD"],
    host=db_config["HOST"],
    port=db_config["PORT"],
    database=db_config["NAME"],
)
# With autocommit on, each statement run through `cursor` is committed
# immediately; there is no explicit commit/rollback step later.
connection.autocommit = True
cursor = connection.cursor()

### Organizing data

In [6]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    """Write observed outcomes into ``tensor`` and flag them in ``X``.

    Both arrays are modified in place and also returned.

    Parameters
    ----------
    data : iterable of 4-item records
        Each record unpacks as (student index, question index,
        attempt index, outcome).
    tensor : numpy.ndarray
        3-D array indexed as [student, question, attempt] that receives
        the outcomes.
    X : numpy.ndarray
        Array indexed the same way; observed cells are set to 1.
    filter_attempt : int or False, optional
        When truthy, records whose attempt index is greater than or equal
        to this value are skipped. The default (False) keeps every record.

    Returns
    -------
    tuple
        ``(tensor, X)`` -- the same objects that were passed in.
    """
    for student, question, attempt, result in data:
        # Keep the record unless a cap is set and the attempt reaches it.
        within_limit = not (filter_attempt and attempt >= filter_attempt)
        if within_limit:
            tensor[student, question, attempt] = result
            X[student, question, attempt] = 1

    # Every cell never flagged in X is blanked out; for a float array NumPy
    # stores the assigned None as NaN.
    unobserved = np.nonzero(X == 0)
    tensor[unobserved] = None
    return tensor, X
    
def transform_data(data, n_attempts):
    """Build train and test outcome tensors (plus observation masks) from ``data``.

    Parameters
    ----------
    data : mapping
        Must provide ``'users_idx'`` and ``'questions_idx'`` (only their
        lengths are used), ``'train_set'`` and ``'test_set'`` (iterables of
        (student, question, attempt, outcome) records). ``'test_set'`` must
        support 2-D indexing (``[:, 2]``), e.g. a NumPy array.
    n_attempts : int
        Size of the attempt axis of the training tensors; training records
        with an attempt index >= ``n_attempts`` are dropped.

    Returns
    -------
    tuple
        ``(train_tensor, train_mask, test_tensor, test_mask)``. The test
        tensors are sized from the largest attempt index in the test set,
        so their attempt axis may differ from the training one.
    """
    n_students = len(data['users_idx'])
    n_questions = len(data['questions_idx'])

    # Training tensors: attempt axis capped at n_attempts.
    train_shape = (n_students, n_questions, n_attempts)
    train_tensor, train_mask = add_to_tensor(
        data['train_set'],
        np.zeros(train_shape),
        np.zeros(train_shape),
        filter_attempt=n_attempts,
    )

    # Test tensors: no cap, so the attempt axis must fit the largest index seen.
    test_depth = np.max(data['test_set'][:, 2]) + 1
    test_shape = (n_students, n_questions, test_depth)
    test_tensor, test_mask = add_to_tensor(
        data['test_set'],
        np.zeros(test_shape),
        np.zeros(test_shape),
    )

    return train_tensor, train_mask, test_tensor, test_mask

In [22]:
# Questions with fewer than this many observed training attempts are dropped
# from both the train and the test tensors.
MIN_ATTEMPTS_PER_QUESTION = 3

update_queries = []
total = 0
# NOTE(review): only run_0 is processed here (range(1)) -- confirm this is
# intentional and not a leftover from debugging.
for dataset in range(1):
    # SECURITY: pickle.load can execute arbitrary code from the file; only
    # load run files that come from a trusted source.
    with open("%s/run_%d.pkl" % (folder, dataset), "rb") as pklfile:
        data = pickle.load(pklfile)

    # The file is closed once loaded; everything below only needs `data`.
    for att in attempts:
        # Get train and test data in FDTF format
        student_performance, X, student_performance_test, X_test = transform_data(data, att)

        # Check if questions have minimum number of attempts: X is indexed
        # [student, question, attempt], so summing over axes 0 and 2 counts
        # the observed training cells per question.
        attempts_per_question = X.sum(axis=0).sum(axis=1)
        del_questions = np.flatnonzero(
            attempts_per_question < MIN_ATTEMPTS_PER_QUESTION
        ).tolist()
        X = np.delete(X, del_questions, axis=1)
        student_performance = np.delete(student_performance, del_questions, axis=1)
        X_test = np.delete(X_test, del_questions, axis=1)
        student_performance_test = np.delete(student_performance_test, del_questions, axis=1)

        update = {
            "x_test": X_test,
            "sp_test": student_performance_test
        }

        # Write PSQL query. Column names come from the fixed dict above;
        # every value is bound as a query parameter.
        update_list = []
        query_values = []
        for key, value in update.items():
            update_list.append(key + "= %s")
            if isinstance(value, np.ndarray):
                # Convert to nested Python lists so psycopg2 can adapt them.
                query_values.append(value.tolist())
            else:
                query_values.append(value)
        # The leading space before "where" keeps the clause separated from the
        # last placeholder no matter what the bound value renders as.
        update_query = (
            "UPDATE EDM2020_2020_02_19 SET "
            + ", ".join(update_list)
            + " where dataset = %s and attempts_train = %s"
        )
        query_values.append('run_%d' % dataset)
        # NOTE(review): the attempt count is bound as a string -- confirm this
        # matches the type of the attempts_train column.
        query_values.append('%d' % att)
        query_values = tuple(query_values)
        # mogrify returns the fully bound statement without executing it.
        query = cursor.mogrify(update_query, query_values)
        update_queries.append(query)
        total += 1

In [23]:
# Sanity check: number of UPDATE statements prepared by the previous cell.
len(update_queries)

1

In [24]:
# Send each prepared UPDATE statement to the database, in the order it was built.
for statement in update_queries:
    cursor.execute(statement)