In [1]:
# Helpers
from collections import Counter, defaultdict
import numpy as np
from itertools import chain
import pickle

#DB
from django.db.models import Case, IntegerField, Value

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# DB 
import psycopg2
from django.conf import settings

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


In [2]:
# Directory prefix of the pickled dataset runs opened later in this notebook
# (the run file names are appended to it with a "/" separator).
folder = "data/edm2020/"

### Get student data

In [3]:
# NOTE(review): Problem, UserProfile, UserLog, OuterRef, Count, Subquery and When
# are used below but never imported in this notebook; presumably the kernel
# (e.g. a Django shell_plus kernel) injects them -- confirm before running on a
# plain kernel.

# Filter used problems and solutions
last_id = 132
problems = Problem.objects.filter(id__lte=last_id)

# Profiles attached to the 'sem_professor' account; logs of these profiles are
# excluded when selecting the students below.
user_blacklist = UserProfile.objects.filter(professor__user__username='sem_professor')

# Correlated subquery: for a given log row, count the non-'S' logs of the same
# user that have an earlier timestamp (so the value is 0 for a user's first log).
attempts = UserLog.objects.exclude(outcome='S').filter(
    timestamp__lt=OuterRef('timestamp'), user__id=OuterRef('user__id')).annotate(
    attempt=Count('*')).values('attempt')

# Remove group by values
attempts.query.set_group_by()

# Get users: ids of non-blacklisted users that own a log row whose attempt
# counter equals 1. Only `user_id` is selected, so no score annotation is built
# here (the previous one was discarded by the final values_list anyway).
users = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').exclude(
    user__userprofile__in=user_blacklist).annotate(
    attempt=Subquery(attempts, output_field=IntegerField())
).order_by("timestamp").filter(attempt=1).values_list('user_id', flat=True)

# Rows of (user id, problem id, attempt counter, score) for the selected users,
# ordered by timestamp. Score is 0 for outcome 'F' and 1 for outcome 'P'.
data = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').annotate(
    attempt=Subquery(attempts, output_field=IntegerField())).annotate(
    score=Case(
        When(outcome='F', then=Value(0)),
        When(outcome='P', then=Value(1)),
        output_field=IntegerField())).values_list(
    "user__id", "problem_id", "attempt", "score", #"outcome", #"timestamp"
).order_by("timestamp").filter(user__in=users)

For each training student, build a train set from the attempts that come after the first 20% of that student's attempts, in order to evaluate the cold-start problem.

In [4]:
# Number of pickled dataset runs to process (run files are indexed 0..N_RUNS-1).
N_RUNS = 10
# Fraction of a student's attempts used as the cut-off when building the
# cold-start train set.
TRAIN_ATTEMPTS_TEST = 0.2

In [5]:
def get_user_idx(user_id):
    """Return the position of ``user_id`` inside the module-level ``users_idx``.

    ``users_idx`` is looked up at call time, so it must be assigned before this
    function is used. Whatever ``users_idx.index`` raises for an unknown id
    (``ValueError`` for a list) is propagated unchanged.
    """
    position = users_idx.index(user_id)
    return position


def get_question_idx(question_id):
    """Return the position of ``question_id`` inside the module-level ``questions_idx``.

    ``questions_idx`` is looked up at call time, so it must be assigned before
    this function is used. Whatever ``questions_idx.index`` raises for an
    unknown id (``ValueError`` for a list) is propagated unchanged.
    """
    position = questions_idx.index(question_id)
    return position


# Element-wise versions of the two lookups, usable on whole array columns.
map_user_id = np.vectorize(get_user_idx)
map_question_id = np.vectorize(get_question_idx)

In [6]:
%%time
# One entry per run: the cold-start train set (with users/questions remapped to
# positional indices), the id lists of the run and the attempt dimension size.
data_coldstart = []
for i in range(N_RUNS):
    # Open data
    # NOTE: pickle.load can execute arbitrary code -- only load run files that
    # were produced by this project.
    with open("%s/2020_06_08_run_%d.pkl" % (folder, i), "rb") as pklfile:
        dataset = pickle.load(pklfile)

    # map_user_id / map_question_id read these module-level lists at call time,
    # so they have to be refreshed for every run before the mapping below.
    users_idx = dataset['users_idx']
    questions_idx = dataset['questions_idx']

    # Remove test users and get max attempt
    test_users = {row[0] for row in dataset['test_set']}
    max_attempt = max([0] + [row[2] for row in dataset['test_set']])
    # Test rows reference users by position in users_idx, so the remaining
    # positions are the train users. Sorted for a reproducible row order.
    train_users = sorted(set(range(len(users_idx))) - test_users)

    # Create "train set" to test with students only after up to 20% of their attempts
    # In the same way the test users were chosen
    train_set = []
    for user in train_users:
        # Single round trip per student: fetch the rows once, then derive both
        # the row count and the kept rows from them.
        user_rows = list(data.filter(user_id=users_idx[user]).order_by('attempt'))
        train_attempts = int(np.ceil(TRAIN_ATTEMPTS_TEST * len(user_rows)))
        # Keep rows whose attempt counter (3rd column) is above the cut-off;
        # a NULL counter is skipped, as a SQL `attempt > n` filter would do.
        train_set.extend(
            entry for entry in user_rows
            if entry[2] is not None and entry[2] > train_attempts)

    # Mapping users and questions to be in 0-len index
    # reshape keeps the 4-column layout even when no row was collected.
    train_set = np.asarray(train_set).reshape(-1, 4)
    if len(train_set):
        train_set[:,0] = map_user_id(train_set[:,0])
        train_set[:,1] = map_question_id(train_set[:,1])
    data_coldstart.append({'train_set': train_set,
                 'users_idx': dataset['users_idx'],
                 'questions_idx': dataset['questions_idx'],
                 'attempts': max_attempt+1
                })

CPU times: user 12.4 s, sys: 328 ms, total: 12.7 s
Wall time: 7min 24s


In [7]:
# Sanity check: one entry is appended per processed run.
len(data_coldstart)

10

### Organizing data

In [8]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    """Write (student, question, attempt, outcome) rows into ``tensor`` and flag them in ``X``.

    Both arrays are modified in place and also returned.

    Parameters
    ----------
    data : iterable of 4-item rows ``(s_idx, q_idx, a_idx, outcome)``; the first
        three items are used directly as array indices.
    tensor : array that receives ``outcome`` at ``[s_idx, q_idx, a_idx]``.
    X : array with the same indexing that receives 1 at every written position.
    filter_attempt : when truthy, rows with ``a_idx >= filter_attempt`` are skipped.

    Returns
    -------
    (tensor, X) : the same two array objects that were passed in.
    """
    for s_idx, q_idx, a_idx, outcome in data:
        # Only write the row when no limit is set or the attempt is below it.
        keep_row = (not filter_attempt) or (not (a_idx >= filter_attempt))
        if keep_row:
            tensor[s_idx, q_idx, a_idx] = outcome
            X[s_idx, q_idx, a_idx] = 1

    # Every position never flagged in X is overwritten with None
    # (stored as NaN when the tensor has a float dtype).
    unobserved = np.where(X[:,:] == 0)
    tensor[unobserved] = None
    return tensor, X
    
def transform_data(data):
    """Build the cold-start performance tensor and its observation mask for one run.

    Parameters
    ----------
    data : dict with the keys ``'users_idx'`` and ``'questions_idx'`` (their
        lengths give the first two dimensions), ``'attempts'`` (size of the
        third dimension, also used as the attempt cut-off) and ``'train_set'``
        (rows passed to ``add_to_tensor``).

    Returns
    -------
    (student_performance_coldstart, X_coldstart) : two arrays of shape
        ``(n_students, n_questions, data['attempts'])`` as filled in by
        ``add_to_tensor``.
    """
    n_students = len(data['users_idx'])
    n_questions = len(data['questions_idx'])
    n_attempts = data['attempts']

    student_performance_coldstart = np.zeros((n_students, n_questions, n_attempts))
    X_coldstart = np.zeros(student_performance_coldstart.shape)

    # Rows whose attempt index does not fit in the third dimension are dropped
    # by add_to_tensor through filter_attempt.
    student_performance_coldstart, X_coldstart = add_to_tensor(
        data['train_set'],
        student_performance_coldstart,
        X_coldstart,
        filter_attempt=n_attempts,
    )

    return student_performance_coldstart, X_coldstart

In [9]:
# Open a raw psycopg2 connection using the credentials of Django's "default"
# database entry; each keyword is read from the matching settings key.
connection = psycopg2.connect(**{
    keyword: settings.DATABASES["default"][setting_key]
    for keyword, setting_key in (
        ("user", "USER"),
        ("password", "PASSWORD"),
        ("host", "HOST"),
        ("port", "PORT"),
        ("database", "NAME"),
    )
})
# Every statement is committed as soon as it is executed.
connection.autocommit = True
cursor = connection.cursor()

In [10]:
%%time
# Select the stored experiment rows to re-evaluate: method 'fdtf', 150 train
# attempts, 11 to 16 concepts, mu '0.1', every dataset except 'run_all',
# ordered by experiment id.
query = """select dataset, experiment_id, sp, x, sp_hat, sk_hat, q_matrix_hat, mu, del_questions
from EDM2020_2020_06_05 where method='fdtf' 
and attempts_train=150 and concepts >=11 and concepts <=16 and mu='0.1' and dataset <> 'run_all'
order by experiment_id"""
cursor.execute(query)
# Fetch the first row now; the evaluation loop further down starts from `row`
# and pulls the remaining rows from this same cursor.
row = cursor.fetchone()

CPU times: user 4.55 s, sys: 5.67 s, total: 10.2 s
Wall time: 3min 26s


In [32]:
#%%time
total = 0            # number of experiment rows processed
update_queries = []  # rendered UPDATE statements, executed in a later cell

# `row` holds the first result fetched right after the SELECT was executed;
# each iteration ends by pulling the next row from the same cursor.
while row:
    dataset, exp_id, sp, X, sp_hat, sk_hat, q_matrix_hat, mu, del_questions = row

    # The trailing "_<n>" of the dataset label is used as the position of the
    # run inside data_coldstart.
    dataset_id = int(dataset.split('_')[-1])

    sp_coldstart, X_coldstart = transform_data(data_coldstart[dataset_id])

    X = np.asarray(X)
    sp = np.asarray(sp)
    sp_hat = np.asarray(sp_hat)
    sk_hat = np.asarray(sk_hat)
    q_matrix_hat = np.asarray(q_matrix_hat)

    # Delete unused questions
    X_coldstart = np.delete(X_coldstart, del_questions, axis=1)
    sp_coldstart = np.delete(sp_coldstart, del_questions, axis=1)

    # More attempts than train attempts: allocate the predictions over the
    # cold-start attempt axis and copy the stored estimates for the attempts
    # that both tensors share.
    n_coldstart_attempts = sp_coldstart.shape[2]
    attempts_begin = min(n_coldstart_attempts, sp_hat.shape[2])

    ### CHOOSE ATTEMPT
    # Not seen attempts
    attempts = X_coldstart.shape[2]
    # First 20 attempts
    #attempts = attempts_begin

    sp_hat_coldstart = np.zeros((sp_hat.shape[0], sp_hat.shape[1], n_coldstart_attempts))
    sp_hat_coldstart[:, :, :attempts_begin] = sp_hat[:, :, :attempts_begin]
    sk_hat_coldstart = np.zeros((sk_hat.shape[0], sk_hat.shape[1], n_coldstart_attempts))
    sk_hat_coldstart[:, :, :attempts_begin] = sk_hat[:, :, :attempts_begin]

    # Calculate SK and SP for the next test attempts
    for attempt in range(1, attempts):
        # Students with at least one observed entry at this attempt; np.unique
        # avoids recomputing the same student when several entries are set.
        students = np.unique(np.where(X_coldstart[:, :, attempt] == 1)[0])
        for student in students:
            previous_sk = sk_hat_coldstart[student, :, attempt-1]
            # Each student's estimate is rolled forward from the previous
            # attempt, driven by the questions observed at this attempt.
            sk_hat_coldstart[student, :, attempt] = (2*previous_sk) + \
                                            2*(1-previous_sk)/(1+np.exp(
                                                -mu*np.dot(X_coldstart[student, :, attempt],
                                                           q_matrix_hat.T))) - 1
            sp_hat_coldstart[student, :, attempt] = np.dot(sk_hat_coldstart[student, :, attempt],
                                                           q_matrix_hat)

    # Get test predicted values over the first `attempts` attempts. The slice
    # bound is `attempts` itself, not the leaked inner-loop variable, so it is
    # well defined even when the attempt loop above does not run.
    observed = np.where(X_coldstart[:, :, :attempts] == 1)
    y = sp_coldstart[observed]
    y_pred = sp_hat_coldstart[observed]
    rmse = np.sqrt(np.mean(np.power(y - y_pred, 2)))
    total += 1

    update = {
#        "train_rmse_cs_att": rmse,
         "train_rmse_cs_all": rmse,
         "max_train_cs_att": attempts
    }

    # Write PSQL query
    update_list = [key + "= %s" for key in update]
    query_values = tuple(update.values()) + (exp_id,)
    # The leading space before "where" matters: without it the last SET value
    # is glued to the keyword (e.g. "150where"), which PostgreSQL 15+ rejects.
    update_query = ("UPDATE EDM2020_2020_06_05 SET "
                    + ", ".join(update_list)
                    + " where experiment_id = %s")
    query = cursor.mogrify(update_query, query_values)
    update_queries.append(query)

    print("Dataset %s" % dataset)
    print("Train: %d " % y.shape[0])
    print("Train all: %d" % np.where(X == 1)[0].shape[0])

    # fetchone() returns None once the result set is exhausted, which ends the
    # loop; a cursor without a result set raises ProgrammingError instead.
    try:
        row = cursor.fetchone()
    except psycopg2.ProgrammingError:
        row = False

Dataset 2020_06_08_run_0
Train: 1989 
Train all: 3017
Dataset 2020_06_08_run_0
Train: 1989 
Train all: 3017
Dataset 2020_06_08_run_0
Train: 1989 
Train all: 3017
Dataset 2020_06_08_run_0
Train: 1989 
Train all: 3017
Dataset 2020_06_08_run_0
Train: 1989 
Train all: 3017
Dataset 2020_06_08_run_1
Train: 1935 
Train all: 2900
Dataset 2020_06_08_run_1
Train: 1935 
Train all: 2900
Dataset 2020_06_08_run_1
Train: 1935 
Train all: 2900
Dataset 2020_06_08_run_1
Train: 1935 
Train all: 2900
Dataset 2020_06_08_run_1
Train: 1935 
Train all: 2900
Dataset 2020_06_08_run_2
Train: 2029 
Train all: 2994
Dataset 2020_06_08_run_2
Train: 2029 
Train all: 2994
Dataset 2020_06_08_run_2
Train: 2029 
Train all: 2994
Dataset 2020_06_08_run_2
Train: 2029 
Train all: 2994
Dataset 2020_06_08_run_2
Train: 2029 
Train all: 2994
Dataset 2020_06_08_run_3
Train: 1750 
Train all: 3023
Dataset 2020_06_08_run_3
Train: 1750 
Train all: 3023
Dataset 2020_06_08_run_3
Train: 1750 
Train all: 3023
Dataset 2020_06_08_run_3
Tra

In [33]:
# Number of experiment rows handled by the evaluation loop.
total

35

In [34]:
%%time
# Send the prepared UPDATE statements one by one; the connection is in
# autocommit mode, so each statement is committed as it is executed.
for query in update_queries:
    cursor.execute(query)

CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 4.55 s
