In [1]:
# Helpers
from collections import Counter, defaultdict
import numpy as np
from itertools import chain
import pickle

#DB
from django.db.models import Case, IntegerField, Value

# Plot
import matplotlib.pyplot as plt
import seaborn as sns

# DB 
import psycopg2
from django.conf import settings

In [2]:
# Directory holding the pickled run files (run_<i>.pkl) that are loaded later in this notebook
folder = "data/edm2020/"

### Get student data

In [3]:
# NOTE(review): Problem, UserProfile, UserLog, OuterRef, Subquery, Count and When are
# not imported in the visible import cell — presumably injected by the notebook
# environment (e.g. a Django shell_plus kernel); confirm before a fresh run.

# Restrict the analysis to problems whose id is at most last_id
last_id = 132
problems = Problem.objects.filter(id__lte=last_id)

# Profiles linked to the professor with username 'sem_professor';
# log entries of these users are excluded when selecting `users` below
user_blacklist = UserProfile.objects.filter(professor__user__username='sem_professor')

# Correlated subquery: for each outer UserLog row, count the same user's log
# entries (outcome other than 'S') with an earlier timestamp.
# The count is not restricted to `problems`.
# NOTE(review): presumably this makes `attempt` a 0-based index — confirm
attempts = UserLog.objects.exclude(outcome='S').filter(
    timestamp__lt=OuterRef('timestamp'), user__id=OuterRef('user__id')).annotate(
    attempt=Count('*')).values('attempt')

# Remove group by values
attempts.query.set_group_by()

# Ids of users that have a log entry (selected problems, outcome other than 'S',
# not blacklisted) whose `attempt` annotation equals 1.
# Only user_id is selected in the end, so the `score` annotation is not returned.
# NOTE(review): Value(0.1) is declared with an IntegerField output here, while
# `data` below uses Value(0) — looks like a leftover; confirm
users = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').exclude(
    user__userprofile__in=user_blacklist).annotate(
    attempt=Subquery(attempts, output_field=IntegerField())).annotate(
    score=Case(
        When(outcome='F', then=Value(0.1)),
        When(outcome='P', then=Value(1)),
        output_field=IntegerField())).values_list(
    "user__id", "problem_id", "attempt", "score"#"outcome", "timestamp"
).order_by("timestamp").filter(attempt=1).values_list('user_id', flat=True)

# Rows (user id, problem id, attempt, score) for the selected users, ordered by
# timestamp; score is 0 for outcome 'F' and 1 for outcome 'P' (the Case has no default)
data = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').annotate(
    attempt=Subquery(attempts, output_field=IntegerField())).annotate(
    score=Case(
        When(outcome='F', then=Value(0)),
        When(outcome='P', then=Value(1)),
        output_field=IntegerField())).values_list(
    "user__id", "problem_id", "attempt", "score", #"outcome", #"timestamp"
).order_by("timestamp").filter(user__in=users)

Create a train set from the attempts each student made after the first 20% of their attempts, in order to evaluate the cold-start problem.

In [4]:
# Number of pickled runs to process (files run_0.pkl ... run_<N_RUNS - 1>.pkl)
N_RUNS = 5
# NOTE(review): not referenced in the visible cells — presumably the proportion of
# users used for training when the runs were created; confirm
TRAIN_PROPORTION = 0.8
# Fraction of a user's attempts used to compute the attempt cut-off of the cold-start train set
TRAIN_ATTEMPTS_TEST = 0.2

In [5]:
def get_user_idx(user_id):
    """Return the position of ``user_id`` inside ``users_idx``.

    ``users_idx`` is looked up in the notebook namespace at call time, so it
    has to be assigned before this function is used. The lookup relies on
    ``.index``, which raises ``ValueError`` on lists/tuples when the id is absent.
    """
    position = users_idx.index(user_id)
    return position

def get_question_idx(question_id):
    """Return the position of ``question_id`` inside ``questions_idx``.

    ``questions_idx`` is looked up in the notebook namespace at call time, so
    it has to be assigned before this function is used. The lookup relies on
    ``.index``, which raises ``ValueError`` on lists/tuples when the id is absent.
    """
    position = questions_idx.index(question_id)
    return position

# Element-wise wrappers around get_user_idx / get_question_idx so that a whole
# column of ids can be converted to positions in one call
map_user_id = np.vectorize(get_user_idx)    
map_question_id = np.vectorize(get_question_idx)

In [None]:
# Cold-start datasets, one dict per run, appended in run order
data_coldstart = []
for i in range(N_RUNS):
    # Load the pickled dataset of this run
    # NOTE(review): pickle.load can execute arbitrary code; only read trusted files
    with open("%s/run_%d.pkl" % (folder, i), "rb") as pklfile:
        dataset = pickle.load(pklfile)

    # Users listed in the first column of the test set are removed from the
    # pool of user positions 0 .. len(users_idx) - 1
    test_users = [row[0] for row in dataset['test_set']]
    train_users = set(range(len(dataset['users_idx']))) - set(test_users)

    # Create the "train set" used to test with students only after up to 20% of
    # their attempts, in the same way the test users were chosen: for every
    # remaining user keep the rows whose attempt value is greater than
    # ceil(TRAIN_ATTEMPTS_TEST * number of rows of that user), ordered by attempt
    train_set = []
    for user in train_users:
        user_attempts = data.filter(user_id=dataset['users_idx'][user])
        n_user_attempts = user_attempts.count()
        train_attempts = int(np.ceil(TRAIN_ATTEMPTS_TEST * n_user_attempts))
        kept_attempts = user_attempts.filter(attempt__gt=train_attempts).order_by('attempt')
        train_set.extend(list(kept_attempts))

    # get_user_idx / get_question_idx read these two names from the notebook
    # namespace, so they must be bound before the id columns are mapped
    users_idx = dataset['users_idx']
    questions_idx = dataset['questions_idx']

    # Replace raw user ids (column 0) and question ids (column 1) by their positions
    train_set = np.asarray(train_set)
    train_set[:, 0] = map_user_id(train_set[:, 0])
    train_set[:, 1] = map_question_id(train_set[:, 1])

    data_coldstart.append({
        'train_set': train_set,
        'users_idx': users_idx,
        'questions_idx': questions_idx,
    })

In [None]:
# Sanity check: one entry is appended to data_coldstart per processed run
len(data_coldstart)

### Organizing data

In [None]:
def add_to_tensor(data, tensor, X, filter_attempt=False):
    """Write observed outcomes into ``tensor`` and mark them in ``X``.

    Both arrays are modified in place and also returned.

    Args:
        data: iterable of 4-item rows ``(s_idx, q_idx, a_idx, outcome)``.
        tensor: array indexed as ``[s_idx, q_idx, a_idx]`` receiving the outcomes.
        X: array indexed the same way; written cells are set to 1.
        filter_attempt: when truthy, rows with ``a_idx >= filter_attempt`` are
            skipped; the default ``False`` disables the filter.

    Returns:
        The tuple ``(tensor, X)``.
    """
    for s_idx, q_idx, a_idx, outcome in data:
        # Rows at or beyond the attempt limit are ignored when a limit is given
        over_limit = filter_attempt and a_idx >= filter_attempt
        if not over_limit:
            tensor[s_idx, q_idx, a_idx] = outcome
            X[s_idx, q_idx, a_idx] = 1

    # Every cell that X still marks with 0 is set to None
    # (NumPy stores this as NaN when tensor is a float array)
    unobserved = np.where(X[:, :] == 0)
    tensor[unobserved] = None
    return tensor, X
    
def transform_data(data):
    """Build the performance tensor and its observation mask for one dataset.

    Args:
        data: mapping with the keys ``'users_idx'``, ``'questions_idx'`` and
            ``'train_set'``. ``train_set`` is indexed as a 2-D array whose
            column 2 is used as the attempt index.

    Returns:
        A tuple ``(student_performance, X)`` of arrays with shape
        ``(len(users_idx), len(questions_idx), max attempt + 1)``, filled by
        ``add_to_tensor`` from ``train_set``.
    """
    n_students = len(data['users_idx'])
    n_questions = len(data['questions_idx'])
    rows = data['train_set']

    # +1 so that the largest attempt index found in column 2 is addressable
    n_attempts = np.max(rows[:, 2]) + 1
    tensor_shape = (n_students, n_questions, n_attempts)

    performance = np.zeros(tensor_shape)
    observed = np.zeros(tensor_shape)

    # No attempt filter is applied: every row of the train set is written
    performance, observed = add_to_tensor(rows, performance, observed)
    return performance, observed

In [None]:
# Open a direct psycopg2 connection with the credentials of Django's "default" database
db_config = settings.DATABASES["default"]
connection = psycopg2.connect(
    user=db_config["USER"],
    password=db_config["PASSWORD"],
    host=db_config["HOST"],
    port=db_config["PORT"],
    database=db_config["NAME"],
)
# With autocommit enabled each executed statement is committed immediately
connection.autocommit = True
cursor = connection.cursor()

In [None]:
%%time
# Select the stored arrays of a single experiment; the experiment id (86677)
# and the table name are hard-coded in the statement
query = """select dataset, experiment_id, sp, x, sp_hat, sk_hat, q_matrix_hat, mu 
from EDM2020_2020_02_19 where experiment_id = 86677"""
cursor.execute(query)
# First result row (None when the query matched nothing); further rows are
# fetched one at a time by the processing loop
row = cursor.fetchone()

In [None]:
# NOTE(review): total is initialised here but never updated inside the loop
total = 0
update_queries = []

# Process the result set one row at a time until fetchone() yields a falsy value
while row:
    dataset, exp_id, sp, X, sp_hat, sk_hat, q_matrix_hat, mu = row
    
    # The second '_'-separated token of the dataset name is used as the index
    # into data_coldstart
    dataset_id = int(dataset.split('_')[1])
        
    sp_coldstart, X_coldstart = transform_data(data_coldstart[dataset_id])
    
    X = np.asarray(X)
    # NOTE(review): this rebinds `attempts`, which earlier in the notebook names
    # the attempts subquery queryset
    attempts = X.shape[2]
    
    # For every attempt index from 1 on, count the cells equal to 1 in the stored
    # mask X and in the cold-start mask
    # NOTE(review): both lists are computed but not used afterwards in this cell
    students_train_len = []
    students_train_cs_len = []
    for attempt in range(1, attempts):
        # np.where(...)[0] holds one axis-0 index per matching cell
        # NOTE(review): X_coldstart's last axis is sized from its own train set;
        # if it is shorter than X's, this indexing raises IndexError — confirm
        students_train = np.where(X[:,:, attempt] == 1)[0]
        students_train_cs = np.where(X_coldstart[:,:, attempt] == 1)[0]
        students_train_len.append(len(students_train))
        students_train_cs_len.append(len(students_train_cs))
    
    # NOTE(review): placeholder — a set holding one empty string, never read
    update = {
        ''
    }
    
    # NOTE(review): `query` still refers to the SELECT statement at this point,
    # so no UPDATE statement is ever queued — the update logic looks unfinished
    update_queries.append(query)
        
    # Advance to the next row; a ProgrammingError from fetchone() ends the loop
    try:
        row = cursor.fetchone()
    except psycopg2.ProgrammingError:
        row = False

In [None]:
# NOTE(review): total is set to 0 in the processing cell and never incremented
# in the visible code, so this displays 0
total

In [None]:
%%time
# Execute every queued statement on the shared cursor
# NOTE(review): the processing loop appends `query` (the SELECT statement) to
# update_queries, so this currently re-runs that SELECT once per processed row
for query in update_queries:
    cursor.execute(query)