In [1]:
import pickle
import json
import numpy as np
from collections import defaultdict
import warnings

#DB
from django.db.models import Case, IntegerField, Value

In [2]:
folder = "data/relations/"

### Dataset 1: Students x Questions (Avg(attempts))
Students from the random/EER experiment with 48 different problems

#### Escolhas de modelagem
- Skip não está contando nem como tentativa
- Número de tentativas em uma questão para tirar a média: todas, 1, 2, 3?

In [3]:
# Filter used problems and solutions
last_id = 132
problems = Problem.objects.filter(id__lte=last_id)
solutions = Solution.objects.filter(problem__in=problems, ignore=False).values_list('problem', flat=True)
problems = solutions.order_by('problem').values_list(
    'problem', flat=True).distinct('problem')
print("Total number of questions: %d" % problems.count())
questions_idx = list(problems.order_by('problem_id'))
N_QUESTIONS = problems.count()

Total number of questions: 48


In [4]:
# Get all students
user_blacklist = UserProfile.objects.filter(professor__user__username='sem_professor')

attempts = UserLog.objects.exclude(outcome='S').filter(
    timestamp__lt=OuterRef('timestamp'), user__id=OuterRef('user__id')).annotate(
    attempt=Count('*')).values('attempt')

# Remove group by values
attempts.query.set_group_by()

# Get users
users = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').exclude(
    user__userprofile__in=user_blacklist).annotate(
    attempt=Subquery(attempts, output_field=IntegerField())).annotate(
    score=Case(
        When(outcome='F', then=Value(0.1)),
        When(outcome='P', then=Value(1)),
        output_field=IntegerField())).values_list(
    "user__id", "problem_id", "attempt", "score"#"outcome", "timestamp"
).order_by("timestamp").filter(attempt=1).values_list('user_id', flat=True)

data = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').annotate(
    attempt=Subquery(attempts, output_field=IntegerField())).annotate(
    score=Case(
        When(outcome='F', then=Value(0)),
        When(outcome='P', then=Value(1)),
        output_field=IntegerField())).values_list(
    "user__id", "problem_id", "attempt", "score", #"outcome", #"timestamp"
).order_by("timestamp").filter(user__in=users)

users_idx = data.values('user__id').distinct().order_by()
N_STUDENTS = users_idx.count()
users_idx = list(users_idx.values_list('user__id', flat=True))
data.count()

3632

In [5]:
def get_user_idx(user_id):
    return users_idx.index(user_id)

def get_question_idx(question_id):
    return questions_idx.index(question_id)

def separate_users_train_and_test(users_idx):
    np.random.shuffle(users_idx)
    idx = int(np.ceil(TRAIN_PROPORTION*len(users)))
    train_users = users_idx[:idx]
    test_users = users_idx[idx:]
    return train_users, test_users

map_user_id = np.vectorize(get_user_idx)    
map_question_id = np.vectorize(get_question_idx)

In [6]:
data = np.array(data)
data[:,0] = map_user_id(data[:,0])
data[:,1] = map_question_id(data[:,1])

In [7]:
dataset = {}
dataset['data'] = np.array(data)
dataset['users_idx'] = np.array(users_idx)
dataset['questions_idx'] = np.array(questions_idx)

In [10]:
def add_to_matrix(data, tensor, X, attempt_agg='avg', filter_attempt=False):
    attempts_count = defaultdict(int)
    for item in data:
        s_idx, q_idx, a_idx, outcome = item
        
        # If attempt filter is provided
        if filter_attempt and attempts_count[(s_idx, q_idx)] >= filter_attempt:
            continue
        else:
            attempts_count[(s_idx, q_idx)] += 1
        
        tensor[s_idx, q_idx, a_idx] = outcome
        X[s_idx, q_idx, a_idx] = 1
    tensor[np.where(X[:,:] == 0)] = None

    # # Average over all attempts for one question
    if attempt_agg == 'avg':
        # Used to ignore NaN warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            matrix = np.nanmean(tensor, axis=2)
            # 0.1 means the student at least tried opposing to 0, which is the he/she did not attempt the question
            matrix[np.where(matrix < 0.1)] = 0.1
        matrix[np.where(np.isnan(matrix) == True)] = 0
    return X, tensor, matrix

def transform_data(data, n_attempts):
    max_attempt = np.max(data[:,2])+1
    student_performance_tensor = np.zeros((N_STUDENTS, N_QUESTIONS, max_attempt))
    X = np.zeros(student_performance_tensor.shape)

    X, t, student_performance = add_to_matrix(data, 
                                        student_performance_tensor, 
                                        X, filter_attempt=n_attempts)
    
    return X, t, student_performance

#### Utilizando todas as tentativas

In [11]:
# Modifique esse valor para calcular a média entre as n primeiras tentativas
n_attempts = False

In [12]:
X, t, student_performance = transform_data(dataset['data'], n_attempts)
# Check if questions have minimum number of attempts
attempts_per_question = student_performance.sum(axis=0)           
del_questions = [idx for idx,value in enumerate(attempts_per_question) if value == 0]
#             print(del_questions)
student_performance = np.delete(student_performance, del_questions, axis=1)
student_performance.shape

(197, 48)

In [13]:
dataset['student_performance'] = student_performance
sparsity = 1 - np.where(student_performance > 0)[0].shape/np.prod(student_performance.shape)
sparsity

array([0.90778342])

In [15]:
student_performance[:20,:]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.33333333, 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.25      , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.1       , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.33333333, 1.        , 0.33333333, 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
   

### Dataset 2: Terms x Question
Terms from each solution for the experiment with 48 different problems

In [16]:
# Load data
with open('data/tese/train_data_features.pkl', 'rb') as pklfile:
    train_data_features = pickle.load(pklfile)
    
# Load questions ids
with open('data/tese/questions_idx.pkl', 'rb') as pklfile:
    terms_questions_idx = pickle.load(pklfile)

In [17]:
not_dup =[]
dup_idx = []
for idx, item in enumerate(terms_questions_idx):
    if item not in not_dup:
        not_dup.append(item)
    else:
        dup_idx.append(idx)

In [18]:
terms = np.delete(train_data_features, [dup_idx], axis=0)
terms.shape

(48, 236)

Make sure questions_idx is the same for both

In [25]:
a = np.array(terms_questions_idx)
np.all(np.delete(a, [dup_idx]) == questions_idx)

True

In [36]:
dataset['terms'] = terms.T
sparsity = 1 - np.where(terms > 0)[0].shape/np.prod(terms.shape)
sparsity

array([0.84860523])

In [33]:
# Save data
with open("%s/dataset_relations.pkl" % (folder), "wb") as pklfile:
    pickle.dump(dataset, pklfile)

In [34]:
# Load data
with open("%s/dataset_relations.pkl" % (folder), "rb") as pklfile:
    d = pickle.load(pklfile)

In [35]:
d['terms'].shape

(236, 48)