# Fitting BKT to students

Criar a seguinte estrutura de dados:
1. ID do aluno
2. ID do problema
3. Lista com soluções 1 a N
6. Lista com conceito mais predominante nas soluções 1 a N
5. Lista indicando se resolveu em 1 a N soluções


### Import libraries

In [1]:
#DB
from questions.models import UserLog, Problem
from retrieve_model_and_vectorizer import RetrieveModelAndVectorizer
import psycopg2
from spkit import bkt

# Helpers
import numpy as np
import pickle
import base64
from tqdm import tqdm_notebook as tqdm
import pandas as pd
from collections import defaultdict

### Recreate model and vectorizer

In [2]:
exp_id = 26
vectorizer, model = RetrieveModelAndVectorizer().get_model_and_vectorizer(exp_id)

Problems to be ignored: 644
Problems to be used: 132
Solutions to be used: 54
Got 54 documents


### Retrieve users' solutions and practiced skills

In [3]:
def transform(student, model_db, vectorize):
    """ Transform a new observation using the given vectorizer and model """
    train_data_features_student = vectorizer.transform(student).toarray()
    y_student = model_db.transform(train_data_features_student)
    return y_student

def get_skill(student_solution, vectorizer, model_db):
    """ Get argmax topic after transforming a new observation """
    y_student = transform(student_solution, model_db, vectorizer)
    top_skill = np.argmax(y_student, axis=1)
    return top_skill

In [4]:
folder = "data/comp1_2019_2"

In [5]:
%%time
# problems = Problem.objects.filter(chapter__isnull=False)
problems = Problem.objects.filter(chapter__pk=8)
# user_blacklist = UserProfile.objects.filter(professor__user__username='sem_professor')
user_blacklist = ['lmoraes', 'matos.gabriel26@gmail.com', 'angeloacrdaumas@outlook.com', 'giuliaelvira23@poli.ufrj.br',
                  'hugo', 'gracepassosfreitas@gmail.com', 'carla']
user_blacklist = UserProfile.objects.filter(user__username__in=user_blacklist)
logs_list = UserLog.objects.filter(problem__in=problems).exclude(outcome='S').exclude(user__userprofile__in=user_blacklist).order_by("timestamp")
solution_list = logs_list.values_list('solution', flat=True)
skills = get_skill(solution_list, vectorizer, model)

CPU times: user 596 ms, sys: 0 ns, total: 596 ms
Wall time: 1.73 s


In [6]:
logs_list.count()

876

### Put data in proper format

In [None]:
%%time
data = {}
for idx, log in enumerate(tqdm(logs_list)):
    # Data identification is a tuple containing student id and problem id
    student_id = log.user.pk
    problem_id = log.problem.pk
    data_id = (student_id, problem_id)
    
    # If there isn't anything concerning this id, add it to dict
    if not data_id in data.keys():
        data[data_id] = {
            "identifier": "%d-%d" % (student_id, problem_id),
            "student_id": student_id,
            "problem_id": problem_id,
            "solutions": [log.solution],
            "outcomes": [log.outcome],
            "skills": [skills[idx]]
        }
    # If basic information is already there, just complete with solution info
    else:
        data[data_id]["solutions"].append(log.solution)
        data[data_id]["outcomes"].append(log.outcome)
        data[data_id]["skills"].append(skills[idx])

HBox(children=(IntProgress(value=0, max=876), HTML(value='')))

In [None]:
df = pd.DataFrame(data.values())

In [None]:
df

In [None]:
with open("%s/data_chapter_8_2019_10_14.pkl" % folder, "wb") as pkl_file:
    pickle.dump(data, pkl_file)

In [None]:
# Calculate if student learned in the N prediction
def success(row, N):
    if "P" in row["outcomes"][:N]:
        row["solved_in_%d" % N] = 1 # For correct
    else:
        row["solved_in_%d" % N] = 0 # For incorrect
#     skill_str = [str(s) for s in set(row["skills"][:N])]
    row["skill_in_%d" % N] = list(set(row["skills"][:N]))
    return row

In [None]:
%%time
df = df.apply(success, args=(1,), axis=1).apply(success, args=(2,), axis=1).apply(success, args=(3,), axis=1)

In [None]:
with open("%s/data_df_chapter_8_2019_10_14.pkl" % folder, "wb") as pkl_file:
    pickle.dump(df, pkl_file)

In [None]:
df

Solved in 1 attempt

In [None]:
%%time
kcs = []
for row in df['skill_in_1'].iteritems():
    kcs.extend(row[1])
kcs = list(set(kcs))
steps = df['identifier'].unique().tolist()

def create_data(row):
    question_id = steps.index(row['identifier'])
    return question_id

df['question_id'] = df.apply(create_data, axis=1)

In [None]:
%%time
q_matrix = np.zeros((len(steps), len(kcs)))

for idx, item in df.iterrows():
    for kc in item['skill_in_1']:
        kc_col = kcs.index(kc)
        q_matrix[item['question_id'], kc_col] = 1
    
print(q_matrix.shape)

In [None]:
with open('%s/q_matrix_chapter_8_2019_10_14.pkl' % folder, 'wb') as pklfile:
    pickle.dump(q_matrix, pklfile)

In [None]:
%%time
# Data matrix
data = df[['solved_in_1', 'student_id', 'question_id']].values.tolist()

# Instantiate model
model = bkt.BKT()
model.fit(data, q_matrix)

In [39]:
%timeit
N = 5
SKILL = 5

df_train = pd.DataFrame()
df_test = pd.DataFrame()
outcome_states = {"P": 1, "F": 2}
count_student = defaultdict(int)

for idx, row in df.iterrows():
    # Only get problems with current skill
    if SKILL not in row["skills"]:
        continue
        
    # Count how many problems that student has done
    nP = count_student[row["student_id"]]
    
    # Get the first 3 problems that a student tries to solve
    if nP < N:
        count_student[row["student_id"]] += 1
        
        # Copy row of 1st attempt
        append_row = row.copy(deep=True)
        append_row["solved"] = row["solved_in_1"]
#         append_row["skill"] = row["skills"][0]
        append_row["skill"] = SKILL
        df_train = df_train.append(append_row)

#         # If 1st attempt was not successful, create row for 2nd attempt
#         if row["solved_in_1"] == 2 and len(row["outcomes"]) >= 2:
#             append_row = row.copy(deep=True)
#             append_row["solved"] = row["solved_in_2"]
# #             append_row["skill"] = row["skills"][1]
#             append_row["skill"] = SKILL
#             df_train = df_train.append(append_row)

#         # If 2nd attempt was not successful, create row for 3rd attempt
#         if row["solved_in_2"] == 2 and len(row["outcomes"]) >= 3:
#             append_row = row.copy(deep=True)
#             append_row["solved"] = row["solved_in_3"]
# #             append_row["skill"] = row["skills"][2]
#             append_row["skill"] = SKILL
#             df_train = df_train.append(append_row)

#         # If 3rd attempt was not successful, create row for last attempt
#         if row["solved_in_3"] == 2 and len(row["outcomes"]) >= 4:
#             append_row = row.copy(deep=True)
#             append_row["solved"] = outcome_states[row["outcomes"][-1]]
# #             append_row["skill"] = row["skills"][-1]
#             append_row["skill"] = SKILL
#             df_train = df_train.append(append_row)
            
    # Get student's next problem
    elif nP >= N:
        count_student[row["student_id"]] += 1
        df_test = df_test.append(row)

In [31]:
df_train

Unnamed: 0,outcomes,problem_id,skill,skill_in_1,skill_in_2,skill_in_3,skills,solutions,solved,solved_in_1,solved_in_2,solved_in_3,student_id
0,"[F, F, P]",118.0,5.0,5,5,5,"[5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,1.0,47.0
1,"[F, P]",118.0,5.0,5,5,5,"[5, 5]",[# Complete your function using this header\nd...,2.0,2.0,1.0,1.0,48.0
2,[P],118.0,5.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,1.0,51.0
3,[P],129.0,5.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,1.0,54.0
4,"[F, F, F, F, F, F, F, F]",118.0,5.0,5,5,5,"[5, 5, 5, 5, 5, 5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,50.0
6,[P],28.0,5.0,5,5,5,[5],[#Start your python function here\ndef max_of_...,1.0,1.0,1.0,1.0,56.0
11,"[F, F, F, F, P]",115.0,5.0,5,11~5,11~5,"[5, 11, 11, 11, 11]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,56.0
16,"[F, F, F, F, F, F, F, F, F, F, F, F, F, F]",116.0,5.0,5,5,9~5,"[5, 5, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,58.0
20,"[F, F, F]",76.0,5.0,5,5,5,"[5, 5, 5]",[#Start your python function here\ndef dec2bin...,2.0,2.0,2.0,2.0,55.0
22,"[F, F, F, F, F, F, F, P]",109.0,5.0,5,5,5,"[5, 5, 5, 5, 5, 5, 5, 5]",[# Complete your function using this header\ni...,2.0,2.0,2.0,2.0,54.0


In [43]:
student_train = df_train['student_id'].unique()
seq_list = []
for student in student_train:
    seq_df = df_train[df_train["student_id"] == student]
    seq_list.append(seq_df["solved"].tolist())
    
with open("train_skill_5.pkl", "wb") as pkl_file:
    pickle.dump(seq_list, pkl_file, 2)

In [42]:
student_test = df_test['student_id'].unique()
seq_list = []
for student in student_test:
    seq_train = df_train[df_train["student_id"] == student]["solved_in_1"].tolist()
    seq_test = df_test[df_test["student_id"] == student]["solved_in_1"].tolist()
    seq = seq_train + seq_test
    seq_list.append(seq)
    
with open("test_skill_5.pkl", "wb") as pkl_file:
    pickle.dump(seq_list, pkl_file, 2)

In [44]:
seq_list

[[2.0, 2.0],
 [2.0],
 [1.0, 1.0],
 [1.0, 2.0, 2.0],
 [2.0, 1.0, 2.0, 2.0],
 [1.0, 2.0, 1.0, 2.0, 2.0],
 [2.0],
 [2.0, 1.0, 1.0, 1.0, 2.0],
 [1.0, 2.0],
 [2.0],
 [1.0, 1.0, 2.0, 2.0, 1.0],
 [1.0],
 [1.0, 2.0],
 [2.0, 2.0, 2.0, 2.0, 1.0],
 [2.0],
 [2.0],
 [1.0],
 [1.0, 2.0],
 [1.0],
 [1.0],
 [2.0, 2.0],
 [1.0],
 [2.0, 2.0],
 [2.0, 1.0],
 [2.0],
 [2.0, 2.0, 2.0, 2.0, 1.0],
 [1.0],
 [2.0, 2.0, 1.0, 1.0],
 [1.0],
 [2.0],
 [2.0, 2.0, 2.0],
 [2.0, 2.0, 1.0, 1.0, 2.0],
 [2.0, 2.0, 1.0, 2.0],
 [1.0, 2.0],
 [1.0],
 [1.0],
 [1.0, 2.0, 1.0, 2.0],
 [2.0, 2.0],
 [2.0, 2.0, 1.0, 1.0, 2.0],
 [2.0, 1.0],
 [1.0, 2.0],
 [2.0, 1.0],
 [2.0, 2.0, 2.0],
 [1.0, 2.0],
 [2.0, 2.0],
 [2.0],
 [2.0, 1.0],
 [2.0],
 [2.0],
 [1.0, 1.0, 1.0],
 [2.0, 2.0],
 [2.0],
 [1.0, 1.0, 2.0],
 [2.0, 2.0],
 [2.0],
 [2.0, 1.0, 1.0, 1.0, 1.0],
 [2.0, 2.0, 1.0, 2.0, 1.0],
 [1.0],
 [2.0, 1.0],
 [1.0, 2.0],
 [2.0, 2.0],
 [2.0],
 [1.0, 2.0],
 [2.0],
 [1.0],
 [1.0],
 [1.0],
 [2.0],
 [1.0],
 [1.0, 2.0],
 [1.0],
 [1.0, 2.0],
 [1.0],
 [1.0,

In [45]:
filename = "train.csv"
df_train[["solved", "student_id", "problem_id", "skill"]].to_csv(filename, sep='\t', header=False, index=False)

filename = "test.csv"
df_test[["solved_in_1", "student_id", "problem_id", "skill_in_1"]].to_csv(filename, sep='\t', header=False, index=False)

In [48]:
print(df_train.groupby("solved").count()["student_id"]/df_train.groupby("solved").count()["student_id"].sum())
print(df_test.groupby("solved_in_1").count()["student_id"]/df_test.groupby("solved_in_1").count()["student_id"].sum())

solved
1.0    0.418972
2.0    0.581028
Name: student_id, dtype: float64
solved_in_1
1.0    0.565217
2.0    0.434783
Name: student_id, dtype: float64
