# Fitting BKT to students

Criar a seguinte estrutura de dados:
1. ID do aluno
2. ID do problema
3. Lista com soluções 1 a N
6. Lista com conceito mais predominante nas soluções 1 a N
5. Lista indicando se resolveu em 1 a N soluções


### Import libraries

In [2]:
#DB
from questions.models import UserLog
from retrieve_model_and_vectorizer import RetrieveModelAndVectorizer
import psycopg2

# Helpers
import numpy as np
import pickle
import base64
from tqdm import tqdm_notebook as tqdm
import pandas as pd

### Recreate model and vectorizer

In [2]:
exp_id = 26
vectorizer, model = RetrieveModelAndVectorizer().get_model_and_vectorizer(exp_id)

Problems to be ignored: 591
Problems to be used: 132
Solutions to be used: 54
Got 54 documents




### Retrieve users' solutions and practiced skills

In [7]:
def transform(student, model_db, vectorize):
    """ Transform a new observation using the given vectorizer and model """
    train_data_features_student = vectorizer.transform(student).toarray()
    y_student = model_db.transform(train_data_features_student)
    return y_student

def get_skill(student_solution, vectorizer, model_db):
    """ Get argmax topic after transforming a new observation """
    y_student = transform(student_solution, model_db, vectorizer)
    top_skill = np.argmax(y_student, axis=1)
    return top_skill

In [8]:
user_blacklist = UserProfile.objects.filter(professor='')
logs_list = UserLog.objects.exclude(outcome='S').exclude(user__userprofile__in=user_blacklist).order_by("timestamp")
solution_list = logs_list.values_list('solution', flat=True)
skills = get_skill(solution_list, vectorizer, model)

### Put data in proper format

In [9]:
%timeit
data = {}
for idx, log in enumerate(tqdm(logs_list)):
    # Data identification is a tuple containing student id and problem id
    student_id = log.user.pk
    problem_id = log.problem.pk
    data_id = (student_id, problem_id)
    
    # If there isn't anything concerning this id, add it to dict
    if not data_id in data.keys():
        data[data_id] = {
            "student_id": student_id,
            "problem_id": problem_id,
            "solutions": [log.solution],
            "outcomes": [log.outcome],
            "skills": [skills[idx]]
        }
    # If basic information is already there, just complete with solution info
    else:
        data[data_id]["solutions"].append(log.solution)
        data[data_id]["outcomes"].append(log.outcome)
        data[data_id]["skills"].append(skills[idx])

HBox(children=(IntProgress(value=0, max=2746), HTML(value='')))




In [10]:
df = pd.DataFrame(data.values())

In [11]:
# Calculate if student learned in the N prediction
def success(row, N):
    if "P" in row["outcomes"][:N]:
        row["solved_in_%d" % N] = 1 # For correct
    else:
        row["solved_in_%d" % N] = 2 # For incorrect
    skill_str = [str(s) for s in set(row["skills"][:N])]
    row["skill_in_%d" % N] = "~".join(skill_str)
    return row

In [12]:
df = df.apply(success, args=(1,), axis=1).apply(success, args=(2,), axis=1).apply(success, args=(3,), axis=1)

In [3]:
with open("transaction_data.pkl", "rb") as pkl_file:
    df = pickle.load(pkl_file)

In [4]:
df

Unnamed: 0,outcomes,problem_id,skills,solutions,student_id,solved_in_1,skill_in_1,solved_in_2,skill_in_2,solved_in_3,skill_in_3
0,"[F, F, P]",118,"[5, 5, 5]",[# Complete your function using this header\nd...,47,2,5,2,5,1,5
1,"[F, P]",118,"[5, 5]",[# Complete your function using this header\nd...,48,2,5,1,5,1,5
2,[P],118,[5],[# Complete your function using this header\nd...,51,1,5,1,5,1,5
3,[P],129,[5],[# Complete your function using this header\nd...,54,1,5,1,5,1,5
4,"[F, F, F, F, F, F, F, F]",118,"[5, 5, 5, 5, 5, 5, 5, 5]",[# Complete your function using this header\nd...,50,2,5,2,5,2,5
5,"[F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, ...",120,"[11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...",[# Complete your function using this header\nd...,51,2,11,2,11,2,11
6,[P],28,[5],[#Start your python function here\ndef max_of_...,56,1,5,1,5,1,5
7,"[F, F, P]",120,"[9, 11, 11]",[# Complete your function using this header\nd...,47,2,9,2,9~11,1,9~11
8,"[F, F, F, F, F, F, F]",120,"[11, 11, 11, 11, 9, 9, 9]",[# Complete your function using this header\nd...,50,2,11,2,11,2,11
9,"[F, F, F, F, F, F, F, F, F, P]",11,"[7, 7, 7, 7, 11, 11, 11, 11, 11, 11]",[# Complete your function using this header\nd...,56,2,7,2,7,2,7


In [5]:
from collections import defaultdict

In [6]:
%timeit
N = 3
SKILL = 5

df_train = pd.DataFrame()
df_test = pd.DataFrame()
outcome_states = {"P": 1, "F": 2}
count_student = defaultdict(int)

for idx, row in df.iterrows():
    # Only get problems with current skill
    if SKILL not in row["skills"]:
        continue
        
    # Count how many problems that student has done
    nP = count_student[row["student_id"]]
    
    # Get the first 3 problems that a student tries to solve
    if nP < 3:
        count_student[row["student_id"]] += 1
        
        # Copy row of 1st attempt
        append_row = row.copy(deep=True)
        append_row["solved"] = row["solved_in_1"]
#         append_row["skill"] = row["skills"][0]
        append_row["skill"] = SKILL
        df_train = df_train.append(append_row)

        # If 1st attempt was not successful, create row for 2nd attempt
        if row["solved_in_1"] == 2 and len(row["outcomes"]) >= 2:
            append_row = row.copy(deep=True)
            append_row["solved"] = row["solved_in_2"]
#             append_row["skill"] = row["skills"][1]
            append_row["skill"] = SKILL
            df_train = df_train.append(append_row)

        # If 2nd attempt was not successful, create row for 3rd attempt
        if row["solved_in_2"] == 2 and len(row["outcomes"]) >= 3:
            append_row = row.copy(deep=True)
            append_row["solved"] = row["solved_in_3"]
#             append_row["skill"] = row["skills"][2]
            append_row["skill"] = SKILL
            df_train = df_train.append(append_row)

        # If 3rd attempt was not successful, create row for last attempt
        if row["solved_in_3"] == 2 and len(row["outcomes"]) >= 4:
            append_row = row.copy(deep=True)
            append_row["solved"] = outcome_states[row["outcomes"][-1]]
#             append_row["skill"] = row["skills"][-1]
            append_row["skill"] = SKILL
            df_train = df_train.append(append_row)
            
    # Get student's next problem
    elif nP == 3:
        count_student[row["student_id"]] += 1
        df_test = df_test.append(row)

In [7]:
df_train

Unnamed: 0,outcomes,problem_id,skill,skill_in_1,skill_in_2,skill_in_3,skills,solutions,solved,solved_in_1,solved_in_2,solved_in_3,student_id
0,"[F, F, P]",118.0,5.0,5,5,5,"[5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,1.0,47.0
0,"[F, F, P]",118.0,5.0,5,5,5,"[5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,1.0,47.0
0,"[F, F, P]",118.0,5.0,5,5,5,"[5, 5, 5]",[# Complete your function using this header\nd...,1.0,2.0,2.0,1.0,47.0
1,"[F, P]",118.0,5.0,5,5,5,"[5, 5]",[# Complete your function using this header\nd...,2.0,2.0,1.0,1.0,48.0
1,"[F, P]",118.0,5.0,5,5,5,"[5, 5]",[# Complete your function using this header\nd...,1.0,2.0,1.0,1.0,48.0
2,[P],118.0,5.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,1.0,51.0
3,[P],129.0,5.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,1.0,54.0
4,"[F, F, F, F, F, F, F, F]",118.0,5.0,5,5,5,"[5, 5, 5, 5, 5, 5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,50.0
4,"[F, F, F, F, F, F, F, F]",118.0,5.0,5,5,5,"[5, 5, 5, 5, 5, 5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,50.0
4,"[F, F, F, F, F, F, F, F]",118.0,5.0,5,5,5,"[5, 5, 5, 5, 5, 5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,50.0


In [11]:
student_train = df_train['student_id'].unique()
seq_list = []
for student in student_train:
    seq_df = df_train[df_train["student_id"] == student]
    seq_list.append(seq_df["solved"].tolist())
    
with open("train_skill_5.pkl", "wb") as pkl_file:
    pickle.dump(seq_list, pkl_file, 2)

In [12]:
student_test = df_test['student_id'].unique()
seq_list = []
for student in student_test:
    seq_train = df_train[df_train["student_id"] == student]["solved"].tolist()
    seq_test = df_test[df_test["student_id"] == student]["solved_in_1"].tolist()
    seq = seq_train + seq_test
    seq_list.append(seq)
    
with open("test_skill_5.pkl", "wb") as pkl_file:
    pickle.dump(seq_list, pkl_file, 2)

In [10]:
seq_list

[[1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0],
 [2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0],
 [1.0, 1.0, 2.0, 2.0, 1.0, 2.0],
 [2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
 [2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0],
 [1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0],
 [2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0],
 [2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0],
 [2.0, 2.0, 2.0, 1.0, 1.0, 1.0],
 [2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 2.0],
 [2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0],
 [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0],
 [2.0, 2.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 1.0, 2.0, 2.0],
 [1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0],
 [2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0],
 [1.0, 2.0, 2.0, 2.0, 1.0, 1.0, 2.0]]

In [34]:
df_train[df_train["student_id"] == 56]

Unnamed: 0,outcomes,problem_id,skill,skill_in_1,skill_in_2,skill_in_3,skills,solutions,solved,solved_in_1,solved_in_2,solved_in_3,student_id
6,[P],28.0,5.0,5,5,5,[5],[#Start your python function here\ndef max_of_...,1.0,1.0,1.0,1.0,56.0
11,"[F, F, F, F, P]",115.0,5.0,5,11~5,11~5,"[5, 11, 11, 11, 11]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,56.0
11,"[F, F, F, F, P]",115.0,5.0,5,11~5,11~5,"[5, 11, 11, 11, 11]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,56.0
11,"[F, F, F, F, P]",115.0,5.0,5,11~5,11~5,"[5, 11, 11, 11, 11]",[# Complete your function using this header\nd...,2.0,2.0,2.0,2.0,56.0
11,"[F, F, F, F, P]",115.0,5.0,5,11~5,11~5,"[5, 11, 11, 11, 11]",[# Complete your function using this header\nd...,1.0,2.0,2.0,2.0,56.0
30,[P],126.0,5.0,5,5,5,[5],[# Complete your function using this header\n#...,1.0,1.0,1.0,1.0,56.0


In [117]:
df_test

Unnamed: 0,outcomes,problem_id,skill_in_1,skill_in_2,skill_in_3,skills,solutions,solved_in_1,solved_in_2,solved_in_3,student_id
42,"[P, F]",41.0,5,5,5,"[5, 5]",[# Complete your function using this header\nd...,1.0,1.0,1.0,56.0
48,[F],7.0,5,5,5,[5],[# Complete your function using this header\nd...,2.0,2.0,2.0,50.0
61,"[F, F, P]",76.0,5,5,11~5,"[5, 5, 11]",[# Complete your function using this header\nd...,2.0,2.0,1.0,60.0
123,"[F, F, F, F, F]",117.0,5,5,5,"[5, 5, 5, 5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,92.0
192,"[F, F]",49.0,5,5,5,"[5, 5]",[# Complete your function using this header\nd...,2.0,2.0,2.0,102.0
216,"[F, P]",131.0,5,5,5,"[5, 5]",[# Complete your function using this header\nd...,2.0,1.0,1.0,99.0
219,[P],131.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,112.0
226,[P],130.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,121.0
235,[P],41.0,5,5,5,[5],[# Complete your function using this header\nd...,1.0,1.0,1.0,55.0
248,"[F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, P]",117.0,5,5,5,"[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 9]",[# Complete your function using this header\nd...,2.0,2.0,2.0,64.0


In [138]:
filename = "train.csv"
df_train[["solved", "student_id", "problem_id", "skill"]].to_csv(filename, sep='\t', header=False, index=False)

filename = "test.csv"
df_test[["solved_in_1", "student_id", "problem_id", "skill_in_1"]].to_csv(filename, sep='\t', header=False, index=False)

In [91]:
allowed_skills = [5, 11, 9, 7]
for s in allowed_skills:
    filename = "data_last_attempt_skill_%d.csv" % s
    data_last_attempt_skill = df_train[df_train["skill"] == s][["solved", "student_id", "problem_id", "skill"]]
    data_last_attempt_skill.to_csv(filename, sep='\t', header=False, index=False)
    print("Data for skill %d" % s)
    print(data_last_attempt_skill.groupby("solved").count()["student_id"]/data_last_attempt_skill.groupby("solved").count()["student_id"].sum())

Data for skill 5
solved
1.0    0.375
2.0    0.625
Name: student_id, dtype: float64
Data for skill 11
solved
1.0    0.324324
2.0    0.675676
Name: student_id, dtype: float64
Data for skill 9
solved
1.0    0.39823
2.0    0.60177
Name: student_id, dtype: float64
Data for skill 7
solved
1.0    0.379475
2.0    0.620525
Name: student_id, dtype: float64


In [16]:
def viterbi(obs, states, start_p, trans_p, emit_p):

    V = [{}]

    for st in states:

        V[0][st] = {"prob": start_p[st] * emit_p[st][obs[0]], "prev": None}

    # Run Viterbi when t > 0

    for t in range(1, len(obs)):

        V.append({})

        for st in states:

            max_tr_prob = V[t-1][states[0]]["prob"]*trans_p[states[0]][st]

            prev_st_selected = states[0]

            for prev_st in states[1:]:

                tr_prob = V[t-1][prev_st]["prob"]*trans_p[prev_st][st]

                if tr_prob > max_tr_prob:

                    max_tr_prob = tr_prob

                    prev_st_selected = prev_st

                    

            max_prob = max_tr_prob * emit_p[st][obs[t]]

            V[t][st] = {"prob": max_prob, "prev": prev_st_selected}

                    

    for line in dptable(V):

        print (line)

    opt = []

    # The highest probability

    max_prob = max(value["prob"] for value in V[-1].values())

    previous = None

    # Get most probable state and its backtrack

    for st, data in V[-1].items():

        if data["prob"] == max_prob:

            opt.append(st)

            previous = st

            break

    # Follow the backtrack till the first observation

    for t in range(len(V) - 2, -1, -1):

        opt.insert(0, V[t + 1][previous]["prev"])

        previous = V[t + 1][previous]["prev"]


    print ('The steps of states are ' + ' '.join(opt) + ' with highest probability of %s' % max_prob)


def dptable(V):

    # Print a table of steps from dictionary

    yield " ".join(("%12d" % i) for i in range(len(V)))

    for state in V[0]:

        yield "%.7s: " % state + " ".join("%.7s" % ("%f" % v[state]["prob"]) for v in V)


In [19]:
obs = (1,2,2,2,1,1,2)
states = ('L', '~L')
start_p = {'L': 0.23, '~L': 0.77}
trans_p = {
   'L' : {'L': 1, '~L': 0},
   '~L' : {'L': 0.14, '~L': 0.86}
   }
emit_p = {
   'L' : {1: 0.42, 2: 0.58},
   '~L' : {1: 0.31, 2: 0.69}
   }

In [20]:
viterbi(obs,
        states,
        start_p,
        trans_p,
        emit_p)

           0            1            2            3            4            5            6
L: 0.09660 0.05602 0.03249 0.01884 0.00791 0.00332 0.00192
~L: 0.23870 0.14164 0.08405 0.04987 0.01329 0.00354 0.00210
The steps of states are ~L ~L ~L ~L ~L ~L ~L with highest probability of 0.002103598619271065
