In [14]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle

### Reading KDD train file
Drop rows with null in any of the important columns

In [15]:
folder = 'data/kdd/2005-2006'

In [16]:
%%time
df_all = pd.read_csv('%s/algebra_2005_2006_train.txt' % folder, delimiter='\t')
print(df_all.shape)

(809694, 19)
CPU times: user 4.47 s, sys: 156 ms, total: 4.62 s
Wall time: 4.62 s


In [17]:
df_all

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,1,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,3(x+2) = 15,2005-09-09 12:24:35.0,2005-09-09 12:24:49.0,2005-09-09 12:25:15.0,2005-09-09 12:25:15.0,40.0,,40.0,0,2,3,1,[SkillRule: Eliminate Parens; {CLT nested; CLT...,1
1,2,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,x+2 = 5,2005-09-09 12:25:15.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,16.0,16.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",1~~1
2,3,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,2-8y = -4,2005-09-09 12:25:36.0,2005-09-09 12:25:43.0,2005-09-09 12:26:12.0,2005-09-09 12:26:12.0,36.0,,36.0,0,2,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",2
3,4,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,-8y = -6,2005-09-09 12:26:12.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,22.0,22.0,,1,0,0,1,"[SkillRule: Remove coefficient; {ax+b=c, divid...",1~~1
4,5,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,2,-7y-5 = -4,2005-09-09 12:26:38.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,118.0,118.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",3~~1
5,6,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,2,-7y = 1,2005-09-09 12:28:36.0,2005-09-09 12:28:43.0,2005-09-09 12:28:51.0,2005-09-09 12:28:51.0,15.0,,15.0,0,1,0,1,"[SkillRule: Remove coefficient; {ax+b=c, divid...",2~~2
6,7,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,3,7y+4 = 7,2005-09-09 12:28:57.0,2005-09-09 12:29:09.0,2005-09-09 12:29:09.0,2005-09-09 12:29:09.0,12.0,12.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",4
7,8,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,3,7y = 3,2005-09-09 12:29:09.0,2005-09-09 12:29:14.0,2005-09-09 12:29:14.0,2005-09-09 12:29:14.0,5.0,5.0,,1,0,0,1,[SkillRule: Remove positive coefficient; {ax/b...,1~~3
8,9,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,4,-5+9y = -6,2005-09-09 12:29:19.0,2005-09-09 12:29:31.0,2005-09-09 12:29:31.0,2005-09-09 12:29:31.0,12.0,12.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",5~~2
9,10,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,4,9y = -1,2005-09-09 12:29:31.0,2005-09-09 12:29:36.0,2005-09-09 12:29:36.0,2005-09-09 12:29:36.0,5.0,5.0,,1,0,0,1,[SkillRule: Remove positive coefficient; {ax/b...,2~~4


In [18]:
df = df_all[['Anon Student Id', 'Correct First Attempt', 'Problem Hierarchy', 'Problem Name', 'Problem View', 
             'Step Name', 'KC(Default)', 'First Transaction Time',
             # 'Correct Transaction Time'
            ]]

In [19]:
df = df.dropna()

In [20]:
df.shape

(607025, 8)

### Preprocessing
Transform dataframe in a format understood by the BKT module

In [21]:
skipped = []
def get_kcs(row):
    text = row['KC(Default)'].replace(',','')
    kc_list = text.split('~~')
    kc = []
    for item in kc_list:
        # Option 1: with SkillRule prefix
        found_opt1 = re.findall(r'^\[SkillRule: ([\w ]+[ ,/?-]*[\w ]*);.*', item, flags=re.M)
        # Option 2: regular words (hyphenized or spaced)
        found_opt2 = re.findall(r'^([\w -]+).*', item, flags=re.M)
        # Option 3: get what couldn't be extracted from previous patterns
        found_opt3 = re.findall(r'^\[SkillRule: (.*);.*', item, flags=re.M)
        if found_opt1:
            kc.append(found_opt1[0])
        elif found_opt2:
            kc.append(found_opt2[0])
        elif found_opt3:
            kc.append(found_opt3[0])
        else:
            skipped.append(item)
    return kc

def get_steps(row):
    step = row['Problem Hierarchy'] + '_' + row["Problem Name"] + '_' + row['Step Name']
    return step

Separate KCs in a list and create an identifier name for the step

In [22]:
%%time
df['KC'] = df.apply(get_kcs, axis=1)
df['step'] = df.apply(get_steps, axis=1)

CPU times: user 30.1 s, sys: 192 ms, total: 30.3 s
Wall time: 30.3 s


In [23]:
%%time
kcs = []
for row in df['KC'].iteritems():
    kcs.extend(row[1])
kcs = list(set(kcs))
steps = df['step'].unique().tolist()

def create_data(row):
    question_id = steps.index(row['step'])
    return question_id

df['question_id'] = df.apply(create_data, axis=1)

CPU times: user 8min 11s, sys: 312 ms, total: 8min 11s
Wall time: 8min 11s


Save KCs

In [24]:
with open('%s/kdd_2005_2006_kcs_time.pkl' % folder, 'wb') as pklfile:
    pickle.dump(kcs, pklfile)
len(kcs)

110

Construct Q-Matrix

In [25]:
%%time
q_matrix = np.zeros((len(steps), len(kcs)))

for idx, item in df.iterrows():
    for kc in item['KC']:
        kc_col = kcs.index(kc)
        q_matrix[item['question_id'], kc_col] = 1
    
print(q_matrix.shape)

(176630, 110)
CPU times: user 1min 44s, sys: 260 ms, total: 1min 44s
Wall time: 1min 44s


Save data

In [26]:
%%time
with open('%s/kdd_2005_2006_df_time.pkl' % folder, 'wb') as pklfile:
    pickle.dump(df, pklfile)
with open('%s/kdd_2005_2006_q_matrix_time.pkl' % folder, 'wb') as pklfile:
    pickle.dump(q_matrix, pklfile)

CPU times: user 1.08 s, sys: 473 ms, total: 1.55 s
Wall time: 2.85 s


In [27]:
%%time
# Data matrix
data = df[['Correct First Attempt', 'Anon Student Id', 'question_id', 'First Transaction Time']].values.tolist()
students = df['Anon Student Id'].unique().tolist()
len(students)

CPU times: user 569 ms, sys: 32 ms, total: 601 ms
Wall time: 597 ms


574

In [29]:
with open('%s/kdd_2005_2006_train_data_time.pkl' % folder, 'wb') as pklfile:
    pickle.dump(data, pklfile)
with open('%s/kdd_2005_2006_students_time.pkl' % folder, 'wb') as pklfile:
    pickle.dump(students, pklfile)

In [30]:
data

[[0, '0BrbPbwCMz', 0, '2005-09-09 12:24:49.0'],
 [1, '0BrbPbwCMz', 1, '2005-09-09 12:25:31.0'],
 [0, '0BrbPbwCMz', 2, '2005-09-09 12:25:43.0'],
 [1, '0BrbPbwCMz', 3, '2005-09-09 12:26:34.0'],
 [1, '0BrbPbwCMz', 4, '2005-09-09 12:28:36.0'],
 [0, '0BrbPbwCMz', 5, '2005-09-09 12:28:43.0'],
 [1, '0BrbPbwCMz', 6, '2005-09-09 12:29:09.0'],
 [1, '0BrbPbwCMz', 7, '2005-09-09 12:29:14.0'],
 [1, '0BrbPbwCMz', 8, '2005-09-09 12:29:31.0'],
 [1, '0BrbPbwCMz', 9, '2005-09-09 12:29:36.0'],
 [1, '0BrbPbwCMz', 10, '2005-09-09 12:30:27.0'],
 [0, '0BrbPbwCMz', 11, '2005-09-09 12:30:34.0'],
 [1, '0BrbPbwCMz', 12, '2005-09-09 12:31:04.0'],
 [1, '0BrbPbwCMz', 13, '2005-09-09 12:31:07.0'],
 [1, '0BrbPbwCMz', 14, '2005-09-09 12:31:29.0'],
 [1, '0BrbPbwCMz', 15, '2005-09-09 12:31:32.0'],
 [1, '0BrbPbwCMz', 16, '2005-09-09 12:31:44.0'],
 [1, '0BrbPbwCMz', 17, '2005-09-09 12:31:46.0'],
 [0, '0BrbPbwCMz', 18, '2005-09-09 12:33:22.0'],
 [1, '0BrbPbwCMz', 19, '2005-09-09 12:33:37.0'],
 [1, '0BrbPbwCMz', 20, '2005-0