In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle

### Reading the KDD test (master) file
Keep only the columns needed downstream and drop rows whose `KC(Default)` value is null

In [2]:
# Directory holding the KDD 2005-2006 algebra files; every path read or written below is built from it.
folder = 'data/kdd/2005-2006'

In [3]:
%%time
# Load the tab-separated master file and report its (rows, columns) shape.
master_path = '%s/algebra_2005_2006_master.txt' % folder
df_all = pd.read_csv(master_path, delimiter='\t')
print(df_all.shape)

(3967, 19)
CPU times: user 78.1 ms, sys: 15.6 ms, total: 93.8 ms
Wall time: 82.5 ms


In [4]:
# Restrict the frame to the columns used by the preprocessing steps below.
kept_columns = [
    'Row',
    'Anon Student Id',
    'Correct First Attempt',
    'Problem Hierarchy',
    'Problem Name',
    'Problem View',
    'Step Name',
    'KC(Default)',
]
df = df_all[kept_columns]

In [5]:
# Discard rows without a KC label, then renumber the rows from 0.
# reset_index() is called without drop=True, so the previous index is kept as an 'index' column.
df = df.dropna(subset=['KC(Default)']).reset_index()

In [6]:
# Rows left after dropping null KCs; the column count includes the 'index' column added by reset_index().
df.shape

(2954, 9)

In [7]:
# Index label (in the renumbered frame) of the first row whose 'Row' value is 1080617.
rows_matching = df.loc[df["Row"] == 1080617]
idx_df = rows_matching.index.values.astype(int)[0]
idx_df

2953

### Preprocessing
Transform the dataframe into the format expected by the BKT module

In [8]:
# KC fragments that matched none of the extraction patterns; kept so they can be inspected afterwards.
skipped = []

# Extraction patterns, tried in this order; each one captures the KC name in group 1.
# They are compiled once here instead of being re-evaluated for every fragment.
_KC_PATTERNS = (
    # Option 1: with SkillRule prefix
    re.compile(r'^\[SkillRule: ([\w ]+[ ,/?-]*[\w ]*);.*', flags=re.M),
    # Option 2: regular words (hyphenized or spaced)
    re.compile(r'^([\w -]+).*', flags=re.M),
    # Option 3: get what couldn't be extracted from previous patterns
    re.compile(r'^\[SkillRule: (.*);.*', flags=re.M),
)

def get_kcs(row):
    """Extract the list of KC names from a row's 'KC(Default)' field.

    Commas are removed from the field, which is then split on '~~'. For each
    fragment the patterns in ``_KC_PATTERNS`` are tried in order and the first
    match supplies the KC name. Fragments matching no pattern are appended to
    the module-level ``skipped`` list and contribute nothing to the result.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key 'KC(Default)' (e.g. a dataframe row).

    Returns
    -------
    list of str
        One KC name per recognised fragment, in order of appearance. An empty
        list is returned when the field is not a string (e.g. NaN or None).
    """
    raw = row['KC(Default)']
    if not isinstance(raw, str):
        # Missing values carry no KC information; avoid failing on .replace().
        return []

    kc = []
    for item in raw.replace(',', '').split('~~'):
        for pattern in _KC_PATTERNS:
            found = pattern.search(item)
            if found:
                kc.append(found.group(1))
                break
        else:
            # No pattern recognised this fragment.
            skipped.append(item)
    return kc

def get_steps(row):
    """Build a step identifier from a row.

    The identifier is the row's 'Problem Hierarchy', 'Problem Name' and
    'Step Name' values joined with underscores.
    """
    parts = (row['Problem Hierarchy'], row["Problem Name"], row['Step Name'])
    return '_'.join(parts)

Split the KCs into a list and build an identifier for each step

In [9]:
%%time
# Derive the new columns row by row: 'KC' (list of KC names) first, then 'step' (step identifier).
for new_column, row_function in (('KC', get_kcs), ('step', get_steps)):
    df[new_column] = df.apply(row_function, axis=1)

CPU times: user 203 ms, sys: 0 ns, total: 203 ms
Wall time: 190 ms


In [10]:
%%time
# Collect the distinct KC names that appear in any row.
# Series.iteritems() was removed in pandas 2.0; iterating the Series directly works on every version.
kcs = []
for kc_list in df['KC']:
    kcs.extend(kc_list)
kcs = list(set(kcs))

# Distinct step identifiers in order of first appearance; a step's position in this list is its question id.
steps = df['step'].unique().tolist()
# Constant-time lookup, replacing a linear steps.index() scan for every row.
step_to_question_id = {step: position for position, step in enumerate(steps)}

def create_data(row):
    """Return the question id of a row: the position of its 'step' value in ``steps``.

    Raises KeyError if the row's step is not one of the known steps.
    """
    return step_to_question_id[row['step']]

df['question_id'] = df.apply(create_data, axis=1)

CPU times: user 109 ms, sys: 15.6 ms, total: 125 ms
Wall time: 112 ms


Construct Q-Matrix

In [12]:
# Load the pickled KC list; the position of each KC in it is used as a Q-matrix column below.
# NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
kcs_path = '%s/kdd_2005_2006_kcs.pkl' % folder
with open(kcs_path, 'rb') as kcs_file:
    kcs_train = pickle.load(kcs_file)

In [13]:
%%time
# Q-matrix: one row per step (question id), one column per entry of kcs_train.
# A cell is set to 1 when that step is tagged with that KC.
q_matrix = np.zeros((len(steps), len(kcs_train)))

for question_id, kc_names in zip(df['question_id'], df['KC']):
    for kc_name in kc_names:
        # list.index raises ValueError if a KC is missing from kcs_train.
        q_matrix[question_id, kcs_train.index(kc_name)] = 1

print(q_matrix.shape)

(2391, 110)
CPU times: user 469 ms, sys: 0 ns, total: 469 ms
Wall time: 462 ms


Save data

In [14]:
# Persist the preprocessed dataframe and the Q-matrix, one pickle file each.
for path_template, payload in (
    ('%s/kdd_2005_2006_test_df.pkl', df),
    ('%s/kdd_2005_2006_test_q_matrix.pkl', q_matrix),
):
    with open(path_template % folder, 'wb') as pklfile:
        pickle.dump(payload, pklfile)

In [15]:
# Data matrix: one [correct first attempt, student id, question id] record per dataframe row
record_columns = ['Correct First Attempt', 'Anon Student Id', 'question_id']
data = df[record_columns].values.tolist()
# Distinct student ids in order of first appearance
students = df['Anon Student Id'].unique().tolist()
len(students)

551

In [16]:
# Persist the record list and the student list, one pickle file each.
for path_template, payload in (
    ('%s/kdd_2005_2006_test_data.pkl', data),
    ('%s/kdd_2005_2006_test_students.pkl', students),
):
    with open(path_template % folder, 'wb') as pklfile:
        pickle.dump(payload, pklfile)