In [4]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle

### Reading KDD train file
Keep only the columns needed downstream and drop every row that has a null value in any of them.

In [2]:
# Directory containing the KDD data; every file path built in this notebook is prefixed with it.
folder = 'data/kdd/2008-2009'

In [3]:
%%time
# Read the tab-separated training file into a DataFrame, then print its (rows, columns) shape.
df_all = pd.read_csv('%s/algebra_2008_2009_train.txt' % folder, delimiter='\t')
print(df_all.shape)

(8918054, 23)
Wall time: 43.9 s


In [5]:
# Keep seven columns: student id, first-attempt correctness, the three fields that are
# later joined into the step identifier, the raw KC string, and 'Problem View'
# (NOTE(review): 'Problem View' is not read by any cell visible here - confirm it is needed).
df = df_all[['Anon Student Id', 'Correct First Attempt', 'Problem Hierarchy', 'Problem Name', 'Problem View', 'Step Name', 'KC(SubSkills)']]

In [6]:
# Discard every row that has a missing value in any of the selected columns.
df = df.dropna()

In [7]:
# (rows, columns) remaining after the rows with missing values were dropped.
df.shape

(6442137, 7)

### Preprocessing
Transform the dataframe into a format understood by the BKT module

In [8]:
# Items that matched none of the KC patterns; appended to by get_kcs for later inspection.
skipped = []

# KC-name patterns, tried in priority order; the first one that matches wins.
# They are compiled once here instead of being looked up again for every item.
_KC_PATTERNS = (
    # Option 1: with SkillRule prefix
    re.compile(r'^\[SkillRule: ([\w ]+[ ,/?-]*[\w ]*);.*', flags=re.M),
    # Option 2: regular words (hyphenized or spaced)
    re.compile(r'^([\w -]+).*', flags=re.M),
    # Option 3: get what couldn't be extracted from previous patterns
    re.compile(r'^\[SkillRule: (.*);.*', flags=re.M),
)


def get_kcs(row):
    """Extract the list of knowledge-component (KC) names from one row.

    The row's 'KC(SubSkills)' string has its commas removed and is split on
    '~~'. Each resulting item is matched against the KC patterns in priority
    order, and the captured group of the first matching pattern is used as the
    KC name. Patterns are evaluated lazily, so later ones only run when the
    earlier ones fail.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key 'KC(SubSkills)' whose value is a string
        (a DataFrame row or a plain dict).

    Returns
    -------
    list of str
        One KC name per item that matched a pattern, in input order.

    Side effects
    ------------
    Items that match no pattern are appended to the module-level `skipped`
    list and contribute nothing to the returned list.
    """
    text = row['KC(SubSkills)'].replace(',', '')
    kc = []
    for item in text.split('~~'):
        for pattern in _KC_PATTERNS:
            match = pattern.search(item)
            if match:
                # search() returns the same first match that findall()[0] would
                kc.append(match.group(1))
                break
        else:
            # no pattern matched: remember the raw item instead of guessing
            skipped.append(item)
    return kc

def get_steps(row):
    """Build the step identifier for one row.

    The identifier is the row's 'Problem Hierarchy', 'Problem Name' and
    'Step Name' values, in that order, separated by underscores.
    """
    parts = (row['Problem Hierarchy'], row["Problem Name"], row['Step Name'])
    return '_'.join(parts)

Split each row's KCs into a list and create an identifier for its step

In [9]:
%%time
# Row-wise apply: 'KC' receives the list returned by get_kcs for each row,
# 'step' the identifier string returned by get_steps.
df['KC'] = df.apply(get_kcs, axis=1)
df['step'] = df.apply(get_steps, axis=1)

Wall time: 9min 35s


In [10]:
%%time
kcs = []
for row in df['KC'].iteritems():
    kcs.extend(row[1])
kcs = list(set(kcs))
steps = df['step'].unique().tolist()

def create_data(row):
    question_id = steps.index(row['step'])
    return question_id

df['question_id'] = df.apply(create_data, axis=1)

Wall time: 4h 57min 28s


In [57]:
# Display the extracted KC names for a visual check.
kcs

['Plot imperfect radical',
 'Write expression ratio',
 'Using simple numbers',
 'Enter fractional probability of one',
 'Isolate positive',
 'Enter mean',
 'Solve equation having one solution',
 'perform-mult-whole-sp',
 'Choose Graphical refl-v in V problem',
 'Determine that mean does not make sense - below range',
 'Write expression any form',
 'Enter given reading numerals',
 'Enter given slope',
 'Choose Graphical a in V problem',
 'Write base of exponential from given whole number as product',
 'Enter second extreme in equation',
 'Identify when finished with numberline',
 'Find midpoint using points in quadrants 2 or 4',
 'Compare point and original mean - added outlier',
 'Enter numerator of converted unit',
 'Write negative exponent of exponential from given number as product',
 'Plot non-terminating mixed number',
 'Edit Algebraic a in G problem',
 'Plot perfect radical',
 'Find slope from two given points',
 'Identify upper square root as whole number',
 'Enter point-slope f

Save KCs

In [12]:
# Pickle the KC list under `folder`, then show how many KCs it holds.
with open('%s/kdd_2008_2009_kcs.pkl' % folder, 'wb') as pklfile:
    pickle.dump(kcs, pklfile)
len(kcs)

536

Construct Q-Matrix

In [13]:
%%time
q_matrix = np.zeros((len(steps), len(kcs)))

for idx, item in df.iterrows():
    for kc in item['KC']:
        kc_col = kcs.index(kc)
        q_matrix[item['question_id'], kc_col] = 1
    
print(q_matrix.shape)

(819699, 536)
Wall time: 19min 3s


Save data

In [14]:
%%time
# Pickle the processed DataFrame and the Q-matrix into separate files under `folder`.
with open('%s/kdd_2008_2009_df.pkl' % folder, 'wb') as pklfile:
    pickle.dump(df, pklfile)
with open('%s/kdd_2008_2009_q_matrix.pkl' % folder, 'wb') as pklfile:
    pickle.dump(q_matrix, pklfile)

Wall time: 7min 49s


In [7]:
%%time
# Data matrix
# One inner list per DataFrame row: [Correct First Attempt, Anon Student Id, question_id].
data = df[['Correct First Attempt', 'Anon Student Id', 'question_id']].values.tolist()
# Distinct student ids, in order of first appearance.
students = df['Anon Student Id'].unique().tolist()
# Number of distinct students.
len(students)

Wall time: 4.18 s


3292

In [8]:
%%time
# Pickle the row lists and the student-id list into separate files under `folder`.
with open('%s/kdd_2008_2009_train_data.pkl' % folder, 'wb') as pklfile:
    pickle.dump(data, pklfile)
with open('%s/kdd_2008_2009_students.pkl' % folder, 'wb') as pklfile:
    pickle.dump(students, pklfile)

Wall time: 19.2 s
