In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle

### Reading KDD train file
Drop rows with null in any of the important columns

In [2]:
folder = 'data/kdd/2008-2009'

In [3]:
%%time
df_all = pd.read_csv('%s/algebra_2008_2009_train.txt' % folder, delimiter='\t')
print(df_all.shape)

(8918054, 23)
CPU times: user 40.3 s, sys: 10.4 s, total: 50.8 s
Wall time: 43.8 s


In [4]:
df = df_all[['Anon Student Id', 'Correct First Attempt', 'Problem Hierarchy', 'Problem Name', 'Problem View', 'Step Name', 'KC(SubSkills)']]

In [5]:
df = df.dropna()

In [6]:
df.shape

(6442137, 7)

### Preprocessing
Transform dataframe in a format understood by the BKT module

In [7]:
skipped = []
def get_kcs(row):
    text = row['KC(SubSkills)'].replace(',','')
    kc_list = text.split('~~')
    kc = []
    for item in kc_list:
        # Option 1: with SkillRule prefix
        found_opt1 = re.findall(r'^\[SkillRule: ([\w ]+[ ,/?-]*[\w ]*);.*', item, flags=re.M)
        # Option 2: regular words (hyphenized or spaced)
        found_opt2 = re.findall(r'^([\w -]+).*', item, flags=re.M)
        # Option 3: get what couldn't be extracted from previous patterns
        found_opt3 = re.findall(r'^\[SkillRule: (.*);.*', item, flags=re.M)
        if found_opt1:
            kc.append(found_opt1[0])
        elif found_opt2:
            kc.append(found_opt2[0])
        elif found_opt3:
            kc.append(found_opt3[0])
        else:
            skipped.append(item)
    return kc

def get_steps(row):
    step = row['Problem Hierarchy'] + '_' + row["Problem Name"] + '_' + row['Step Name']
    return step

Separate KCs in a list and create an identifier name for the step

In [8]:
%%time
df['KC'] = df.apply(get_kcs, axis=1)
df['step'] = df.apply(get_steps, axis=1)

CPU times: user 5min 47s, sys: 25 s, total: 6min 13s
Wall time: 6min 13s


In [9]:
%%time
kcs = []
for row in df['KC'].iteritems():
    kcs.extend(row[1])
kcs = list(set(kcs))
steps = df['step'].unique().tolist()

def create_data(row):
    question_id = steps.index(row['step'])
    return question_id

df['question_id'] = df.apply(create_data, axis=1)

CPU times: user 4h 14min 4s, sys: 5.05 s, total: 4h 14min 9s
Wall time: 4h 14min 8s


In [10]:
kcs

['Convert decimal units greater than one',
 'Plot imperfect radical',
 'Find difference of y-coordinates',
 'Edit Algebraic k in N problem',
 'Choose operator in fractional identity',
 'Compare two large numbers in scientific notation',
 'Compare differences - added internal',
 'Enter fractional probability of zero',
 'Enter given leg out of context',
 'Choose table size',
 'Find positive square root',
 'Consolidate vars no coeff',
 'Plot non-terminating proper fraction',
 'Enter experimental probability for medium number',
 'Enter probability of first event',
 'Do Multiply - Whole (typein-expression-2)',
 'Enter ratio using a colon',
 'Identify solution type of compound inequality using and',
 'Find slope of horizontal line',
 'Identify type of solution',
 'Labelling point of intersection',
 'Identify lower perfect square',
 'done no solutions',
 'Identify quadratic Parent Equation',
 'Plot point on minor tick mark - fractional major',
 'Convert small number units',
 'Write positive p

Save KCs

In [11]:
with open('%s/kdd_2008_2009_kcs.pkl' % folder, 'wb') as pklfile:
    pickle.dump(kcs, pklfile)
len(kcs)

536

Construct Q-Matrix

In [12]:
%%time
q_matrix = np.zeros((len(steps), len(kcs)))

for idx, item in df.iterrows():
    for kc in item['KC']:
        kc_col = kcs.index(kc)
        q_matrix[item['question_id'], kc_col] = 1
    
print(q_matrix.shape)

(819699, 536)
CPU times: user 13min 54s, sys: 9.11 s, total: 14min 3s
Wall time: 13min 57s


Save data

In [13]:
%%time
with open('%s/kdd_2008_2009_df.pkl' % folder, 'wb') as pklfile:
    pickle.dump(df, pklfile)
with open('%s/kdd_2008_2009_q_matrix.pkl' % folder, 'wb') as pklfile:
    pickle.dump(q_matrix, pklfile)

CPU times: user 9.91 s, sys: 12.2 s, total: 22.1 s
Wall time: 33.8 s


In [14]:
%%time
# Data matrix
data = df[['Correct First Attempt', 'Anon Student Id', 'question_id']].values.tolist()
students = df['Anon Student Id'].unique().tolist()
len(students)

CPU times: user 5.91 s, sys: 1.77 s, total: 7.67 s
Wall time: 7.71 s


3292

In [15]:
%%time
with open('%s/kdd_2008_2009_train_data.pkl' % folder, 'wb') as pklfile:
    pickle.dump(data, pklfile)
with open('%s/kdd_2008_2009_students.pkl' % folder, 'wb') as pklfile:
    pickle.dump(students, pklfile)

CPU times: user 3.06 s, sys: 688 ms, total: 3.75 s
Wall time: 4.06 s
