In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import pickle

### Reading KDD train file
Drop rows with null in any of the important columns

In [3]:
folder = 'data/kdd/2008-2009'

In [19]:
%%time
df_all = pd.read_csv('%s/algebra_2008_2009_test.txt' % folder, delimiter='\t')
print(df_all.shape)

(508912, 23)
Wall time: 1.9 s


In [20]:
df = df_all[['Anon Student Id', 'Correct First Attempt', 'Problem Hierarchy', 'Problem Name', 'Problem View', 'Step Name', 'KC(SubSkills)']]

In [22]:
df = df.dropna(axis=0, subset=['KC(SubSkills)'])

In [23]:
df

Unnamed: 0,Anon Student Id,Correct First Attempt,Problem Hierarchy,Problem Name,Problem View,Step Name,KC(SubSkills)
3,stu_de2777346f,,"Unit CTA1_01, Section CTA1_01-4",L1FB12,1,R2C2,Identifying units
4,stu_de2777346f,,"Unit CTA1_01, Section CTA1_01-4",L1FB12,1,R3C1,Define Variable
5,stu_de2777346f,,"Unit CTA1_01, Section CTA1_01-4",L1FB12,1,R3C2,"Write expression, any form~~Write expression, ..."
6,stu_de2777346f,,"Unit CTA1_01, Section CTA1_01-4",L1FB12,1,R4C1,"Entering a given~~Enter given, reading numerals"
7,stu_de2777346f,,"Unit CTA1_01, Section CTA1_01-4",L1FB12,1,R4C2,"Using simple numbers~~Find Y, any form~~Find Y..."
...,...,...,...,...,...,...,...
508906,stu_6af5d5e304,,"Unit LINEAR-INEQUALITY-GRAPHING, Section LINEA...",2X+3YGT9,1,XIntercept1,"Positive Constants, GLF~~Entering x-intercept,..."
508907,stu_6af5d5e304,,"Unit LINEAR-INEQUALITY-GRAPHING, Section LINEA...",2X+3YGT9,1,YIntercept1,"Positive Constants, GLF~~Entering y-intercept,..."
508909,stu_6af5d5e304,,"Unit LINEAR-INEQUALITY-GRAPHING, Section LINEA...",2X+3YGT9,1,ShadeButton1,Shading greater than~~Excluding the line when ...
508910,stu_6af5d5e304,,"Unit LINEAR-INEQUALITY-GRAPHING, Section LINEA...",2X+3YGT9,1,Formula 1 XIntercept,Placing coordinate point


In [13]:
df.shape

(353798, 7)

### Preprocessing
Transform dataframe in a format understood by the BKT module

In [14]:
skipped = []
def get_kcs(row):
    text = row['KC(SubSkills)'].replace(',','')
    kc_list = text.split('~~')
    kc = []
    for item in kc_list:
        # Option 1: with SkillRule prefix
        found_opt1 = re.findall(r'^\[SkillRule: ([\w ]+[ ,/?-]*[\w ]*);.*', item, flags=re.M)
        # Option 2: regular words (hyphenized or spaced)
        found_opt2 = re.findall(r'^([\w -]+).*', item, flags=re.M)
        # Option 3: get what couldn't be extracted from previous patterns
        found_opt3 = re.findall(r'^\[SkillRule: (.*);.*', item, flags=re.M)
        if found_opt1:
            kc.append(found_opt1[0])
        elif found_opt2:
            kc.append(found_opt2[0])
        elif found_opt3:
            kc.append(found_opt3[0])
        else:
            skipped.append(item)
    return kc

def get_steps(row):
    step = row['Problem Hierarchy'] + '_' + row["Problem Name"] + '_' + row['Step Name']
    return step

Separate KCs in a list and create an identifier name for the step

In [32]:
%%time
df['KC'] = df.apply(get_kcs, axis=1)
df['step'] = df.apply(get_steps, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Wall time: 32 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [30]:
with open('%s/kdd_2008_2009_kcs.pkl' % folder, 'rb') as pklfile:
    kcs_train = pickle.load(pklfile)

In [33]:
%%time
steps = df['step'].unique().tolist()

def create_data(row):
    question_id = steps.index(row['step'])
    return question_id

df['question_id'] = df.apply(create_data, axis=1)

Wall time: 2min 2s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Construct Q-Matrix

In [36]:
%%time
q_matrix = np.zeros((len(steps), len(kcs_train)))

for idx, item in df.iterrows():
    for kc in item['KC']:
        kc_col = kcs_train.index(kc)
        q_matrix[item['question_id'], kc_col] = 1
    
print(q_matrix.shape)

(75153, 536)
Wall time: 1min 5s


Save data

In [37]:
%%time
with open('%s/kdd_2008_2009_test_df.pkl' % folder, 'wb') as pklfile:
    pickle.dump(df, pklfile)
with open('%s/kdd_2008_2009_test_q_matrix.pkl' % folder, 'wb') as pklfile:
    pickle.dump(q_matrix, pklfile)

Wall time: 52.8 s


In [5]:
%%time
# Data matrix
data = df[['Correct First Attempt', 'Anon Student Id', 'question_id']].values.tolist()
students = df['Anon Student Id'].unique().tolist()
len(students)

Wall time: 333 ms


3232

In [6]:
%%time
with open('%s/kdd_2008_2009_test_data.pkl' % folder, 'wb') as pklfile:
    pickle.dump(data, pklfile)
with open('%s/kdd_2008_2009_test_students.pkl' % folder, 'wb') as pklfile:
    pickle.dump(students, pklfile)

Wall time: 170 ms
