# Process KDD 2010 Dataset

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from dateutil import parser
from utils import min_seq_len_filter

In [2]:
# Threshold passed to min_seq_len_filter when the output frame is built.
# Presumably the minimum number of interactions a student must have to be kept — TODO confirm in utils.
MIN_SEQ_LENGTH = 10

In [3]:
# Load the tab-separated training log and keep only rows that carry a
# knowledge-component (KC) label.
path = "./raw-datasets/algebra_2005_2006_train.txt"
kc_col = "KC(Default)"
df = pd.read_csv(path, sep='\t')
# Rows with a missing KC are dropped: the skill label is later derived by
# splitting this column's string value.
ix = ~df[kc_col].isnull()
# .copy() gives an independent frame. Later cells add columns to `df`, and
# assigning into a boolean-mask slice raises SettingWithCopyWarning.
df = df[ix].copy()

In [4]:
# Preview the KC-filtered frame.
df

Unnamed: 0,Row,Anon Student Id,Problem Hierarchy,Problem Name,Problem View,Step Name,Step Start Time,First Transaction Time,Correct Transaction Time,Step End Time,Step Duration (sec),Correct Step Duration (sec),Error Step Duration (sec),Correct First Attempt,Incorrects,Hints,Corrects,KC(Default),Opportunity(Default)
0,1,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,3(x+2) = 15,2005-09-09 12:24:35.0,2005-09-09 12:24:49.0,2005-09-09 12:25:15.0,2005-09-09 12:25:15.0,40.0,,40.0,0,2,3,1,[SkillRule: Eliminate Parens; {CLT nested; CLT...,1
1,2,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG4-FIXED,1,x+2 = 5,2005-09-09 12:25:15.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,2005-09-09 12:25:31.0,16.0,16.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",1~~1
2,3,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,2-8y = -4,2005-09-09 12:25:36.0,2005-09-09 12:25:43.0,2005-09-09 12:26:12.0,2005-09-09 12:26:12.0,36.0,,36.0,0,2,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",2
3,4,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,1,-8y = -6,2005-09-09 12:26:12.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,2005-09-09 12:26:34.0,22.0,22.0,,1,0,0,1,"[SkillRule: Remove coefficient; {ax+b=c, divid...",1~~1
4,5,0BrbPbwCMz,"Unit ES_04, Section ES_04-1",EG40,2,-7y-5 = -4,2005-09-09 12:26:38.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,2005-09-09 12:28:36.0,118.0,118.0,,1,0,0,1,"[SkillRule: Remove constant; {ax+b=c, positive...",3~~1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809688,1080611,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,-7+2x = 4,2006-03-09 10:52:45.0,2006-03-09 10:57:52.0,2006-03-09 10:58:05.0,2006-03-09 10:58:05.0,320.0,,320.0,0,0,3,1,"[SkillRule: Remove constant; {ax+b=c, positive...",4~~2
809689,1080612,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,-7+2x+7 = 4+7,2006-03-09 10:58:05.0,2006-03-09 10:58:13.0,2006-03-09 10:58:13.0,2006-03-09 10:58:13.0,8.0,8.0,,1,0,0,1,[SkillRule: Consolidate vars with coeff; CLT],5
809690,1080613,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,2x = 4+7,2006-03-09 10:58:13.0,2006-03-09 10:58:18.0,2006-03-09 10:58:18.0,2006-03-09 10:58:18.0,5.0,5.0,,1,0,0,1,[SkillRule: Consolidate vars with coeff; CLT],6
809691,1080614,IQQo3367x0,"Unit ES_02, Section ES_02-5",EG40,4,2x = 11,2006-03-09 10:58:18.0,2006-03-09 10:58:22.0,2006-03-09 10:58:39.0,2006-03-09 10:58:39.0,21.0,,21.0,0,1,3,1,[SkillRule: Remove positive coefficient; {ax/b...,1~~4


In [5]:
# Normalise both duration columns to float, with missing values as 0.
# In the previewed rows only one of the two durations is populated per step,
# so each column contains NaN wherever the other one applies.
df['corr_dur_sec'] = df['Correct Step Duration (sec)'].fillna(0).astype(float)
# Use fillna(0), not replace(0): replace(0) does not touch NaN, and a NaN left
# here makes response_time_ms NaN for correct rows (0 * NaN is NaN).
df['error_dur_sec'] = df['Error Step Duration (sec)'].fillna(0).astype(float)
# A "problem" is identified at step granularity:
# problem hierarchy >> problem name >> step name.
df['problem'] = df['Problem Hierarchy'].astype(str) + '>>' + df['Problem Name'].astype(str) + '>>' + df['Step Name'].astype(str)

In [6]:
def make_indecies(series):
    """Assign a dense integer id to every distinct value in ``series``.

    Ids start at 0 and follow the sorted order of the distinct values.

    Args:
        series: iterable of hashable, mutually comparable values.

    Returns:
        dict mapping each distinct value to its id.
    """
    ordered = sorted(set(series))
    return {value: position for position, value in enumerate(ordered)}
def make_mapping(keys_series, vals_series):
    """Group values by key.

    The two iterables are walked in lockstep (stopping at the shorter one);
    every value is added to the set stored under its paired key.

    Args:
        keys_series: iterable of hashable keys.
        vals_series: iterable of hashable values, aligned with ``keys_series``.

    Returns:
        Plain dict mapping each key to the set of values seen with it.
    """
    grouped = {}
    for key, val in zip(keys_series, vals_series):
        grouped.setdefault(key, set()).add(val)
    return grouped

In [7]:
# Dense integer ids for problems and students.
problem_id = make_indecies(df['problem'])
student_id = make_indecies(df['Anon Student Id'])

df['end_time'] = pd.to_datetime(df['Step End Time'])
df['student'] = df['Anon Student Id']
# 1 when the first attempt at the step was correct, 0 otherwise.
df['correct'] = (df['Correct First Attempt'] == 1).astype(int)
# Response time (ms): the correct-step duration for correct first attempts,
# the error-step duration otherwise.
# Select the column with .where instead of weighting both by 0/1:
# 0 * NaN is NaN, so a missing value in the unused column would otherwise
# blank out the result.
df['response_time_ms'] = df['corr_dur_sec'].where(df['correct'] == 1, df['error_dur_sec']) * 1000
# Canonicalise multi-KC labels: the '~~'-separated parts are sorted so the
# same set of KCs always produces the same skill string.
df['skill'] = ['~~'.join(sorted(kc.split('~~'))) for kc in df[kc_col]]

skill_id = make_indecies(df['skill'])

In [8]:
# Assemble the output table column-wise instead of building one dict per row
# in a Python loop; the columns and their order are unchanged.
# String labels are translated to their integer ids with Series.map.
output_df = pd.DataFrame({
    "student": df['student'].map(student_id),
    "skill": df['skill'].map(skill_id),
    "correct": df['correct'],
    "end_time": df['end_time'],
    "problem": df['problem'].map(problem_id),
    "response_time_ms": df['response_time_ms'],
    "skill_name": df['skill'],
}).reset_index(drop=True)  # fresh 0..n-1 index, as a frame built from a list of records has
output_df = min_seq_len_filter(output_df, MIN_SEQ_LENGTH)

In [9]:
# Summary of the filtered dataset: row count, distinct ids and covered time span.
def _distinct(column):
    """Number of distinct values in the given output_df column."""
    return len(set(output_df[column]))

print("Trials: %d" % len(output_df))
print("Students: %d" % _distinct('student'))
print("Skills: %d" % _distinct('skill'))
print("Problems: %d" % _distinct('problem'))
date = pd.to_datetime(output_df['end_time'])
min_time, max_time = date.min(), date.max()
print("Time range: %s to %s" % (min_time, max_time))

Trials: 606983
Students: 567
Skills: 291
Problems: 176630
Time range: 2005-08-30 09:50:46 to 2006-06-07 11:12:38


In [10]:
# Persist the processed interactions; index=False keeps the row index out of the CSV.
output_df.to_csv("datasets/kdd2010.csv", index=False)