# Process Statics 2011 Dataset

In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
from dateutil import parser
from utils import min_seq_len_filter

In [2]:
# Threshold handed to min_seq_len_filter when the output frame is filtered
# further down. Presumably the minimum number of interactions a student must
# have to be kept -- confirm against utils.min_seq_len_filter.
MIN_SEQ_LENGTH = 10

In [3]:
# Raw tab-separated student-step export and the knowledge-component (skill)
# column that the rest of the notebook keys on.
path = "./raw-datasets/ds507_student_step_All_Data_1664_2017_0227_035544.txt"
kc_col = "KC (F2011)"
df = pd.read_csv(path, sep='\t')
# Keep only the rows that carry a KC label. The explicit .copy() makes `df` an
# independent frame, so the column assignments in the following cells do not
# raise pandas' SettingWithCopyWarning on a filtered selection.
ix = ~df[kc_col].isnull()
df = df[ix].copy()

In [4]:
# Numeric step durations: '.' entries are swapped for 0 before the cast to float.
for new_col, raw_col in (
    ('corr_dur_sec', 'Correct Step Duration (sec)'),
    ('error_dur_sec', 'Error Step Duration (sec)'),
):
    df[new_col] = df[raw_col].replace('.', 0).astype(float)

# Problem key: hierarchy, problem name and step name joined with '>>'.
hierarchy_str = df['Problem Hierarchy'].astype(str)
problem_name_str = df['Problem Name'].astype(str)
step_name_str = df['Step Name'].astype(str)
df['problem'] = hierarchy_str + '>>' + problem_name_str + '>>' + step_name_str

In [5]:
def make_indecies(series):
    """Map each distinct value of ``series`` to a dense integer index.

    Distinct values are sorted, then numbered 0..k-1 in that order.

    Args:
        series: iterable of hashable, mutually comparable values.

    Returns:
        dict mapping value -> position in the sorted list of distinct values.
    """
    return {value: position for position, value in enumerate(sorted(set(series)))}
def make_mapping(keys_series, vals_series):
    """Group values by key.

    Walks the two iterables in lockstep and collects, for every key, the set
    of values that appeared alongside it.

    Args:
        keys_series: iterable of hashable keys.
        vals_series: iterable of hashable values, paired positionally with keys.

    Returns:
        Plain dict mapping key -> set of values seen with that key.
    """
    grouped = {}
    for key, value in zip(keys_series, vals_series):
        grouped.setdefault(key, set()).add(value)
    return grouped

In [6]:
# Dense integer ids for problems, students and skills (sorted-value order).
problem_id = make_indecies(df['problem'])
student_id = make_indecies(df['Anon Student Id'])
skill_id = make_indecies(df[kc_col])

df['end_time'] = pd.to_datetime(df['Step End Time'])
df['student'] = df['Anon Student Id']
# 1 when the first attempt was marked 'correct', 0 for any other value.
df['correct'] = (df['First Attempt'] == 'correct').astype(int)
# Take the duration that matches the first-attempt outcome and convert it from
# seconds to milliseconds. Selecting with .where (rather than multiplying by
# the 0/1 flag and summing) keeps a NaN in the *unused* duration column from
# leaking into the result, since 0 * NaN is NaN.
df['response_time_ms'] = df['corr_dur_sec'].where(df['correct'] == 1, df['error_dur_sec']) * 1000
df['skill'] = df[kc_col]

In [7]:
# Assemble the output table column-wise instead of looping over rows in
# Python: ids are looked up with Series.map, the other columns are taken as-is.
# Column order matches the original row dictionaries, and reset_index gives
# the frame a fresh 0..n-1 index (the source frame keeps the gaps left by the
# KC filter).
output_df = pd.DataFrame({
    "student": df['student'].map(student_id),
    "skill": df['skill'].map(skill_id),
    "correct": df['correct'],
    "end_time": df['end_time'],
    "problem": df['problem'].map(problem_id),
    "response_time_ms": df['response_time_ms'],
    "skill_name": df['skill'],
}).reset_index(drop=True)
# Apply the project's sequence-length filter with the configured threshold.
output_df = min_seq_len_filter(output_df, MIN_SEQ_LENGTH)

In [9]:
# Headline counts for the filtered dataset.
print("Trials: %d" % len(output_df.index))
print("Students: %d" % len(set(output_df['student'])))
print("Skills: %d" % len(set(output_df['skill'])))
print("Problems: %d" % len(set(output_df['problem'])))

# Earliest and latest step end time that survived filtering.
date = pd.to_datetime(output_df['end_time'])
min_time = date.min()
max_time = date.max()
print("Time range: %s to %s" % (min_time, max_time))

Trials: 113984
Students: 330
Skills: 97
Problems: 633
Time range: 2011-08-22 13:36:15 to 2011-12-08 21:49:21


In [10]:
# Write the filtered table to CSV; index=False keeps the row index out of the file.
# NOTE(review): assumes the "datasets" directory already exists -- confirm.
output_df.to_csv("datasets/statics2011.csv", index=False)

97