# Process Junyi Dataset

In [88]:
import numpy as np
import pandas as pd
from collections import defaultdict
from utils import min_seq_len_filter

In [4]:
# Threshold handed to min_seq_len_filter further down; presumably the minimum
# number of trials a student must have to be kept -- TODO confirm in utils.
MIN_SEQ_LENGTH = 10

# Input CSVs: the raw problem log and the exercise table (looked up by
# exercise name to obtain each exercise's topic).
problem_log_path = "raw-datasets/junyi_ProblemLog_original.csv"
exercise_table_path = "raw-datasets/junyi_Exercise_table.csv"

In [51]:
# Load the raw problem log (one row per logged attempt).
df = pd.read_csv(problem_log_path)

# limit to first attempts on non-review questions
ix = df['problem_number'].eq(1) & df['review_mode'].eq(False)

# Take an explicit copy of the filtered rows: later cells add new columns to
# `df`, and assigning into a boolean-mask slice triggers pandas'
# SettingWithCopyWarning (and is ambiguous about what gets modified).
df = df[ix].copy()

# Summary statistics of the filtered log.
print("# exercises: %d" % len(set(df['exercise'])))
print("# problem types: %d" % len(set(df['problem_type'])))
print("# students: %d" % len(set(df['user_id'])))
print("# trials: %d" % df.shape[0])

# exercises: 722
# problem types: 1150
# students: 246952
# trials: 2279215


In [71]:
# Load the exercise table and index it by exercise name. Duplicate names are
# dropped first (the first occurrence is kept) so that the index is unique.
exercise_df = (
    pd.read_csv(exercise_table_path)
    .drop_duplicates(subset=['name'])
    .set_index('name')
)

In [86]:
def make_indecies(series):
    """Assign a dense integer id to every distinct value in ``series``.

    Distinct values are sorted, and each one is mapped to its position in
    that sorted order (starting at 0).

    Args:
        series: iterable of hashable, mutually comparable values.

    Returns:
        dict mapping each distinct value to its 0-based rank.
    """
    ordered = sorted(set(series))
    return {value: position for position, value in enumerate(ordered)}
def make_mapping(keys_series, vals_series):
    """Group values by key.

    The two iterables are walked in lockstep (stopping at the shorter one);
    every value is collected into the set belonging to its paired key.

    Args:
        keys_series: iterable of hashable keys.
        vals_series: iterable of hashable values, paired positionally with
            ``keys_series``.

    Returns:
        Plain dict mapping each key to the set of values seen with it.
    """
    grouped = {}
    for key, val in zip(keys_series, vals_series):
        grouped.setdefault(key, set()).add(val)
    return grouped

# --- Derived per-trial columns -------------------------------------------
# Values are assigned as plain arrays so they are placed positionally rather
# than aligned on the index.
df['student'] = np.asarray(df['user_id'])

# A problem is identified by its exercise name joined with its problem type.
problem_label = df['exercise'] + '--' + df['problem_type']
df['problem'] = np.asarray(problem_label)

# The skill of a trial is the topic of its exercise, looked up by name in the
# exercise table and stringified.
topic_per_trial = exercise_df.loc[df['exercise'], 'topic']
df['skill'] = np.asarray(topic_per_trial.astype(str))

# `time_done` is interpreted as microseconds since the epoch.
df['end_time'] = pd.to_datetime(df['time_done'], unit='us')

# Negative durations are floored at zero, then scaled by 1000
# (assumes `time_taken` is recorded in seconds -- TODO confirm).
df['response_time_ms'] = df['time_taken'].clip(lower=0) * 1000

# --- Dense integer ids for problems, students and skills -------------------
problem_id, student_id, skill_id = (
    make_indecies(df[column]) for column in ('problem', 'student', 'skill')
)

In [89]:
# Assemble the output table column-by-column instead of looping over every
# trial in Python: the id dictionaries are applied with Series.map, which
# avoids materialising one dict per row for millions of rows. Building from
# named columns also fixes the column order explicitly and yields a frame
# with the expected columns even when `df` is empty.
output_df = pd.DataFrame({
    "student": df["student"].map(student_id),
    "skill": df["skill"].map(skill_id),
    "correct": df["correct"].astype("int64"),
    "end_time": df["end_time"],
    "problem": df["problem"].map(problem_id),
    "response_time_ms": df["response_time_ms"],
    "skill_name": df["skill"],
})
# Use a fresh 0..n-1 index, matching a frame built from a list of records.
output_df = output_df.reset_index(drop=True)

# Apply the project's sequence-length filter with the configured threshold.
output_df = min_seq_len_filter(output_df, MIN_SEQ_LENGTH)

In [90]:
# Report the size of the filtered dataset.
print("Trials: %d" % len(output_df))

# Number of distinct students, skills and problems that remain.
for template, column in (
    ("Students: %d", 'student'),
    ("Skills: %d", 'skill'),
    ("Problems: %d", 'problem'),
):
    print(template % len(set(output_df[column])))

# Earliest and latest trial completion times.
date = pd.to_datetime(output_df['end_time'])
min_time, max_time = date.min(), date.max()
print("Time range: %s to %s" % (min_time, max_time))

Trials: 1773518
Students: 55453
Skills: 40
Problems: 2526
Time range: 2012-10-12 06:23:25.827490 to 2015-01-11 18:13:19.900800


In [91]:
# Write the filtered table to CSV; index=False omits the DataFrame index
# column from the file.
output_df.to_csv("datasets/junyi.csv", index=False)