In [1]:
from config import *
import pandas as pd
import numpy as np
from pykt.preprocess.split_datasets import *
from typing import List
from main_package.utils import data_path_to_abs_path
from main_package.utils import truncate_interaction_sequences
from main_package.bkt_pyKT import convert_df_strings_to_arrays
from main_package.bkt_pyKT_per_skill import train_bkt, evaluate_bkt

In [2]:
data_folder = data_path_to_abs_path('isaac/pyKT_processed/cross_val')

# dataset split

In [3]:
def train_test_split(df, test_ratio, random_state):
    df = df.sample(frac=1.0, random_state=random_state)
    datanum = df.shape[0]
    test_num = int(datanum * test_ratio)
    train_num = datanum - test_num
    train_df = df[0:train_num]
    test_df = df[train_num:]
    # report
    print(f"total num: {datanum}, train+valid num: {train_num}, test num: {test_num}")
    return train_df, test_df

def split_concept(dname, fname, dataset_name, configf, random_state: int, reversed: bool, min_seq_len = 3, maxlen = 200, kfold = 5):
    """split main function

    Args:
        dname (str): data folder path
        fname (str): the data file used to split, needs 6 columns, format is: (NA indicates the dataset has no corresponding info)
            uid,seqlen: 50121,4
            quetion ids: NA
            concept ids: 7014,7014,7014,7014
            responses: 0,1,1,1
            timestamps: NA
            cost times: NA
        dataset_name (str): dataset name
        configf (str): the dataconfig file path
        min_seq_len (int, optional): the min seqlen, sequences less than this value will be filtered out. Defaults to 3.
        maxlen (int, optional): the max seqlen. Defaults to 200.
        kfold (int, optional): the folds num needs to split. Defaults to 5.
        
    """
    stares = []

    total_df, effective_keys = read_data(fname)
    #cal max_concepts
    if 'concepts' in effective_keys:
        max_concepts = get_max_concepts(total_df)
    else:
        max_concepts = -1

    oris, _, qs, cs, seqnum = calStatistics(total_df, stares, "original")
    print("="*20)
    print(f"original total interactions: {oris}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")

    total_df, effective_keys = extend_multi_concepts(total_df, effective_keys)
    total_df, dkeyid2idx = id_mapping(total_df)
    dkeyid2idx["max_concepts"] = max_concepts

    extends, _, qs, cs, seqnum = calStatistics(total_df, stares, "extend multi")
    print("="*20)
    print(f"after extend multi, total interactions: {extends}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")

    save_id2idx(dkeyid2idx, os.path.join(dname, "keyid2idx.json"))
    effective_keys.add("fold")
    config = []
    for key in ALL_KEYS:
        if key in effective_keys:
            config.append(key)
    # train test split & generate sequences
    # change starts
    # train_df, test_df = train_test_split(total_df, 0.2)
    train_df, test_df = train_test_split(total_df, test_ratio=0.5, random_state=random_state)
    if reversed:
      train_df, test_df = test_df, train_df
    # change ends
    splitdf = KFold_split(train_df, kfold)
    # TODO
    splitdf[config].to_csv(os.path.join(dname, "train_valid.csv"), index=None)
    ins, ss, qs, cs, seqnum = calStatistics(splitdf, stares, "original train+valid")
    print(f"train+valid original interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
    split_seqs = generate_sequences(splitdf, effective_keys, min_seq_len, maxlen)
    ins, ss, qs, cs, seqnum = calStatistics(split_seqs, stares, "train+valid sequences")
    print(f"train+valid sequences interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
    split_seqs.to_csv(os.path.join(dname, "train_valid_sequences.csv"), index=None)
    # print(f"split seqs dtypes: {split_seqs.dtypes}")

    # add default fold -1 to test!
    test_df["fold"] = [-1] * test_df.shape[0]  
    test_df['cidxs'] = get_inter_qidx(test_df)#add index  
    test_seqs = generate_sequences(test_df, list(effective_keys) + ['cidxs'], min_seq_len, maxlen)
    ins, ss, qs, cs, seqnum = calStatistics(test_df, stares, "test original")
    print(f"original test interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
    ins, ss, qs, cs, seqnum = calStatistics(test_seqs, stares, "test sequences")
    print(f"test sequences interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
    print("="*20)

    #test_window_seqs = generate_window_sequences(test_df, list(effective_keys) + ['cidxs'], maxlen)
    flag, test_question_seqs = generate_question_sequences(test_df, effective_keys, False, min_seq_len, maxlen)
    #flag, test_question_window_seqs = generate_question_sequences(test_df, effective_keys, True, min_seq_len, maxlen)
    
    test_df = test_df[config+['cidxs']]

    test_df.to_csv(os.path.join(dname, "test.csv"), index=None)
    test_seqs.to_csv(os.path.join(dname, "test_sequences.csv"), index=None)
    #test_window_seqs.to_csv(os.path.join(dname, "test_window_sequences.csv"), index=None)

    #ins, ss, qs, cs, seqnum = calStatistics(test_window_seqs, stares, "test window")
    #print(f"test window interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
    
    if flag:
        test_question_seqs.to_csv(os.path.join(dname, "test_question_sequences.csv"), index=None)
        #test_question_window_seqs.to_csv(os.path.join(dname, "test_question_window_sequences.csv"), index=None)
        
        ins, ss, qs, cs, seqnum = calStatistics(test_question_seqs, stares, "test question")
        print(f"test question interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
        #ins, ss, qs, cs, seqnum = calStatistics(test_question_window_seqs, stares, "test question window")
        #print(f"test question window interactions num: {ins}, select num: {ss}, qs: {qs}, cs: {cs}, seqnum: {seqnum}")
   
    write_config(dataset_name=dataset_name, dkeyid2idx=dkeyid2idx, effective_keys=effective_keys, 
                configf=configf, dpath = dname, k=kfold,min_seq_len = min_seq_len, maxlen=maxlen,flag=flag)
    
    print("="*20)
    print("\n".join(stares))

In [4]:
import os
from pykt.preprocess import process_raw_data
import json

folder = "/home/miroslav/part-II-project/pykt-toolkit"

dname2paths = {
    "assist2009": f"{data_folder}/skill_builder_data_corrected_collapsed.csv",
}
configf = f"{folder}/configs/data_config.json"

def write_empty_data_config(dataset_name: str):
    with open(f'{folder}/configs/data_config.json', 'w') as f:
      json.dump({dataset_name: {}}, f)

def pre_process(dataset_name='assist2009', min_seq_len=3, maxlen=200, reversed=False, random_state=1024):
    write_empty_data_config(dataset_name)
    
    # process raw data
    dname, writef = process_raw_data(dataset_name, dname2paths)
    print("-"*50)
    # split
    os.system("rm " + dname + "/*.pkl")

    #for concept level model
    split_concept(dname, writef, dataset_name, configf, min_seq_len=min_seq_len, maxlen=maxlen, reversed=reversed, random_state=random_state)
    print("="*100)

# train and eval

In [5]:
def load_truncated_datasets(name: str):
    df = pd.read_csv(f'{data_folder}/{name}.csv')
    convert_df_strings_to_arrays(df)
    df = truncate_interaction_sequences(df)
    return df

In [6]:
def cross_validate(random_states: List[int]):
  print(f'random_states: {random_states}')
  result_dicts = []
  for random_state in random_states:
    for is_reversed in [True, False]:
      # prepare
      pre_process(reversed=is_reversed, random_state=random_state)
      df_train = load_truncated_datasets('train_valid')
      df_test = load_truncated_datasets('test')

      # train
      bkt_params_dict = train_bkt(df_train)
      bkt_avg_params = np.average([value for value in bkt_params_dict.values()], axis=0)

      #eval
      auc, accuracy = evaluate_bkt(bkt_params_dict, df_test, bkt_avg_params)
      result_dicts.append({'auc': auc, 'accuracy': accuracy})


  return result_dicts

In [7]:
random_states = [0, 2, 42, 111, 256]

In [8]:
result_dicts = cross_validate(random_states)

random_states: [0, 2, 42, 111, 256]
Start preprocessing data: assist2009
original interaction num: 480534, user num: 46612, question num: 536, concept num: 94, avg(ins) per s: 10.3092, avg(c) per q: 2.4272, na: 0
after drop interaction num: 480534, user num: 46612, question num: 536, concept num: 94, avg(ins) per s: 10.3092, avg(c) per q: 2.4272, na: 0


  for ui in ui_df:


480534,46612,536,94,10.3092,2.4272,0
480534,46612,536,94,10.3092,2.4272,0
--------------------------------------------------


rm: cannot remove '/home/miroslav/part-II-project/data/isaac/pyKT_processed/cross_val/*.pkl': No such file or directory


delete bad stu num of len: 16624, delete interactions: 22934, of r: 0, good num: 457600
original total interactions: 457600, qs: 536, cs: 94, seqnum: 29988
df.columns: Index(['uid', 'is_repeat', 'concepts', 'responses', 'questions'], dtype='object')
after extend multi, total interactions: 1146735, qs: 536, cs: 94, seqnum: 29988
total num: 29988, train+valid num: 14994, test num: 14994
fold: 1, start: 0, end: 2999, total num: 14994
fold: 2, start: 2999, end: 5998, total num: 14994
fold: 3, start: 5998, end: 8997, total num: 14994
fold: 4, start: 8997, end: 11996, total num: 14994
fold: 5, start: 11996, end: 14994, total num: 14994
train+valid original interactions num: 576452, select num: 0, qs: 521, cs: 94, seqnum: 14994
dropnum: 5
train+valid sequences interactions num: 576447, select num: 576447, qs: 521, cs: 94, seqnum: 15548
dropnum: 6
original test interactions num: 570283, select num: 0, qs: 525, cs: 92, seqnum: 14994
test sequences interactions num: 570277, select num: 570277, q

  for ui in ui_df:


480534,46612,536,94,10.3092,2.4272,0
480534,46612,536,94,10.3092,2.4272,0
--------------------------------------------------


rm: cannot remove '/home/miroslav/part-II-project/data/isaac/pyKT_processed/cross_val/*.pkl': No such file or directory


delete bad stu num of len: 16624, delete interactions: 22934, of r: 0, good num: 457600
original total interactions: 457600, qs: 536, cs: 94, seqnum: 29988
df.columns: Index(['uid', 'is_repeat', 'concepts', 'responses', 'questions'], dtype='object')
after extend multi, total interactions: 1146735, qs: 536, cs: 94, seqnum: 29988
total num: 29988, train+valid num: 14994, test num: 14994
fold: 1, start: 0, end: 2999, total num: 14994
fold: 2, start: 2999, end: 5998, total num: 14994
fold: 3, start: 5998, end: 8997, total num: 14994
fold: 4, start: 8997, end: 11996, total num: 14994
fold: 5, start: 11996, end: 14994, total num: 14994
train+valid original interactions num: 570283, select num: 0, qs: 525, cs: 92, seqnum: 14994
dropnum: 6
train+valid sequences interactions num: 570277, select num: 570277, qs: 525, cs: 92, seqnum: 15503
dropnum: 5
original test interactions num: 576452, select num: 0, qs: 521, cs: 94, seqnum: 14994
test sequences interactions num: 576447, select num: 576447, q

  for ui in ui_df:


480534,46612,536,94,10.3092,2.4272,0
480534,46612,536,94,10.3092,2.4272,0
--------------------------------------------------


rm: cannot remove '/home/miroslav/part-II-project/data/isaac/pyKT_processed/cross_val/*.pkl': No such file or directory


delete bad stu num of len: 16624, delete interactions: 22934, of r: 0, good num: 457600
original total interactions: 457600, qs: 536, cs: 94, seqnum: 29988
df.columns: Index(['uid', 'is_repeat', 'concepts', 'responses', 'questions'], dtype='object')
after extend multi, total interactions: 1146735, qs: 536, cs: 94, seqnum: 29988
total num: 29988, train+valid num: 14994, test num: 14994
fold: 1, start: 0, end: 2999, total num: 14994
fold: 2, start: 2999, end: 5998, total num: 14994
fold: 3, start: 5998, end: 8997, total num: 14994
fold: 4, start: 8997, end: 11996, total num: 14994
fold: 5, start: 11996, end: 14994, total num: 14994
train+valid original interactions num: 574739, select num: 0, qs: 528, cs: 92, seqnum: 14994
dropnum: 8
train+valid sequences interactions num: 574731, select num: 574731, qs: 528, cs: 92, seqnum: 15547
dropnum: 3
original test interactions num: 571996, select num: 0, qs: 520, cs: 94, seqnum: 14994
test sequences interactions num: 571993, select num: 571993, q

  for ui in ui_df:


480534,46612,536,94,10.3092,2.4272,0
480534,46612,536,94,10.3092,2.4272,0
--------------------------------------------------


rm: cannot remove '/home/miroslav/part-II-project/data/isaac/pyKT_processed/cross_val/*.pkl': No such file or directory


delete bad stu num of len: 16624, delete interactions: 22934, of r: 0, good num: 457600
original total interactions: 457600, qs: 536, cs: 94, seqnum: 29988
df.columns: Index(['uid', 'is_repeat', 'concepts', 'responses', 'questions'], dtype='object')
after extend multi, total interactions: 1146735, qs: 536, cs: 94, seqnum: 29988
total num: 29988, train+valid num: 14994, test num: 14994
fold: 1, start: 0, end: 2999, total num: 14994
fold: 2, start: 2999, end: 5998, total num: 14994
fold: 3, start: 5998, end: 8997, total num: 14994
fold: 4, start: 8997, end: 11996, total num: 14994
fold: 5, start: 11996, end: 14994, total num: 14994
train+valid original interactions num: 571996, select num: 0, qs: 520, cs: 94, seqnum: 14994
dropnum: 3
train+valid sequences interactions num: 571993, select num: 571993, qs: 520, cs: 94, seqnum: 15504
dropnum: 8
original test interactions num: 574739, select num: 0, qs: 528, cs: 92, seqnum: 14994
test sequences interactions num: 574731, select num: 574731, q

  for ui in ui_df:


480534,46612,536,94,10.3092,2.4272,0
480534,46612,536,94,10.3092,2.4272,0
--------------------------------------------------


rm: cannot remove '/home/miroslav/part-II-project/data/isaac/pyKT_processed/cross_val/*.pkl': No such file or directory


delete bad stu num of len: 16624, delete interactions: 22934, of r: 0, good num: 457600
original total interactions: 457600, qs: 536, cs: 94, seqnum: 29988
df.columns: Index(['uid', 'is_repeat', 'concepts', 'responses', 'questions'], dtype='object')
after extend multi, total interactions: 1146735, qs: 536, cs: 94, seqnum: 29988
total num: 29988, train+valid num: 14994, test num: 14994
fold: 1, start: 0, end: 2999, total num: 14994
fold: 2, start: 2999, end: 5998, total num: 14994
fold: 3, start: 5998, end: 8997, total num: 14994
fold: 4, start: 8997, end: 11996, total num: 14994
fold: 5, start: 11996, end: 14994, total num: 14994
train+valid original interactions num: 575423, select num: 0, qs: 536, cs: 94, seqnum: 14994
dropnum: 7
train+valid sequences interactions num: 575416, select num: 575416, qs: 536, cs: 94, seqnum: 15524
dropnum: 4
original test interactions num: 571312, select num: 0, qs: 512, cs: 92, seqnum: 14994
test sequences interactions num: 571308, select num: 571308, q

  for ui in ui_df:


480534,46612,536,94,10.3092,2.4272,0
480534,46612,536,94,10.3092,2.4272,0
--------------------------------------------------


rm: cannot remove '/home/miroslav/part-II-project/data/isaac/pyKT_processed/cross_val/*.pkl': No such file or directory


delete bad stu num of len: 16624, delete interactions: 22934, of r: 0, good num: 457600
original total interactions: 457600, qs: 536, cs: 94, seqnum: 29988
df.columns: Index(['uid', 'is_repeat', 'concepts', 'responses', 'questions'], dtype='object')
after extend multi, total interactions: 1146735, qs: 536, cs: 94, seqnum: 29988
total num: 29988, train+valid num: 14994, test num: 14994
fold: 1, start: 0, end: 2999, total num: 14994
fold: 2, start: 2999, end: 5998, total num: 14994
fold: 3, start: 5998, end: 8997, total num: 14994
fold: 4, start: 8997, end: 11996, total num: 14994
fold: 5, start: 11996, end: 14994, total num: 14994
train+valid original interactions num: 571312, select num: 0, qs: 512, cs: 92, seqnum: 14994
dropnum: 4
train+valid sequences interactions num: 571308, select num: 571308, qs: 512, cs: 92, seqnum: 15527
dropnum: 7
original test interactions num: 575423, select num: 0, qs: 536, cs: 94, seqnum: 14994
test sequences interactions num: 575416, select num: 575416, q

KeyboardInterrupt: 

In [9]:
result_dicts

NameError: name 'result_dicts' is not defined