# Imports

In [1]:
import os
import glob
import pandas as pd

In [2]:
# Constants: folders that hold the per-user CSV files of each dataset
KT3_DIR = './KT3/'
KT1_DIR = './KT1/'

# KT3 dataset

In [3]:
# Collect every CSV file in the KT3 folder.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the row order of kt3_df stable across runs/machines.
csv_files = sorted(glob.glob(os.path.join(KT3_DIR, "*.csv")))
if not csv_files:
    # pd.concat would fail on an empty list with the much less helpful
    # "No objects to concatenate"; same exception type, clearer message.
    raise ValueError(f"No CSV files found in {KT3_DIR!r}")

# Read each file and tag its rows with the user id taken from the file name.
dfs = []
for file in csv_files:
    user_id = os.path.basename(file).split('.')[0]  # 'uXXXXXX.csv' -> 'uXXXXXX'
    df_temp = pd.read_csv(file)
    df_temp['user_id'] = user_id
    dfs.append(df_temp)

# Stack all per-user frames into one, renumbering the index from 0.
kt3_df = pd.concat(dfs, ignore_index=True)
kt3_df

Unnamed: 0,timestamp,action_type,item_id,source,user_answer,platform,user_id
0,1567396083165,enter,b921,sprint,,web,u10030
1,1567396097400,respond,q921,sprint,c,web,u10030
2,1567396102402,respond,q921,sprint,a,web,u10030
3,1567396103341,submit,b921,sprint,,web,u10030
4,1567396104638,enter,e921,sprint,,web,u10030
...,...,...,...,...,...,...,...
2662547,1562669592456,respond,q6641,sprint,a,web,u996
2662548,1562669622801,respond,q6642,sprint,c,web,u996
2662549,1562669644966,respond,q6643,sprint,c,web,u996
2662550,1562669675599,respond,q6644,sprint,a,web,u996


In [4]:
# Add a 'content_type' column holding only the first character of 'item_id'.
# NOTE: this adds the column to kt3_df itself.
kt3_df['content_type'] = kt3_df['item_id'].str.get(0)

# Row masks (presumably q = question, l = lecture, e = explanation -- TODO confirm).
is_kept_type = kt3_df['content_type'].isin(['q', 'l', 'e'])
is_e_item = kt3_df['content_type'] == 'e'
is_quit_action = kt3_df['action_type'] == 'quit'

# Keep 'q' and 'l' rows unconditionally; keep an 'e' row only when its
# action_type is 'quit'.
kt3_df_filtered = kt3_df[is_kept_type & (~is_e_item | is_quit_action)]

kt3_df_filtered['content_type'].value_counts()

content_type
q    687162
e    471299
l     35448
Name: count, dtype: int64

In [5]:
# Load questions.csv into a DataFrame, keeping only the question id,
# its bundle id and the correct answer.
question_columns = ['question_id', 'bundle_id', 'correct_answer']
questions_df = pd.read_csv('./contents/questions.csv').loc[:, question_columns]
questions_df

Unnamed: 0,question_id,bundle_id,correct_answer
0,q1,b1,b
1,q2,b2,a
2,q3,b3,b
3,q4,b4,b
4,q5,b5,c
...,...,...,...
13164,q18139,b12202,b
13165,q18140,b12203,a
13166,q18141,b12204,a
13167,q18142,b12205,a


In [6]:
# Attach bundle_id / correct_answer to every event whose item_id matches a
# question_id. how="left" keeps events without a match (their new columns
# are NaN). validate="many_to_one" makes pandas raise a MergeError if
# question_id is duplicated in questions_df, instead of silently duplicating
# event rows in the result.
kt3_q_df = pd.merge(
    kt3_df_filtered,
    questions_df,
    left_on="item_id",
    right_on="question_id",
    how="left",
    validate="many_to_one",
)
# question_id only repeats item_id on matched rows; action_type, source and
# platform are dropped from the exported frame as well.
kt3_q_df = kt3_q_df.drop(columns=["question_id", "action_type", "source", "platform"])
kt3_q_df

Unnamed: 0,timestamp,item_id,user_answer,user_id,content_type,bundle_id,correct_answer
0,1567396097400,q921,c,u10030,q,b921,b
1,1567396102402,q921,a,u10030,q,b921,b
2,1567396112987,e921,,u10030,e,,
3,1567396132608,q1240,c,u10030,q,b1240,b
4,1567396154846,q589,b,u10030,q,b589,a
...,...,...,...,...,...,...,...
1193904,1562669558978,e5194,,u996,e,,
1193905,1562669592456,q6641,a,u996,q,b5107,a
1193906,1562669622801,q6642,c,u996,q,b5107,c
1193907,1562669644966,q6643,c,u996,q,b5107,b


In [7]:
# Write kt3_q_df to disk as CSV, without the row index column.
kt3_q_df.to_csv(path_or_buf='./kt3_df.csv', index=False)

# KT1 dataset

In [8]:
# Users present in the KT3 frame; only their KT1 files are loaded.
user_ids = kt3_q_df['user_id'].unique()
# `in` on the NumPy array returned by .unique() scans the whole array for
# every file; a set makes each membership test O(1). user_ids itself is left
# as an array so any other cell using it keeps working.
user_id_set = set(user_ids)

kt1_dfs = []
# os.listdir returns names in arbitrary order; sorting keeps the row order
# of kt1_df stable across runs/machines.
for file in sorted(os.listdir(KT1_DIR)):
    # Extract the user_id from the file name (name without its extension)
    user_id = os.path.splitext(file)[0]

    if user_id in user_id_set:
        file_path = os.path.join(KT1_DIR, file)
        temp_df = pd.read_csv(file_path)
        temp_df['user_id'] = user_id

        kt1_dfs.append(temp_df)

if not kt1_dfs:
    # pd.concat would fail on an empty list with the much less helpful
    # "No objects to concatenate"; same exception type, clearer message.
    raise ValueError(f"No file in {KT1_DIR!r} matches a user_id from kt3_q_df")

# Stack all selected per-user frames into one, renumbering the index from 0.
kt1_df = pd.concat(kt1_dfs, ignore_index=True)
kt1_df

Unnamed: 0,timestamp,solving_id,question_id,user_answer,elapsed_time,user_id
0,1563797068117,1,q6525,b,22000,u10030
1,1563797093153,2,q320,b,19000,u10030
2,1563797107951,3,q5659,b,12000,u10030
3,1563797123524,4,q4842,d,13000,u10030
4,1563797136092,5,q3884,a,10000,u10030
...,...,...,...,...,...,...
941949,1562654085927,169,q1588,c,26666,u996
941950,1562654085928,169,q1589,d,26666,u996
941951,1562654173463,170,q2515,c,25333,u996
941952,1562654173503,170,q2514,a,25333,u996


In [9]:
# Write kt1_df to disk as CSV, without the row index column.
kt1_df.to_csv(path_or_buf='./kt1_df.csv', index=False)