In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os
import glob



#scikit-learn, numpy, scipy, matplotlib, seaborn
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


In [6]:
# Define folder path and collect the CSV files it contains.
# glob.glob() returns paths in arbitrary, filesystem-dependent order, so the
# list is sorted to make "the first file" the same on every run and machine.
folder_path = "EdNet-KT1/KT1/"
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
print(f"Found {len(csv_files)} files.")

# Fail with an explicit message rather than an IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Load the first file and show the columns + shape
sample = pd.read_csv(csv_files[0])
print("Columns:", sample.columns.tolist())
print("Shape:", sample.shape)
sample.head()

Found 784305 files.
Columns: ['timestamp', 'solving_id', 'question_id', 'user_answer', 'elapsed_time']
Shape: (1082, 5)


Unnamed: 0,timestamp,solving_id,question_id,user_answer,elapsed_time
0,1565096190868,1,q5012,b,38000
1,1565096221062,2,q4706,c,24000
2,1565096293432,3,q4366,b,68000
3,1565096339668,4,q4829,a,42000
4,1565096401774,5,q6528,b,59000


In [None]:
# Collect the CSV files in the data folder.
# glob.glob() makes no ordering guarantee; sorting keeps both the file picked
# here and any later slice of csv_files reproducible across runs and machines.
folder_path = "EdNet-KT1/KT1/"
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

print(f"Found {len(csv_files)} files.")

# Fail with an explicit message rather than an IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Grab the first file
sample_file = csv_files[0]
print("Loading file:", sample_file)

# Load and display
df = pd.read_csv(sample_file)
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(10)

Found 784305 files.
Loading file: EdNet-KT1/KT1\u1.csv
Shape: (1082, 5)
Columns: ['timestamp', 'solving_id', 'question_id', 'user_answer', 'elapsed_time']


Unnamed: 0,timestamp,solving_id,question_id,user_answer,elapsed_time
0,1565096190868,1,q5012,b,38000
1,1565096221062,2,q4706,c,24000
2,1565096293432,3,q4366,b,68000
3,1565096339668,4,q4829,a,42000
4,1565096401774,5,q6528,b,59000
5,1565096463370,6,q4793,a,58000
6,1565096501746,7,q6488,a,35000
7,1565097101361,8,q356,b,23000
8,1565097171393,9,q1382,c,22000
9,1565097240758,10,q830,b,25000


In [10]:


# Limit for testing: only the first MAX_FILES files are loaded for now.
MAX_FILES = 5000
# Upper bound on elapsed_time: 30 minutes expressed in milliseconds (1800000).
MAX_ELAPSED_MS = 30 * 60 * 1000

csv_files = csv_files[:MAX_FILES]

# Load each file into a cleaned DataFrame; the frames are concatenated once at
# the end instead of growing a DataFrame inside the loop.
data_frames = []

for i, file in enumerate(csv_files):
    # Report progress up front so a skipped or failing file cannot suppress it.
    if i % 500 == 0:
        print(f"Processed {i} files")

    try:
        df = pd.read_csv(file)

        # Drop rows with missing elapsed_time, then rows above the cap.
        # Both steps build new frames (no inplace mutation).
        df = df.dropna(subset=["elapsed_time"])
        df = df[df["elapsed_time"] <= MAX_ELAPSED_MS]

        # Skip files that are empty, or become empty after cleaning, so no
        # zero-row frames are handed to pd.concat.
        if df.empty:
            continue

        # assign() returns a new frame, which avoids the SettingWithCopyWarning
        # raised by setting columns on a filtered slice. `timestamp` is replaced
        # in place and `user_id` (file name without extension, used as a proxy
        # for the user) is appended as the last column.
        df = df.assign(
            timestamp=pd.to_datetime(df["timestamp"], unit="ms"),
            user_id=os.path.basename(file).split(".")[0],
        )

        data_frames.append(df)

    except Exception as e:
        # Best-effort load: report the problem file and keep going.
        print(f"Error in {file}: {e}")
        continue

# Combine into one DataFrame
if data_frames:
    all_data = pd.concat(data_frames, ignore_index=True)
    print(f"Total rows collected: {len(all_data)}")
    all_data.to_parquet("KT1_cleaned_sample.parquet")
else:
    # NOTE: all_data is not defined on this path, so cells that use it will fail.
    print("No valid data collected.")


Processed 0 files
Processed 500 files
Processed 1000 files
Processed 1500 files
Processed 2000 files
Processed 2500 files
Processed 3000 files
Processed 3500 files
Processed 4000 files
Processed 4500 files
Total rows collected: 1402362


In [11]:
# Quick look at the combined data: first rows, column names, distinct users.
preview_rows = all_data.head()
column_names = all_data.columns.tolist()
unique_user_count = all_data["user_id"].nunique()

print("\nPreview of cleaned data:")
print(preview_rows)
print("\nColumn names:", column_names)
print("Total unique users:", unique_user_count)


Preview of cleaned data:
                timestamp  solving_id question_id user_answer  elapsed_time  \
0 2019-08-06 12:56:30.868           1       q5012           b         38000   
1 2019-08-06 12:57:01.062           2       q4706           c         24000   
2 2019-08-06 12:58:13.432           3       q4366           b         68000   
3 2019-08-06 12:58:59.668           4       q4829           a         42000   
4 2019-08-06 13:00:01.774           5       q6528           b         59000   

  user_id  
0      u1  
1      u1  
2      u1  
3      u1  
4      u1  

Column names: ['timestamp', 'solving_id', 'question_id', 'user_answer', 'elapsed_time', 'user_id']
Total unique users: 5000


In [12]:
# Sample feature engineering: one row per user with the mean elapsed_time and
# the count of non-null question_id values, built with named aggregation.
temp_summary = (
    all_data
    .groupby("user_id")
    .agg(
        avg_response_time=("elapsed_time", "mean"),
        num_questions=("question_id", "count"),
    )
    .reset_index()
)

print("\nUser summary sample:")
print(temp_summary.head())


User summary sample:
  user_id  avg_response_time  num_questions
0      u1       49578.441774           1082
1     u10       29125.000000             16
2    u100       33151.484848             33
3   u1000       41128.524194           1488
4  u10000       36254.320988            405
