In [2]:
import pandas as pd

# Read the three raw CSV exports into one DataFrame each, in the order
# logs -> sessions -> questionnaires.
logs, sessions, questionnaires = map(
    pd.read_csv,
    ("logs.csv", "sessions.csv", "questionnaires.csv"),
)

# Preview the first five rows of every table. The cell's value is a tuple of
# three frames, so it is shown as plain text rather than as rich HTML tables.
logs.head(), sessions.head(), questionnaires.head()

(   log_id  session_id            timestamp action_type     element_id  \
 0       1        1071  2025-01-01 00:36:23  drill_down    chart_trend   
 1       2         504  2025-01-01 00:31:53       hover      KPI_costs   
 2       3        1046  2025-01-01 00:15:47  drill_down    chart_trend   
 3       4         213  2025-01-01 00:44:35      filter      KPI_costs   
 4       5         310  2025-01-01 00:15:16       click  table_details   
 
    duration_sec  error_flag  decision_made  
 0      1.000000           0              1  
 1     23.347078           0              1  
 2      1.000000           0              1  
 3     15.594869           0              1  
 4      2.166529           0              0  ,
    session_id  user_id interface_version
 0           1      103                 B
 1           2       93                 B
 2           3       15                 A
 3           4      107                 B
 4           5       72                 B,
    user_id  SUS_score  

In [3]:
# Structure overview: report (rows, columns) for every table first ...
for label, frame in (
    ("logs shape:", logs),
    ("sessions shape:", sessions),
    ("questionnaires shape:", questionnaires),
):
    print(label, frame.shape)

# ... then the per-column dtypes, each listing preceded by a blank line.
for label, frame in (
    ("\nlogs dtypes:\n", logs),
    ("\nsessions dtypes:\n", sessions),
    ("\nquestionnaires dtypes:\n", questionnaires),
):
    print(label, frame.dtypes)

logs shape: (25000, 8)
sessions shape: (1200, 3)
questionnaires shape: (150, 7)

logs dtypes:
 log_id             int64
session_id         int64
timestamp         object
action_type       object
element_id        object
duration_sec     float64
error_flag         int64
decision_made      int64
dtype: object

sessions dtypes:
 session_id            int64
user_id               int64
interface_version    object
dtype: object

questionnaires dtypes:
 user_id              int64
SUS_score          float64
NASA_TLX           float64
UES_engagement     float64
IMI_autonomy       float64
IMI_competence     float64
intention_reuse    float64
dtype: object


In [4]:
# Sanity-check the contents: first five rows of each table, returned as a
# tuple so that all three previews appear in a single cell output.
tuple(frame.head() for frame in (logs, sessions, questionnaires))

(   log_id  session_id            timestamp action_type     element_id  \
 0       1        1071  2025-01-01 00:36:23  drill_down    chart_trend   
 1       2         504  2025-01-01 00:31:53       hover      KPI_costs   
 2       3        1046  2025-01-01 00:15:47  drill_down    chart_trend   
 3       4         213  2025-01-01 00:44:35      filter      KPI_costs   
 4       5         310  2025-01-01 00:15:16       click  table_details   
 
    duration_sec  error_flag  decision_made  
 0      1.000000           0              1  
 1     23.347078           0              1  
 2      1.000000           0              1  
 3     15.594869           0              1  
 4      2.166529           0              0  ,
    session_id  user_id interface_version
 0           1      103                 B
 1           2       93                 B
 2           3       15                 A
 3           4      107                 B
 4           5       72                 B,
    user_id  SUS_score  

In [6]:
# Candidate join keys are the column names that two tables have in common.
logs_cols, sessions_cols, questionnaires_cols = (
    set(frame.columns) for frame in (logs, sessions, questionnaires)
)

# Pairwise overlaps of the column-name sets.
common_logs_sessions = logs_cols.intersection(sessions_cols)
common_logs_questionnaires = logs_cols.intersection(questionnaires_cols)
common_sessions_questionnaires = sessions_cols.intersection(questionnaires_cols)

for caption, shared in (
    ("Common columns (logs ∩ sessions):", common_logs_sessions),
    ("Common columns (logs ∩ questionnaires):", common_logs_questionnaires),
    ("Common columns (sessions ∩ questionnaires):", common_sessions_questionnaires),
):
    print(caption, shared)

# A shared name is only a candidate: use the printed sets to decide which
# columns are genuine join keys (e.g., 'session_id', 'user_id') before merging.

Common columns (logs ∩ sessions): {'session_id'}
Common columns (logs ∩ questionnaires): set()
Common columns (sessions ∩ questionnaires): {'user_id'}


In [None]:
# Data-quality baseline: number of missing (NaN) entries in each column,
# reported table by table and followed by a blank line.
for heading, frame in (
    ("Missing values in logs:\n", logs),
    ("Missing values in sessions:\n", sessions),
    ("Missing values in questionnaires:\n", questionnaires),
):
    print(heading, frame.isna().sum(), "\n")

## Mini data schema and variable types

| Dataset          | Granularity          | Key columns                 | Main variable types                             |
|------------------|----------------------|-----------------------------|-------------------------------------------------|
| `logs`           | Per action / event   | `log_id`, `session_id`      | **Behavioral**: `timestamp`, `action_type`, `element_id`, `duration_sec`, `error_flag`, `decision_made` |
| `sessions`       | Per session          | `session_id`, `user_id`     | **Experimental / metadata**: `interface_version`; **link** between logs and questionnaires |
| `questionnaires` | Per user             | `user_id`                   | **Subjective** UX measures: `SUS_score`, `NASA_TLX`, `UES_engagement`, `IMI_autonomy`, `IMI_competence`, `intention_reuse` |

**Subjective variables (questionnaires)**: all scores capturing perceived usability, workload, engagement, autonomy, competence, and intention to reuse.

**Behavioral variables (logs-based)**: all columns in `logs` that describe what the user did and how (actions, elements, timing, errors, decisions).

**Experimental variables (conditions)**: `interface_version` (A vs B) in `sessions`, plus any additional condition/group variables you might add later.

### Obvious data issues (to refine later)

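Run the missing-value cell above and record here any problems you find, for example users who have sessions but no questionnaire row, sessions without any logs, or logs whose `session_id` has no matching session. Based on shapes and dtypes alone, the only visible issue so far is that `timestamp` in `logs` is loaded as `object` (plain strings) and must be converted to a datetime type before any time-based analysis; apart from that, there is no obvious structural problem. Revisit this section after deeper checks (e.g., joins, distributions, outliers).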