# Setting Up the EB-NeRD Notebook for Custom MIND Extension

In [133]:
# Follow the instructions here to download the codebase: https://github.com/ebanalyse/ebnerd-benchmark
# Register here to download the datasets: https://recsys.eb.dk/#registration
# Download: ebnerd_demo, ebnerd_large, ebnerd_small

from pathlib import Path
import polars as pl

from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    truncate_history
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_USER_COL
)

## Specify Path/Folder and Load Parquet Files:

In [134]:
# File name of the MIND-format TSV that the export step writes
filename = "behavior.tsv"

# Dataset split to convert; each dataset has a train and a validation folder
data_split = "train"

# Folder where the dataset was unzipped (this example points at the demo dataset)
root = "C:/demo"
PATH = Path(root)

In [135]:
# Only the behaviors (impressions) and history files of the chosen split are needed;
# the goal is to combine the two into the MIND format.
df_behaviors = pl.scan_parquet(PATH / data_split / "behaviors.parquet")
df_history = pl.scan_parquet(PATH / data_split / "history.parquet")

## Merge Behavior and History Files

In [136]:
# Reduce the history file to the two columns needed: user ID and article-ID history
df_history = df_history.select([DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL])

# Optional: if the resulting file gets too big, truncate the history by enabling
# the statement below.
# df_history = df_history.select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL).pipe(
#     truncate_history,
#     column=DEFAULT_HISTORY_ARTICLE_ID_COL,
#     history_size=30,
#     padding_value=0,
#     enable_warning=False,
# )

# Left-join the history onto the behavior file, matching rows on the user ID
df = slice_join_dataframes(
    df1=df_behaviors.collect(),
    df2=df_history.collect(),
    how="left",
    on=DEFAULT_USER_COL,
)

df.head(n=5)

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,article_id_fixed
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32,list[i32]
48401,,2023-05-21 21:06:50,21.0,,2,"[9774516, 9771051, … 9759966]",[9759966],22779,False,,,,False,21,16.0,27.0,"[9738452, 9737521, … 9770541]"
152513,9778745.0,2023-05-24 07:31:26,30.0,100.0,1,"[9778669, 9778736, … 9777397]",[9778661],150224,False,,,,False,298,2.0,48.0,"[9740087, 9741986, … 9735909]"
155390,,2023-05-24 07:30:33,45.0,,1,"[9778369, 9777856, … 9778448]",[9777856],160892,False,,,,False,401,215.0,100.0,"[9738557, 9738211, … 9770178]"
214679,,2023-05-23 05:25:40,33.0,,2,"[9776715, 9776406, … 9776855]",[9776566],1001055,False,,,,False,1357,40.0,47.0,"[9738777, 9738663, … 9769981]"
214681,,2023-05-23 05:31:54,21.0,,2,"[9775202, 9776855, … 9776570]",[9776553],1001055,False,,,,False,1358,5.0,49.0,"[9738777, 9738663, … 9769981]"


In [137]:

# Keep just the columns that have a MIND counterpart:
# user ID, timestamp, history, and impressions (inview + clicked)
mind = df.select(
    [
        "user_id",
        "impression_time",
        "article_id_fixed",
        "article_ids_inview",
        "article_ids_clicked",
    ]
)

# Prepend a row index named "id" that starts counting at 1
mind_indexed = mind.with_row_index(name="id", offset=1)

# TSV has no nested structures, so every list column is turned into a single
# ", "-separated string (elements are cast to strings first)
mind_flat = mind_indexed.with_columns(
    [
        pl.col(list_column).cast(pl.List(pl.Utf8)).list.join(", ")
        for list_column in ("article_id_fixed", "article_ids_inview", "article_ids_clicked")
    ]
)

mind_flat.head(n=5)


id,user_id,impression_time,article_id_fixed,article_ids_inview,article_ids_clicked
u32,u32,datetime[μs],str,str,str
1,22779,2023-05-21 21:06:50,"""9738452, 97375…","""9774516, 97710…","""9759966"""
2,150224,2023-05-24 07:31:26,"""9740087, 97419…","""9778669, 97787…","""9778661"""
3,160892,2023-05-24 07:30:33,"""9738557, 97382…","""9778369, 97778…","""9777856"""
4,1001055,2023-05-23 05:25:40,"""9738777, 97386…","""9776715, 97764…","""9776566"""
5,1001055,2023-05-23 05:31:54,"""9738777, 97386…","""9775202, 97768…","""9776553"""


In [139]:
def _label_impressions(inview, clicked):
    """Append a click label to every in-view article ID of one impression.

    Args:
        inview: The in-view article IDs as one ", "-separated string, or None.
        clicked: The clicked article IDs as one ", "-separated string, or None.

    Returns:
        A ", "-separated string in which every in-view ID carries the suffix
        "-1" if it also appears in `clicked` and "-0" otherwise. An empty
        `inview` yields an empty string; a None `inview` yields None.
    """
    if inview is None:
        return None

    # A set gives constant-time membership checks; each row can have several clicks
    clicked_ids = set(clicked.split(", ")) if clicked else set()

    return ", ".join(
        article + ("-1" if article in clicked_ids else "-0")
        # "".split(", ") returns [""], so blank entries are skipped to avoid
        # emitting a bare "-0"/"-1" for an impression without in-view articles
        for article in inview.split(", ")
        if article
    )


# Build the labelled impression string for every row in one pass
labelled_inview = [
    _label_impressions(inview, clicked)
    for inview, clicked in mind_flat.select(
        ["article_ids_inview", "article_ids_clicked"]
    ).iter_rows()
]

# Reformatted dataframe with the exact column count to match MIND: the clicked
# column is dropped and the in-view column is replaced once, in its original
# position, instead of being written cell by cell
mind_update = mind_flat.select(pl.exclude("article_ids_clicked")).with_columns(
    pl.Series("article_ids_inview", labelled_inview, dtype=pl.Utf8)
)

print(mind_update)

shape: (24_724, 5)
┌───────┬─────────┬─────────────────────┬────────────────────────────┬───────────────────────┐
│ id    ┆ user_id ┆ impression_time     ┆ article_id_fixed           ┆ article_ids_inview    │
│ ---   ┆ ---     ┆ ---                 ┆ ---                        ┆ ---                   │
│ u32   ┆ u32     ┆ datetime[μs]        ┆ str                        ┆ str                   │
╞═══════╪═════════╪═════════════════════╪════════════════════════════╪═══════════════════════╡
│ 1     ┆ 22779   ┆ 2023-05-21 21:06:50 ┆ 9738452, 9737521, 9738760, ┆ 9774516-0, 9771051-0, │
│       ┆         ┆                     ┆ 97337…                     ┆ 9770028-0,…           │
│ 2     ┆ 150224  ┆ 2023-05-24 07:31:26 ┆ 9740087, 9741986, 9740591, ┆ 9778669-0, 9778736-0, │
│       ┆         ┆                     ┆ 97418…                     ┆ 9778623-0,…           │
│ 3     ┆ 160892  ┆ 2023-05-24 07:30:33 ┆ 9738557, 9738211, 9736646, ┆ 9778369-0, 9777856-1, │
│       ┆         ┆            

In [140]:
# Write the reformatted frame as a tab-separated file without a header row,
# then show where it was written
target = Path(f"{root}/{filename}")
mind_update.write_csv(target, separator="\t", include_header=False)
print(target)


C:\demo\behavior.tsv
