# Data Download and Extraction

In [1]:
import os
import zipfile
import urllib.request
from typing import Literal


def get_url(small=True, set_type=Literal['train', 'dev', 'test']):
    if set_type == 'test' and small:
        raise ValueError("Small test set is not available.")
    return f"https://huggingface.co/datasets/yjw1029/MIND/resolve/main/MIND{"small" if small else "large"}_{set_type}.zip"

def download_and_extract_zip(url, extract_to='./data'):
    filename = url.split('/')[-1]
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(url, filename)
        print("Download complete.")
    else:
        print(f"{filename} already exists.")

    if filename.endswith('.zip'):
        print(f"Extracting {filename}...")
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print("Extraction complete.")

    # Clean up the zip file
    os.remove(filename)

    return extract_to

def download_dataset(small=True, ignore_cache=False):
    set_types = ['train', 'dev']
    if not small:
        set_types.append('test')

    for set_type in set_types:
        url = get_url(small=small, set_type=set_type)
        path = f'./data/{"small" if small else "large"}/{set_type}'
        if ignore_cache or not os.path.exists(path):
            download_and_extract_zip(url, extract_to=f'./data/{"small" if small else "large"}/{set_type}')
        else:
            print(f"{set_type} set already exists at {path}, skipping download.")

In [2]:
download_dataset(small=False)

train set already exists at ./data/large/train, skipping download.
dev set already exists at ./data/large/dev, skipping download.
test set already exists at ./data/large/test, skipping download.


# Load and aggregate data files
The goal is to take the raw data files and create 3 csv files: user_features.csv, item_features.csv, interactions.csv

Right now, we have:
- `behaviors.tsv`: The click histories and impression logs of users
- `news.tsv`: The information of news articles
- `entity_embedding.vec`: The embeddings of entities in news extracted from knowledge graph. We might use this later. For now, we will just use the text information in the news articles.
- `relation_embedding.vec`: The embeddings of relations between entities extracted from knowledge graph. In our case, we won't be using this file. This is more useful when we want to use graph neural networks over the knowledge graph.

So the plan is to first create the item features (since the user features will depend on the item features as we will see later), then create the user features, and finally create the interactions file.

## Item Features
Right now, in `news.tsv` we have the following columns: News ID, Category, SubCategory, Title, Abstract, URL (many have expired by now), Title Entities (entities contained in the title of this news), Abstract Entities (entites contained in the abstract of this news).

For now, we won't try retrieving the content of the news articles using the URLs (or finding them through web search) since that would be too time consuming and is likely not necessary to showcase the benefits of the Two-Tower model. In a real world scenario, it would be ideal to retrieve the full text of the articles to get better representations.

Here is how we will initially create the item features:
- Combine the title and abstract into a single text field then use MiniLM to create text embeddings for each news article
- Use the subcategory (not category as well since redundant) as learnable embeddings during model training (better than one-hot encoding since there are many subcategories)
- Potential future improvement: Use the weighted average of the entity embeddings (from `entity_embedding.vec`) as another set of features

In [3]:
import pandas as pd

train_path = './data/large/train/MINDlarge_train/'
val_path = './data/large/dev/MINDlarge_dev/'

def get_news_df(path):
    return pd.read_csv(path + 'news.tsv', sep='\t', header=None, names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])

df_news_train = get_news_df(train_path)
df_news_val = get_news_df(val_path)
df_news_train

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
...,...,...,...,...,...,...,...,...
101522,N115249,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[]
101523,N64337,finance,finance-real-estate,Mansion Monday: Contemporary Des Moines home i...,Among the perks of this unique Des Moines home...,https://assets.msn.com/labs/mind/BBWzQq8.html,"[{""Label"": ""Des Moines, Iowa"", ""Type"": ""G"", ""W...","[{""Label"": ""Des Moines, Iowa"", ""Type"": ""G"", ""W..."
101524,N100102,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[]
101525,N74617,autos,autossports,Best Sports Car Deals for October,,https://assets.msn.com/labs/mind/BBy5rVe.html,"[{""Label"": ""Peugeot RCZ"", ""Type"": ""V"", ""Wikida...",[]


In [4]:
def get_entities_df(path):
    df_ = pd.read_csv(path + 'entity_embedding.vec', sep='\t', index_col=0, header=None)
    df_.index.name = 'entity_id'
    return df_

df_entities_train = get_entities_df(train_path)
df_entities_val = get_entities_df(val_path)
df_entities_train

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,92,93,94,95,96,97,98,99,100,101
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q34433,0.017808,-0.073256,0.102521,-0.059926,-0.060665,0.027027,-0.091728,-0.003057,-0.170798,0.111819,...,0.013433,0.045894,-0.071253,0.086445,-0.120466,0.059235,-0.071865,0.058854,0.024765,
Q41,-0.063388,-0.181451,0.057501,-0.091254,-0.076217,-0.052525,0.050500,-0.224871,-0.018145,0.030722,...,0.001861,0.124535,-0.151043,-0.263698,-0.103607,0.020007,-0.101157,-0.091567,0.035234,
Q56037,0.021550,-0.044888,-0.027872,-0.128843,0.066651,-0.072159,0.019879,-0.183956,0.080640,0.069166,...,0.091477,0.091388,-0.027993,-0.112258,-0.231887,0.095612,-0.008997,-0.157394,0.088364,
Q1860,0.060958,0.069934,0.015832,0.079471,-0.023362,-0.125007,-0.043618,0.134063,-0.121691,0.089166,...,-0.014287,0.013578,0.099977,0.012199,-0.141138,0.056129,-0.133727,0.025795,0.051448,
Q7737,-0.021237,0.176011,-0.078886,0.041470,0.136488,-0.063177,-0.013134,-0.048977,-0.072779,0.062055,...,-0.162670,-0.101399,0.039757,0.123693,-0.129503,0.153088,-0.014275,-0.113625,-0.097839,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q54860790,0.034682,-0.009413,-0.024317,0.073895,0.028052,0.028039,0.039260,0.017398,0.017743,-0.007708,...,0.034473,0.035736,0.008329,-0.049981,-0.025212,-0.018404,0.004110,0.013771,-0.008027,
Q54861457,-0.074425,-0.042263,-0.009502,0.033632,-0.092861,-0.067328,-0.009075,0.048518,-0.049856,-0.020152,...,0.077567,0.055743,-0.035487,0.041439,-0.016274,0.031114,-0.001480,-0.057676,-0.018141,
Q54862508,-0.052323,-0.078029,-0.060925,-0.052536,0.006802,-0.070488,-0.081736,0.026385,-0.037127,0.057764,...,0.045298,0.009842,-0.019821,-0.033952,-0.047436,0.062752,0.043236,0.032251,-0.001261,
Q54866839,0.015159,0.021187,0.059618,-0.007465,-0.038469,0.047728,-0.049030,0.070997,-0.063303,-0.008217,...,0.013430,-0.028223,-0.021695,0.025361,-0.003214,0.020141,-0.028943,-0.046590,-0.027286,


In [5]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cuda


In [6]:
from sentence_transformers import SentenceTransformer


model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

In [None]:
df_items_features_train = df_news_train.copy()
df_items_features_val = df_news_val.copy()

def create_df_items_features(df_news):
    df_items_features = df_news.copy()

    # Combine title and abstract
    texts = (df_news['title'].fillna('') + ' ' + df_news['abstract'].fillna('')).str.strip()

    # Create text embeddings
    embeddings = model.encode(texts.tolist(), show_progress_bar=True, normalize_embeddings=True, batch_size=512, device=device)
    embedding_cols = [f"text_embedding_{i}" for i in range(embeddings.shape[1])]
    embeddings_df = pd.DataFrame(
        embeddings,
        index=df_news.index,
        columns=embedding_cols
    )
    df_items_features = pd.concat([df_items_features, embeddings_df], axis=1)

    # Make the subcategory an int (this will be used as learnable embeddings during model training)
    df_items_features['subcategory'] = df_items_features['subcategory'].astype('category').cat.codes

    # Keep only relevant columns
    df_items_features = df_items_features[['news_id', 'subcategory'] + embedding_cols]

    return df_items_features

df_items_features_train = create_df_items_features(df_news_train)
df_items_features_val = create_df_items_features(df_news_val)

Batches:   0%|          | 0/199 [00:00<?, ?it/s]

Batches:   0%|          | 0/141 [00:00<?, ?it/s]

In [10]:
df_items_features_train

Unnamed: 0,news_id,subcategory,text_embedding_0,text_embedding_1,text_embedding_2,text_embedding_3,text_embedding_4,text_embedding_5,text_embedding_6,text_embedding_7,...,text_embedding_374,text_embedding_375,text_embedding_376,text_embedding_377,text_embedding_378,text_embedding_379,text_embedding_380,text_embedding_381,text_embedding_382,text_embedding_383
0,N88753,150,-0.026685,0.085729,0.054504,0.004929,0.066577,0.036270,0.025842,-0.059881,...,-0.025451,-0.042500,0.011001,0.047129,-0.023788,-0.017850,0.038645,-0.098548,0.044370,-0.019333
1,N45436,196,0.008810,-0.062548,0.036452,0.001428,0.048341,0.027927,-0.106606,0.000490,...,0.014079,-0.086675,-0.069874,-0.049767,-0.030836,0.000199,0.141212,-0.096237,0.042062,0.059672
2,N23144,280,0.028438,0.033090,0.062954,0.095190,0.006079,-0.011566,0.007766,-0.001489,...,0.011645,0.003554,-0.008901,-0.045818,-0.034742,0.080959,0.062710,-0.008410,-0.029259,-0.018737
3,N86255,160,0.013335,0.075702,0.010068,-0.032163,-0.006110,-0.029779,0.080685,-0.001943,...,0.033077,0.027680,-0.002254,-0.020419,-0.111510,-0.115437,0.006116,-0.046950,0.042515,0.019368
4,N93187,204,-0.020725,0.120532,0.044878,0.040875,0.067311,-0.044708,0.038347,0.044914,...,0.033428,-0.051004,0.061394,-0.028540,-0.040449,-0.007994,-0.076474,-0.088807,-0.000027,0.013815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101522,N115249,166,0.069134,0.070063,0.053358,-0.127729,0.078482,0.079536,-0.049262,0.000635,...,-0.002426,0.007300,0.018272,0.053605,-0.033054,-0.043165,-0.042101,-0.059410,0.063675,0.018943
101523,N64337,84,0.089047,0.052699,0.074834,-0.065976,-0.052298,0.023899,-0.078924,-0.004321,...,-0.011336,0.038797,-0.085335,-0.083584,0.022155,-0.007747,0.028578,0.050396,-0.046023,-0.015477
101524,N100102,237,-0.029745,0.014321,-0.009105,-0.004291,0.015084,0.065064,0.063336,0.018011,...,0.057523,-0.035574,-0.064579,0.063250,-0.047341,0.019596,0.000821,0.042760,0.034391,0.037697
101525,N74617,22,0.005110,0.027955,0.009607,-0.018577,-0.031110,0.035198,-0.019995,0.020659,...,-0.001259,0.038549,-0.055825,-0.032130,-0.038240,-0.041084,0.012207,-0.116462,0.026692,0.082489


## User Features
In `behaviors.tsv` we have the following columns: Impressions ID, User ID, Time, History (ordered clicked news IDs separated by space), Impressions (the list of news IDs shown to the user with click labels).

The way I understand it is that each row corresponds to a single session that starts at <Time> for <User ID>. During this session, the user was shown the list of news articles in <Impressions> and clicked on some of them (indicated by the label 1). The <History> column contains the news articles that the user had clicked on in the past.

These impressions are those made by randomly selected users during the 5th week of the MIND dataset collection period. The history contains the clicked news articles from the first 4 weeks. This means that a user will have the same history for all impressions in week 5 even if in theory, their history technically evolves each time they read an article in week 5. To stay faithful to the dataset, we will not augment the history with clicks made in week 5.

Here is how we will create the user features:
- **History-based text embeddings with prior**: For each user, we will take the news articles in their history, get the text embeddings from the item features we created earlier, and average them to create a single text embedding representing the user's interests. To avoid issues with users who have very short histories or no histories, we will add a prior embedding which is the average embedding of all news articles in the dataset. This way, even if a user has only clicked on one article, their embedding will still contain some general information about the news domain. The more articles a user has in their history, the less influence the prior embedding will have. The formula is as follows:
\begin{align*}
\text{history\_embedding} = \frac{\lambda \cdot \text{prior\_embedding} + \sum_{i=1}^{N} \text{article\_embedding}_i}{\lambda + N}
\end{align*}
where $N$ is the number of articles in the user's history.
- **History-length ($N$)**: The number of articles in the user's history scaled and normalized. This gives the model a sense of how active the user is. This can also help the model distinguish between users with very short histories and those with long histories but that read articles around the global prior.
- That's it (unfortunately we don't have more demographic information about the users in this dataset). Ideally, we would want things such as age, gender, location, reading patterns, etc, etc. However, as with many public datasets, this information is not available due to privacy concerns.

In [12]:
def load_behaviors(path):
    df_ = pd.read_csv(path + 'behaviors.tsv', sep='\t', header=None, names=['impression_id', 'user_id', 'time', 'history', 'impressions'])
    df_['time'] = pd.to_datetime(df_['time'])
    return df_

df_behaviors_train = load_behaviors(train_path)
df_behaviors_val = load_behaviors(val_path)
df_behaviors_train

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U87243,2019-11-10 11:30:54,N8668 N39081 N65259 N79529 N73408 N43615 N2937...,N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...
1,2,U598644,2019-11-12 13:45:29,N56056 N8726 N70353 N67998 N83823 N111108 N107...,N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...
2,3,U532401,2019-11-13 11:23:03,N128643 N87446 N122948 N9375 N82348 N129412 N5...,N103852-0 N53474-0 N127836-0 N47925-1
3,4,U593596,2019-11-12 12:24:09,N31043 N39592 N4104 N8223 N114581 N92747 N1207...,N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...
4,5,U239687,2019-11-14 20:03:01,N65250 N122359 N71723 N53796 N41663 N41484 N11...,N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3...
...,...,...,...,...,...
2232743,2232744,U316192,2019-11-13 18:50:02,N122359 N37069 N95876 N28787 N73408 N11266 N61321,N113723-0 N123683-1 N5287-0 N76677-0 N53474-0
2232744,2232745,U451238,2019-11-12 08:54:06,N12575 N93816 N71643 N87236 N87236,N18861-0 N20990-0 N43085-0 N7937-1
2232745,2232746,U151246,2019-11-13 12:42:51,N27587 N49668,N39887-1 N22811-0 N110709-1 N1923-0 N24001-1 N...
2232746,2232747,U330725,2019-11-12 13:22:57,N121944 N91510 N42280 N60061 N63032 N125223 N4...,N18947-0 N88808-1 N10012-0 N38902-0 N33078-0 N...


In [13]:
# Inspect a random user's impressions
random_user_id = df_behaviors_train['user_id'].sample(1, random_state=42).values[0]
user_impression = df_behaviors_train[df_behaviors_train['user_id'] == random_user_id]
user_impression

Unnamed: 0,impression_id,user_id,time,history,impressions
32511,32512,U694470,2019-11-14 06:21:29,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N110925-1 N40218-0 N116711-1 N45522-0 N10646-0...
49794,49795,U694470,2019-11-09 17:48:52,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N27703-0 N99362-0 N56201-0 N46320-0 N94279-0 N...
148142,148143,U694470,2019-11-12 04:55:12,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N2750-1 N109344-0 N95261-0 N83412-0 N52079-0 N...
200377,200378,U694470,2019-11-14 13:12:35,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N95301-0 N41523-0 N119621-0 N84391-0 N110603-0...
241011,241012,U694470,2019-11-13 18:07:21,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N127134-0 N88472-0 N42876-0 N113723-0 N24701-0...
287977,287978,U694470,2019-11-11 02:59:54,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N645-0 N24150-0 N30119-0 N48197-0 N101624-0 N3...
305371,305372,U694470,2019-11-11 14:31:22,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N102111-0 N33037-0 N1643-0 N85452-0 N54493-0 N...
328406,328407,U694470,2019-11-14 19:40:42,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N78508-0 N104990-0 N55761-0 N51163-1 N86258-0 ...
500450,500451,U694470,2019-11-12 04:58:33,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N49697-0 N123724-1 N71353-0 N48418-0 N52079-0 ...
599716,599717,U694470,2019-11-10 04:42:26,N3794 N69106 N39081 N92191 N12959 N128643 N996...,N30350-0 N1643-0 N120612-1 N9835-0 N12975-0 N3...


In [19]:
import numpy as np
from tqdm import tqdm

tqdm.pandas()

def create_df_user_features(df_behaviors, df_items_features):
    df_user_features = pd.DataFrame()
    text_embedding_cols = [c for c in df_items_features.columns 
                           if c.startswith("text_embedding_")]

    # Precompute prior embedding
    embedding_matrix = df_items_features[text_embedding_cols].to_numpy()
    prior_embedding = embedding_matrix.mean(axis=0)

    # Hash map of embeddings for faster lookup
    item_embedding_map = dict(zip(df_items_features['news_id'], list(range(len(df_items_features)))))

    def compute_user_embedding(history, lambda_prior=5.0):
        if pd.isna(history) or history.strip() == '':
            return prior_embedding
        history_ids = history.split()
        history_embeddings_idx = [item_embedding_map[news_id] for news_id in history_ids if news_id in item_embedding_map]
        if len(history_embeddings_idx) != len(history_ids):
            print(f"Warning: Some history IDs not found in item embeddings: {set(history_ids) - set(item_embedding_map.keys())}")
        N = len(history_embeddings_idx)  # even if 0, the following formula works
        history_embedding_sum = embedding_matrix[history_embeddings_idx].sum(axis=0)
        history_embedding = (lambda_prior * prior_embedding + history_embedding_sum) / (lambda_prior + N)
        return history_embedding

    df_user_features['user_id'] = df_behaviors['user_id'].unique()
    df_user_features = df_user_features.merge(
        df_behaviors[['user_id', 'history']].drop_duplicates().set_index('user_id'),
        left_on='user_id', right_index=True
    )
    history_embedding_cols = [f'history_embedding_{i}' for i in range(len(text_embedding_cols))]
    history_embeddings = df_user_features['history'].progress_apply(compute_user_embedding)
    history_embeddings = np.vstack(history_embeddings.to_numpy())
    history_embeddings = pd.DataFrame(history_embeddings, columns=history_embedding_cols, index=df_user_features.index)
    df_user_features = pd.concat([df_user_features, history_embeddings], axis=1)

    # History length feature
    history_lengths = df_user_features['history'].apply(lambda x: 0 if pd.isna(x) or x.strip() == '' else len(x.split()))
    df_user_features['history_length'] = (history_lengths - history_lengths.mean()) / history_lengths.std()

    df_user_features = df_user_features[['user_id', 'history_length'] + history_embedding_cols]
    return df_user_features

df_user_features_train = create_df_user_features(df_behaviors_train, df_items_features_train)
df_user_features_val = create_df_user_features(df_behaviors_val, df_items_features_val)

100%|██████████| 711222/711222 [00:31<00:00, 22252.37it/s]
100%|██████████| 255990/255990 [00:13<00:00, 19023.25it/s]


In [20]:
df_user_features_train

Unnamed: 0,user_id,history_length,history_embedding_0,history_embedding_1,history_embedding_2,history_embedding_3,history_embedding_4,history_embedding_5,history_embedding_6,history_embedding_7,...,history_embedding_374,history_embedding_375,history_embedding_376,history_embedding_377,history_embedding_378,history_embedding_379,history_embedding_380,history_embedding_381,history_embedding_382,history_embedding_383
0,U87243,-0.113898,-0.020163,0.006853,0.016440,0.011475,0.016079,0.008812,0.013410,-0.013011,...,0.007669,0.004010,-0.001669,0.025207,-0.005907,0.032014,0.014140,-0.021133,-0.000959,0.009830
1,U598644,0.214963,-0.003177,0.012893,0.003563,0.010115,0.035071,0.002344,0.006872,-0.022457,...,0.022486,-0.025163,-0.004867,0.006834,-0.010605,-0.008235,0.027050,-0.001892,-0.006158,0.019890
2,U532401,-0.113898,-0.011405,0.008811,0.019550,0.008812,0.024872,0.006942,0.016555,-0.013089,...,0.017374,-0.014042,-0.009755,0.001579,-0.005928,0.017325,0.042693,-0.016803,0.008381,0.005796
3,U593596,-0.237220,0.005366,0.010071,0.015982,0.004504,0.043494,0.008458,0.020806,-0.006073,...,0.020030,-0.009869,0.001712,-0.014635,-0.011543,0.021738,0.016516,-0.002363,-0.005752,0.021640
4,U239687,13.163839,-0.004183,0.008913,0.010880,0.002959,0.029287,0.004159,0.015215,-0.003005,...,0.023980,-0.001280,-0.012619,-0.011001,-0.015992,0.003918,0.007915,-0.018138,0.005007,0.011059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
711217,U248404,-0.319435,-0.015776,0.012488,0.012697,-0.005593,0.024868,-0.016672,0.020679,-0.017932,...,0.036773,-0.015126,0.009563,-0.005411,-0.012725,-0.002028,0.035155,-0.008244,0.022585,0.002657
711218,U437168,-0.442758,-0.015121,-0.000805,0.024986,0.009593,0.014924,-0.016766,0.009885,0.008681,...,0.031332,0.002291,-0.017647,-0.008343,-0.020513,0.001591,0.002563,-0.039723,0.011032,0.012920
711219,U209837,-0.689403,-0.017339,0.019209,0.001560,0.014451,0.041774,0.007940,0.021513,-0.004246,...,0.026102,-0.011960,-0.007464,-0.013692,-0.005988,0.000485,0.003980,-0.013046,0.008689,0.000225
711220,U501719,-0.524973,-0.002593,-0.016218,0.002364,0.015778,0.038665,0.020278,-0.002308,-0.000113,...,0.025383,-0.002146,-0.014260,-0.014705,-0.011928,-0.006887,0.009587,-0.017565,-0.007816,0.019607


## Interactions
Here, we actually need to create 2 seperate interaction matrix files, one for each model.
### Matrix Factorization model
The first thing to note here is that the vanilla MF model doesn't differentiate between negative interactions (items that were shown but not clicked) and unknown interactions (items that were never shown). This is a limitation of the MF model since in reality, we would want to treat these 2 types of interactions differently. There are variations of MF that can handle this [[1]](http://yifanhu.net/PUB/cf.pdf). However, since the goal is to compare the Two-Tower model against the vanilla MF model, we will stick to the basic MF formulation here.

The second thing is that there might be multiple impressions for the same user and item pair. For example, a user might have been shown the same article multiple times during the week. It is also possible that the user clicked on the article one time but not the other times. However, for the MF model, we can only have a single interaction value for each user-item pair. To resolve this, we will define the interaction value as the ratio of clicks to impressions for that user-item pair. For example, if a user was shown an article 3 times and clicked on it once, the interaction value will be 1/3 = 0.33. If the user never clicked on it, the interaction value will be 0. If the user always clicked on it, the interaction value will be 1.

Finally, saving the interaction matrix in a dense format (i.e., a full matrix with all user-item pairs) would be very inefficient in terms of storage since most entries would be zero (sparse interactions). Therefore, we will save the interaction matrix in a sparse format where we only store the non-zero interactions.

### Two-Tower model
Interactions for the two-tower model are a bit different. Indeed, we want each sample to correspond to a single impression. This impression will have the user ID, the list of news articles shown to the user, and the click labels for each article as a mask.

In [21]:
def get_ctr_labels(impressions_str):
    impressions = impressions_str.split()
    items_to_ctr = {}
    for imp in impressions:
        news_id, label = imp.split('-')
        label = int(label)
        if news_id not in items_to_ctr:
            items_to_ctr[news_id] = [0, 0]  # [clicks, impressions]
        items_to_ctr[news_id][1] += 1
        if label == 1:
            items_to_ctr[news_id][0] += 1

    items_to_ctr = [(news_id, clicks / imps) for news_id, (clicks, imps) in items_to_ctr.items() if clicks > 0]
    return items_to_ctr

def create_mf_interactions(df_behaviors):
    df_mf_interactions = df_behaviors[['user_id', 'impressions']].copy()
    df_mf_interactions = df_mf_interactions.groupby('user_id')['impressions'].apply(lambda x: ' '.join(x)).reset_index()
    df_mf_interactions['impressions'] = df_mf_interactions['impressions'].apply(get_ctr_labels)

    # make each user-item pair a separate row
    df_mf_interactions = df_mf_interactions.explode('impressions')
    df_mf_interactions['news_id'] = df_mf_interactions['impressions'].apply(lambda x: x[0])
    df_mf_interactions['label'] = df_mf_interactions['impressions'].apply(lambda x: x[1])
    df_mf_interactions = df_mf_interactions[['user_id', 'news_id', 'label']]

    return df_mf_interactions

df_mf_interactions_train = create_mf_interactions(df_behaviors_train)
df_mf_interactions_val = create_mf_interactions(df_behaviors_val)

df_mf_interactions_train

Unnamed: 0,user_id,news_id,label
0,U0,N35189,1.000000
0,U0,N38813,1.000000
0,U0,N51163,1.000000
1,U1,N87684,1.000000
1,U1,N89932,0.333333
...,...,...,...
711220,U99998,N110303,0.500000
711220,U99998,N30899,1.000000
711220,U99998,N41315,1.000000
711220,U99998,N66780,1.000000


In [22]:
def create_two_tower_interactions(df_behaviors):
    df_two_tower_interactions = df_behaviors[['user_id', 'impressions']].copy()
    df_two_tower_interactions['impressions'] = df_two_tower_interactions['impressions'].apply(lambda x: [imp.split('-') for imp in x.split()])
    df_two_tower_interactions['news_ids'] = df_two_tower_interactions['impressions'].apply(lambda x: [imp[0] for imp in x])
    df_two_tower_interactions['labels'] = df_two_tower_interactions['impressions'].apply(lambda x: [int(imp[1]) for imp in x])
    df_two_tower_interactions = df_two_tower_interactions[['user_id', 'news_ids', 'labels']]
    return df_two_tower_interactions

df_two_tower_interactions_train = create_two_tower_interactions(df_behaviors_train)
df_two_tower_interactions_val = create_two_tower_interactions(df_behaviors_val)
df_two_tower_interactions_train

Unnamed: 0,user_id,news_ids,labels
0,U87243,"[N78206, N26368, N7578, N58592, N19858, N58258...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, ..."
1,U598644,"[N47996, N82719, N117066, N8491, N123784, N214...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ..."
2,U532401,"[N103852, N53474, N127836, N47925]","[0, 0, 0, 1]"
3,U593596,"[N38902, N76434, N71593, N100073, N108736, N30...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,U239687,"[N76209, N48841, N67937, N62235, N6307, N34254...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
2232743,U316192,"[N113723, N123683, N5287, N76677, N53474]","[0, 1, 0, 0, 0]"
2232744,U451238,"[N18861, N20990, N43085, N7937]","[0, 0, 0, 1]"
2232745,U151246,"[N39887, N22811, N110709, N1923, N24001, N7667...","[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2232746,U330725,"[N18947, N88808, N10012, N38902, N33078, N1003...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [23]:
# Compute the imbalance ratio for each dataset
def compute_imbalance_ratio_mf(df_, num_users, num_items):
    positive = (df_['label'] != 0).sum()
    negative = len(df_) - positive
    unknown = num_users * num_items - len(df_)
    return {'positive': positive, 'negative': negative, 'unknown': unknown,
            'positive_ratio': positive / (positive + negative + unknown),
            'negative_ratio': negative / (positive + negative + unknown),
            'unknown_ratio': unknown / (positive + negative + unknown)}

def compute_imbalance_ratio_two_tower(df_, num_users, num_items):
    # Flatten each column in a vectorized way
    lens = df_['news_ids'].apply(len).to_numpy()
    user_id_flat = np.repeat(df_['user_id'].to_numpy(), lens)
    news_ids_flat = np.concatenate(df_['news_ids'].to_numpy())
    labels_flat   = np.concatenate(df_['labels'].to_numpy())

    flat = pd.DataFrame({
        'user_id':  user_id_flat,
        'news_ids': news_ids_flat,
        'labels':   labels_flat,
    })
    
    # We must first explode the labels/news_ids
    flat = flat.groupby(by=['user_id', 'news_ids']).max().reset_index()

    # Pass it to the mf imbalance ratio function, since it is now in the same format
    return compute_imbalance_ratio_mf(flat.rename(columns={'news_ids': 'news_id', 'labels': 'label'}), num_users, num_items)

def display_imbalance_ratios(df_mf, df_two_tower, df_user_features, df_item_features):
    num_users = df_user_features.shape[0]
    num_items = df_item_features.shape[0]

    # Display imbalance ratios cleanly
    for model_name, df_ in [('MF', df_mf), ('Two-Tower', df_two_tower)]:
        if model_name == 'MF':
            imbalance = compute_imbalance_ratio_mf(df_, num_users, num_items)
        else:
            imbalance = compute_imbalance_ratio_two_tower(df_, num_users, num_items)
        print(f"{model_name} Interactions Imbalance:")
        print(f"  Positive: {imbalance['positive']} ({imbalance['positive_ratio']:.6f})")
        print(f"  Negative: {imbalance['negative']} ({imbalance['negative_ratio']:.6f})")
        print(f"  Unknown: {imbalance['unknown']} ({imbalance['unknown_ratio']:.6f})")

print("Training Set Imbalance Ratios:")
display_imbalance_ratios(df_mf_interactions_train, df_two_tower_interactions_train, df_user_features_train, df_items_features_train)
print("\nValidation Set Imbalance Ratios:")
display_imbalance_ratios(df_mf_interactions_val, df_two_tower_interactions_val, df_user_features_val, df_items_features_val)

Training Set Imbalance Ratios:
MF Interactions Imbalance:
  Positive: 3356405 (0.000046)
  Negative: 0 (0.000000)
  Unknown: 72204879589 (0.999954)
Two-Tower Interactions Imbalance:
  Positive: 3356405 (0.000046)
  Negative: 67954578 (0.000941)
  Unknown: 72136925011 (0.999012)

Validation Set Imbalance Ratios:
MF Interactions Imbalance:
  Positive: 571411 (0.000031)
  Negative: 0 (0.000000)
  Unknown: 18436596359 (0.999969)
Two-Tower Interactions Imbalance:
  Positive: 571411 (0.000031)
  Negative: 12068886 (0.000655)
  Unknown: 18424527473 (0.999314)


## Saving the processed data

In [24]:
def save_processed_data(base_path, df_items_features, df_user_features, df_mf_interactions, df_two_tower_interactions):
    os.makedirs(base_path, exist_ok=True)

    # Save item features
    df_items_features.to_csv(base_path + 'item_features.csv', index=False)

    # Save user features
    df_user_features.to_csv(base_path + 'user_features.csv', index=False)

    # Save MF interactions
    df_mf_interactions.to_csv(base_path + 'mf_interactions.csv', index=False)

    # Save Two-Tower interactions
    df_two_tower_interactions.to_csv(base_path + 'two_tower_interactions.csv', index=False)

print("Saving train")
save_processed_data('./data/large/train/processed/',
                    df_items_features_train,
                    df_user_features_train,
                    df_mf_interactions_train,
                    df_two_tower_interactions_train)
print("Saving val")
save_processed_data('./data/large/dev/processed/',
                    df_items_features_val,
                    df_user_features_val,
                    df_mf_interactions_val,
                    df_two_tower_interactions_val)

Saving train
Saving val


In [28]:
df_two_tower_interactions_val['labels'].apply(sum).describe()

count    376471.000000
mean          1.526930
std           1.169522
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          39.000000
Name: labels, dtype: float64