# Data Download and Extraction

In [27]:
import os
import zipfile
import urllib.request
from typing import Literal


def get_url(small=True, set_type=Literal['train', 'dev', 'test']):
    if set_type == 'test' and small:
        raise ValueError("Small test set is not available.")
    return f"https://huggingface.co/datasets/yjw1029/MIND/resolve/main/MIND{"small" if small else "large"}_{set_type}.zip"

def download_and_extract_zip(url, extract_to='./data'):
    filename = url.split('/')[-1]
    if not os.path.exists(filename):
        print(f"Downloading {filename}...")
        urllib.request.urlretrieve(url, filename)
        print("Download complete.")
    else:
        print(f"{filename} already exists.")

    if filename.endswith('.zip'):
        print(f"Extracting {filename}...")
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print("Extraction complete.")

    # Clean up the zip file
    os.remove(filename)

    return extract_to

def download_dataset(small=True, ignore_cache=False):
    set_types = ['train', 'dev']
    if not small:
        set_types.append('test')

    for set_type in set_types:
        url = get_url(small=small, set_type=set_type)
        path = f'./data/{"small" if small else "large"}/{set_type}'
        if ignore_cache or not os.path.exists(path):
            download_and_extract_zip(url, extract_to=f'./data/{"small" if small else "large"}/{set_type}')
        else:
            print(f"{set_type} set already exists at {path}, skipping download.")

In [28]:
download_dataset(small=False)

train set already exists at ./data/large/train, skipping download.
dev set already exists at ./data/large/dev, skipping download.
test set already exists at ./data/large/test, skipping download.


# Load and aggregate data files
The goal is to take the raw data files and create 3 csv files: user_features.csv, item_features.csv, interactions.csv

Right now, we have:
- `behaviors.tsv`: The click histories and impression logs of users
- `news.tsv`: The information of news articles
- `entity_embedding.vec`: The embeddings of entities in news extracted from knowledge graph. We might use this later. For now, we will just use the text information in the news articles.
- `relation_embedding.vec`: The embeddings of relations between entities extracted from knowledge graph. In our case, we won't be using this file. This is more useful when we want to use graph neural networks over the knowledge graph.

So the plan is to first create the item features (since the user features will depend on the item features as we will see later), then create the user features, and finally create the interactions file.

## Item Features
Right now, in `news.tsv` we have the following columns: News ID, Category, SubCategory, Title, Abstract, URL (many have expired by now), Title Entities (entities contained in the title of this news), Abstract Entities (entites contained in the abstract of this news).

For now, we won't try retrieving the content of the news articles using the URLs (or finding them through web search) since that would be too time consuming and is likely not necessary to showcase the benefits of the Two-Tower model. In a real world scenario, it would be ideal to retrieve the full text of the articles to get better representations.

Here is how we will initially create the item features:
- Combine the title and abstract into a single text field then use MiniLM to create text embeddings for each news article
- Use the subcategory (not category as well since redundant) as learnable embeddings during model training (better than one-hot encoding since there are many subcategories)
- Potential future improvement: Use the weighted average of the entity embeddings (from `entity_embedding.vec`) as another set of features

In [29]:
import pandas as pd

train_path = './data/small/train/MINDsmall_train/'
val_path = './data/small/dev/MINDsmall_dev/'

def get_news_df(path):
    return pd.read_csv(path + 'news.tsv', sep='\t', header=None, names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])

df_news_train = get_news_df(train_path)
df_news_val = get_news_df(val_path)
df_news_train

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."
...,...,...,...,...,...,...,...,...
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid..."
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[]
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[]
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[]


In [30]:
def get_entities_df(path):
    df_ = pd.read_csv(path + 'entity_embedding.vec', sep='\t', index_col=0, header=None)
    df_.index.name = 'entity_id'
    return df_

df_entities_train = get_entities_df(train_path)
df_entities_val = get_entities_df(val_path)
df_entities_train

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,92,93,94,95,96,97,98,99,100,101
entity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q41,-0.063388,-0.181451,0.057501,-0.091254,-0.076217,-0.052525,0.050500,-0.224871,-0.018145,0.030722,...,0.001861,0.124535,-0.151043,-0.263698,-0.103607,0.020007,-0.101157,-0.091567,0.035234,
Q1860,0.060958,0.069934,0.015832,0.079471,-0.023362,-0.125007,-0.043618,0.134063,-0.121691,0.089166,...,-0.014287,0.013578,0.099977,0.012199,-0.141138,0.056129,-0.133727,0.025795,0.051448,
Q39631,-0.093106,-0.052002,0.020556,-0.020801,0.043180,-0.072321,0.000910,0.028156,0.176303,0.035396,...,-0.086840,-0.078992,-0.062712,0.051117,-0.184307,0.127637,-0.144866,0.044690,0.013498,
Q30,-0.115737,-0.179113,0.102739,-0.112469,-0.101853,-0.177516,0.015860,-0.092626,0.086708,0.057850,...,0.080511,-0.000085,-0.089968,-0.083486,-0.149992,-0.053031,-0.136071,-0.029001,0.174155,
Q60,-0.051036,-0.165637,0.132802,-0.089949,-0.146637,-0.142246,0.103853,-0.129651,0.096265,0.017288,...,0.078628,0.003711,-0.058953,-0.154067,-0.117159,-0.031614,-0.140451,0.001288,0.140350,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q42225228,-0.051346,-0.028947,-0.075870,0.017512,-0.066910,-0.027968,0.020820,0.089614,-0.024825,0.018390,...,-0.024313,0.034771,0.044909,0.043314,-0.020466,-0.009811,0.028233,-0.015396,0.063876,
Q54860678,-0.031620,-0.041283,-0.016871,0.044251,-0.022709,0.014341,0.034043,0.018170,0.002922,-0.024684,...,0.033638,0.070052,0.043695,-0.060374,0.036932,-0.014621,0.048494,-0.029104,0.057286,
Q54860790,0.034682,-0.009413,-0.024317,0.073895,0.028052,0.028039,0.039260,0.017398,0.017743,-0.007708,...,0.034473,0.035736,0.008329,-0.049981,-0.025212,-0.018404,0.004110,0.013771,-0.008027,
Q54862508,-0.052323,-0.078029,-0.060925,-0.052536,0.006802,-0.070488,-0.081736,0.026385,-0.037127,0.057764,...,0.045298,0.009842,-0.019821,-0.033952,-0.047436,0.062752,0.043236,0.032251,-0.001261,


In [31]:
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

Using device: cuda


In [32]:
df_items_features_train = df_news_train.copy()
df_items_features_val = df_news_val.copy()

def create_df_items_features(df_news):
    df_items_features = df_news.copy()

    # Combine title and abstract
    texts = (df_news['title'].fillna('') + ' ' + df_news['abstract'].fillna('')).str.strip()

    # Create text embeddings
    embeddings = model.encode(texts.tolist(), show_progress_bar=True, normalize_embeddings=True, batch_size=512, device=device)
    df_items_features['text_embedding'] = embeddings.tolist()

    # Make the subcategory an int (this will be used as learnable embeddings during model training)
    df_items_features['subcategory'] = df_items_features['subcategory'].astype('category').cat.codes

    # Keep only relevant columns
    df_items_features = df_items_features[['news_id', 'subcategory', 'text_embedding']]

    return df_items_features

df_items_features_train = create_df_items_features(df_news_train)
df_items_features_val = create_df_items_features(df_news_val)

Batches:   0%|          | 0/101 [00:00<?, ?it/s]

Batches:   0%|          | 0/83 [00:00<?, ?it/s]

In [33]:
df_items_features_train

Unnamed: 0,news_id,subcategory,text_embedding
0,N55528,142,"[-0.026684677228331566, 0.08572900295257568, 0..."
1,N19639,259,"[0.028437595814466476, 0.03309028968214989, 0...."
2,N61837,191,"[-0.020724531263113022, 0.12053246051073074, 0..."
3,N53526,254,"[0.02258925884962082, 0.10759174078702927, 0.0..."
4,N38324,151,"[0.01198355108499527, 0.07133994996547699, 0.0..."
...,...,...,...
51277,N16909,257,"[0.025076350197196007, 0.009858817793428898, 0..."
51278,N47585,129,"[0.03800557181239128, 0.029407819733023643, -0..."
51279,N7482,156,"[0.06913388520479202, 0.07006345689296722, 0.0..."
51280,N34418,220,"[-0.029744597151875496, 0.014320609159767628, ..."


## User Features
In `behaviors.tsv` we have the following columns: Impressions ID, User ID, Time, History (ordered clicked news IDs separated by space), Impressions (the list of news IDs shown to the user with click labels).

The way I understand it is that each row corresponds to a single session that starts at <Time> for <User ID>. During this session, the user was shown the list of news articles in <Impressions> and clicked on some of them (indicated by the label 1). The <History> column contains the news articles that the user had clicked on in the past.

These impressions are those made by randomly selected users during the 5th week of the MIND dataset collection period. The history contains the clicked news articles from the first 4 weeks. This means that a user will have the same history for all impressions in week 5 even if in theory, their history technically evolves each time they read an article in week 5. To stay faithful to the dataset, we will not augment the history with clicks made in week 5.

Here is how we will create the user features:
- **History-based text embeddings with prior**: For each user, we will take the news articles in their history, get the text embeddings from the item features we created earlier, and average them to create a single text embedding representing the user's interests. To avoid issues with users who have very short histories or no histories, we will add a prior embedding which is the average embedding of all news articles in the dataset. This way, even if a user has only clicked on one article, their embedding will still contain some general information about the news domain. The more articles a user has in their history, the less influence the prior embedding will have. The formula is as follows:
\begin{align*}
\text{history\_embedding} = \frac{\lambda \cdot \text{prior\_embedding} + \sum_{i=1}^{N} \text{article\_embedding}_i}{\lambda + N}
\end{align*}
where $N$ is the number of articles in the user's history.
- **History-length ($N$)**: The number of articles in the user's history scaled and normalized. This gives the model a sense of how active the user is. This can also help the model distinguish between users with very short histories and those with long histories but that read articles around the global prior.
- That's it (unfortunately we don't have more demographic information about the users in this dataset). Ideally, we would want things such as age, gender, location, reading patterns, etc, etc. However, as with many public datasets, this information is not available due to privacy concerns.

In [34]:
def load_behaviors(path):
    df_ = pd.read_csv(path + 'behaviors.tsv', sep='\t', header=None, names=['impression_id', 'user_id', 'time', 'history', 'impressions'])
    df_['time'] = pd.to_datetime(df_['time'])
    return df_

df_behaviors_train = load_behaviors(train_path)
df_behaviors_val = load_behaviors(val_path)
df_behaviors_train

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,2019-11-11 09:05:58,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,2019-11-12 18:11:30,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,2019-11-14 07:01:48,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,2019-11-11 05:28:05,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,2019-11-12 16:11:21,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...
...,...,...,...,...,...
156960,156961,U21593,2019-11-14 22:24:05,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...
156961,156962,U10123,2019-11-13 06:57:04,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...
156962,156963,U75630,2019-11-14 10:58:13,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...
156963,156964,U44625,2019-11-13 14:57:02,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...


In [35]:
# Inspect a random user's impressions
random_user_id = df_behaviors_train['user_id'].sample(1, random_state=42).values[0]
user_impression = df_behaviors_train[df_behaviors_train['user_id'] == random_user_id]
user_impression

Unnamed: 0,impression_id,user_id,time,history,impressions
11917,11918,U46778,2019-11-11 08:33:03,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N55689-1 N11830-0
24212,24213,U46778,2019-11-14 07:45:56,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N47576-0 N14436-0 N16439-0 N60550-1 N33831-0 N...
32716,32717,U46778,2019-11-09 18:15:44,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N6926-1 N46597-0 N12579-0 N14780-0 N38783-0 N4...
35012,35013,U46778,2019-11-11 10:31:58,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N11830-0 N52622-1
70566,70567,U46778,2019-11-09 08:16:42,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N41881-0 N6926-0 N7128-0 N3894-0 N38783-0 N547...
77856,77857,U46778,2019-11-12 08:42:49,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N3123-0 N7319-1
78833,78834,U46778,2019-11-12 18:57:31,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N60456-0 N19592-0 N35738-1 N3123-0 N12029-0 N3...
91163,91164,U46778,2019-11-12 10:26:05,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N58363-1 N3123-0
97060,97061,U46778,2019-11-11 11:33:39,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N26262-0 N11830-0 N46597-0 N53585-1
120448,120449,U46778,2019-11-10 07:58:28,N17157 N28081 N58641 N3926 N16290 N11894 N6185...,N36703-0 N37394-0 N6926-0 N61768-1 N58051-0 N2...


In [36]:
import numpy as np

def create_df_user_features(df_behaviors, df_items_features):
    df_user_features = pd.DataFrame()

    # Precompute prior embedding
    all_embeddings = np.vstack(df_items_features['text_embedding'].values)
    prior_embedding = np.mean(all_embeddings, axis=0)

    # Hash map of embeddings for faster lookup
    item_embedding_map = dict(zip(df_items_features['news_id'], df_items_features['text_embedding']))

    def compute_user_embedding(history, lambda_prior=5.0):
        if pd.isna(history) or history.strip() == '':
            return prior_embedding
        history_ids = history.split()
        history_embeddings = [item_embedding_map[news_id] for news_id in history_ids if news_id in item_embedding_map]
        if len(history_embeddings) != len(history_ids):
            print(f"Warning: Some history IDs not found in item embeddings: {set(history_ids) - set(item_embedding_map.keys())}")
        N = len(history_embeddings)  # even if 0, the following formula works
        history_embedding = (lambda_prior * prior_embedding + np.sum(history_embeddings, axis=0)) / (lambda_prior + N)
        return history_embedding

    df_user_features['user_id'] = df_behaviors['user_id'].unique()
    df_user_features = df_user_features.merge(
        df_behaviors[['user_id', 'history']].drop_duplicates().set_index('user_id'),
        left_on='user_id', right_index=True
    )
    df_user_features['history_embedding'] = df_user_features['history'].apply(compute_user_embedding)

    # History length feature
    history_lengths = df_user_features['history'].apply(lambda x: 0 if pd.isna(x) or x.strip() == '' else len(x.split()))
    df_user_features['history_length'] = (history_lengths - history_lengths.mean()) / history_lengths.std()

    df_user_features = df_user_features[['user_id', 'history_embedding', 'history_length']]
    return df_user_features

df_user_features_train = create_df_user_features(df_behaviors_train, df_items_features_train)
df_user_features_val = create_df_user_features(df_behaviors_val, df_items_features_val)

In [37]:
df_user_features_train

Unnamed: 0,user_id,history_embedding,history_length
0,U13740,"[-0.00885110250146098, -0.003975465073649558, ...",-0.398364
1,U91836,"[0.0026773895667371794, 0.03790082620294767, -...",2.655943
2,U73700,"[0.017035521501647954, 0.01329099544714237, 0....",-0.105485
3,U34670,"[-0.013947716803638602, 0.007622674624621695, ...",-0.356524
4,U8125,"[0.0005637002984045846, 0.02244883284986818, 0...",-0.607563
...,...,...,...
49995,U6794,"[-0.010911250849057422, 0.012214820269029425, ...",-0.314684
49996,U23127,"[0.0011560585791971715, 0.01703648934666324, -...",0.187394
49997,U43157,"[-0.021493115675178247, 0.00383875014532624, -...",-0.440203
49998,U66493,"[0.002582997777208714, 0.001990724630653714, 0...",-0.231004


## Interactions
Here, we actually need to create 2 seperate interaction matrix files, one for each model.
### Matrix Factorization model
The first thing to note here is that the vanilla MF model doesn't differentiate between negative interactions (items that were shown but not clicked) and unknown interactions (items that were never shown). This is a limitation of the MF model since in reality, we would want to treat these 2 types of interactions differently. There are variations of MF that can handle this [[1]](http://yifanhu.net/PUB/cf.pdf). However, since the goal is to compare the Two-Tower model against the vanilla MF model, we will stick to the basic MF formulation here.

The second thing is that there might be multiple impressions for the same user and item pair. For example, a user might have been shown the same article multiple times during the week. It is also possible that the user clicked on the article one time but not the other times. However, for the MF model, we can only have a single interaction value for each user-item pair. To resolve this, we will define the interaction value as the ratio of clicks to impressions for that user-item pair. For example, if a user was shown an article 3 times and clicked on it once, the interaction value will be 1/3 = 0.33. If the user never clicked on it, the interaction value will be 0. If the user always clicked on it, the interaction value will be 1.

Finally, saving the interaction matrix in a dense format (i.e., a full matrix with all user-item pairs) would be very inefficient in terms of storage since most entries would be zero (sparse interactions). Therefore, we will save the interaction matrix in a sparse format where we only store the non-zero interactions.

### Two-Tower model
Interactions for the two-tower model are simply different data samples. This means that if a user was shown an article multiple times, each impression will be a separate data sample. The label will be 1 if the user clicked on the article, 0 if the user was shown the article but did not click on it and there will be no data sample for articles that were never shown to the user.

In [38]:
def get_ctr_labels(impressions_str):
    impressions = impressions_str.split()
    items_to_ctr = {}
    for imp in impressions:
        news_id, label = imp.split('-')
        label = int(label)
        if news_id not in items_to_ctr:
            items_to_ctr[news_id] = [0, 0]  # [clicks, impressions]
        items_to_ctr[news_id][1] += 1
        if label == 1:
            items_to_ctr[news_id][0] += 1

    items_to_ctr = [(news_id, clicks / imps) for news_id, (clicks, imps) in items_to_ctr.items() if clicks > 0]
    return items_to_ctr

def create_mf_interactions(df_behaviors):
    df_mf_interactions = df_behaviors[['user_id', 'impressions']].copy()
    df_mf_interactions = df_mf_interactions.groupby('user_id')['impressions'].apply(lambda x: ' '.join(x)).reset_index()
    df_mf_interactions['impressions'] = df_mf_interactions['impressions'].apply(get_ctr_labels)

    # make each user-item pair a separate row
    df_mf_interactions = df_mf_interactions.explode('impressions')
    df_mf_interactions['news_id'] = df_mf_interactions['impressions'].apply(lambda x: x[0])
    df_mf_interactions['label'] = df_mf_interactions['impressions'].apply(lambda x: x[1])
    df_mf_interactions = df_mf_interactions[['user_id', 'news_id', 'label']]

    return df_mf_interactions

df_mf_interactions_train = create_mf_interactions(df_behaviors_train)
df_mf_interactions_val = create_mf_interactions(df_behaviors_val)

df_mf_interactions_train

Unnamed: 0,user_id,news_id,label
0,U100,N7800,1.0
1,U1000,N29739,1.0
1,U1000,N7670,1.0
1,U1000,N53875,0.5
1,U1000,N58656,1.0
...,...,...,...
49998,U9997,N42459,1.0
49998,U9997,N24423,1.0
49999,U9999,N26706,1.0
49999,U9999,N23085,1.0


In [39]:
def create_two_tower_interactions(df_behaviors):
    df_two_tower_interactions = df_behaviors[['user_id', 'impressions']].copy()
    df_two_tower_interactions['impressions'] = df_two_tower_interactions['impressions'].apply(lambda x: [imp.split('-') for imp in x.split()])
    df_two_tower_interactions = df_two_tower_interactions.explode('impressions')
    df_two_tower_interactions['news_id'] = df_two_tower_interactions['impressions'].apply(lambda x: x[0])
    df_two_tower_interactions['label'] = df_two_tower_interactions['impressions'].apply(lambda x: int(x[1]))
    df_two_tower_interactions = df_two_tower_interactions[['user_id', 'news_id', 'label']]
    return df_two_tower_interactions

df_two_tower_interactions_train = create_two_tower_interactions(df_behaviors_train)
df_two_tower_interactions_val = create_two_tower_interactions(df_behaviors_val)
df_two_tower_interactions_train

Unnamed: 0,user_id,news_id,label
0,U13740,N55689,1
0,U13740,N35729,0
1,U91836,N20678,0
1,U91836,N39317,0
1,U91836,N58114,0
...,...,...,...
156963,U44625,N39317,0
156964,U64800,N61233,0
156964,U64800,N33828,1
156964,U64800,N19661,0


In [40]:
# Compute the imbalance ratio for each dataset
def compute_imbalance_ratio(df_, num_users, num_items):
    positive = (df_['label'] != 0).sum()
    negative = len(df_) - positive
    unknown = num_users * num_items - len(df_)
    return {'positive': positive, 'negative': negative, 'unknown': unknown,
            'positive_ratio': positive / (positive + negative + unknown),
            'negative_ratio': negative / (positive + negative + unknown),
            'unknown_ratio': unknown / (positive + negative + unknown)}

def display_imbalance_ratios(df_mf, df_two_tower, df_user_features, df_item_features):
    num_users = df_user_features.shape[0]
    num_items = df_item_features.shape[0]

    # Display imbalance ratios cleanly
    for model_name, df_ in [('MF', df_mf), ('Two-Tower', df_two_tower)]:
        imbalance = compute_imbalance_ratio(df_, num_users, num_items)
        print(f"{model_name} Interactions Imbalance:")
        print(f"  Positive: {imbalance['positive']} ({imbalance['positive_ratio']:.6f})")
        print(f"  Negative: {imbalance['negative']} ({imbalance['negative_ratio']:.6f})")
        print(f"  Unknown: {imbalance['unknown']} ({imbalance['unknown_ratio']:.6f})")

print("Training Set Imbalance Ratios:")
display_imbalance_ratios(df_mf_interactions_train, df_two_tower_interactions_train, df_user_features_train, df_items_features_train)
print("\nValidation Set Imbalance Ratios:")
display_imbalance_ratios(df_mf_interactions_val, df_two_tower_interactions_val, df_user_features_val, df_items_features_val)

Training Set Imbalance Ratios:
MF Interactions Imbalance:
  Positive: 234468 (0.000091)
  Negative: 0 (0.000000)
  Unknown: 2563865532 (0.999909)
Two-Tower Interactions Imbalance:
  Positive: 236344 (0.000092)
  Negative: 5607100 (0.002187)
  Unknown: 2558256556 (0.997721)

Validation Set Imbalance Ratios:
MF Interactions Imbalance:
  Positive: 110745 (0.000052)
  Negative: 0 (0.000000)
  Unknown: 2120689255 (0.999948)
Two-Tower Interactions Imbalance:
  Positive: 111383 (0.000053)
  Negative: 2629615 (0.001240)
  Unknown: 2118059002 (0.998708)


## Saving the processed data

In [41]:
def save_processed_data(base_path, df_items_features, df_user_features, df_mf_interactions, df_two_tower_interactions):
    os.makedirs(base_path, exist_ok=True)

    # Save item features
    df_items_features.to_csv(base_path + 'item_features.csv', index=False)

    # Save user features
    df_user_features.to_csv(base_path + 'user_features.csv', index=False)

    # Save MF interactions
    df_mf_interactions.to_csv(base_path + 'mf_interactions.csv', index=False)

    # Save Two-Tower interactions
    df_two_tower_interactions.to_csv(base_path + 'two_tower_interactions.csv', index=False)

save_processed_data('./data/small/train/processed/',
                    df_items_features_train,
                    df_user_features_train,
                    df_mf_interactions_train,
                    df_two_tower_interactions_train)
save_processed_data('./data/small/dev/processed/',
                    df_items_features_val,
                    df_user_features_val,
                    df_mf_interactions_val,
                    df_two_tower_interactions_val)