In [19]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import load_npz
from tqdm import tqdm

In [20]:

folder_name = "data"

# Build the paths with os.path.join instead of hard-coded "\\" separators so
# the notebook resolves the files on Linux/macOS as well as on Windows.
file1 = os.path.join(folder_name, "articles.parquet")
file2 = os.path.join(folder_name, "train", "behaviors.parquet")
file3 = os.path.join(folder_name, "train", "history.parquet")
file4 = os.path.join(folder_name, "validation", "behaviors.parquet")
file5 = os.path.join(folder_name, "validation", "history.parquet")


#   -->      Datasets     <--  #

# Articles
Articles = pd.read_parquet(file1)

# Train split (read from the "train" folder; the variables keep their `_test`
# suffix so that any cell referring to these names keeps working)
Bhv_test = pd.read_parquet(file2)
Hstr_test = pd.read_parquet(file3)

# Validation split
Bhv_val = pd.read_parquet(file4)
Hstr_val = pd.read_parquet(file5)

# Data exploration

In [None]:
import pandas as pd
import numpy as np

def data_exploration(data_folder):
    """Print summary statistics for the train and validation tables.

    Reads ``train/behaviors.parquet``, ``train/history.parquet``,
    ``validation/behaviors.parquet`` and ``validation/history.parquet`` below
    ``data_folder`` and prints, for each table, ``describe()`` output and NaN
    counts for scroll percentage and read time.  For the behaviors tables it
    also prints how many rows have their ``article_id`` inside
    ``article_ids_clicked``.

    Args:
        data_folder: Directory that contains the ``train`` and ``validation``
            sub-folders.

    Returns:
        None.  Everything is reported through ``print``.
    """
    import ast
    import os

    def safe_eval(x):
        """Parse a string holding a Python literal; return non-strings as-is.

        ``ast.literal_eval`` is used instead of ``eval`` so that text coming
        from the data files can never execute code.  Unparseable strings give
        an empty list.
        """
        if not isinstance(x, str):
            return x
        try:
            return ast.literal_eval(x)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []

    def load(split, table):
        """Read ``<data_folder>/<split>/<table>.parquet`` (portable separators)."""
        return pd.read_parquet(os.path.join(data_folder, split, f"{table}.parquet"))

    def report_column(title, label, values):
        """Print the describe() summary and the NaN count of one numeric series."""
        print(f"\n{title}:")
        print(values.describe())
        print(f"NaN in {label}: {values.isna().sum()}")

    def is_clicked(article_id, raw_clicked):
        """True when ``article_id`` is present and listed in the clicked ids."""
        if pd.isna(article_id):
            return False
        clicked = safe_eval(raw_clicked)
        # A missing / scalar value cannot contain the article; treat it as "no".
        if not pd.api.types.is_list_like(clicked):
            return False
        return bool(article_id in clicked)

    def report_behaviors(header, bhv):
        """Print the statistics block of one behaviors table."""
        print(header)
        report_column("Scroll Percentage Stats", "scroll_percentage", bhv['scroll_percentage'])
        report_column("Read Time Stats", "read_time", bhv['read_time'])
        matches = sum(
            is_clicked(article_id, clicked)
            for article_id, clicked in zip(bhv['article_id'], bhv['article_ids_clicked'])
        )
        print(f"\nRows where article_id matches clicked: {matches} / {len(bhv)}")

    def report_history(header, hstr):
        """Print the statistics block of one history table.

        Each list column is exploded on its own and summarised directly, so no
        index alignment between separately exploded columns is needed.
        """
        print(header)
        for title, label, column in (
            ("Exploded Scroll Percentage Stats", "scroll_percentage", "scroll_percentage_fixed"),
            ("Exploded Read Time Stats", "read_time", "read_time_fixed"),
        ):
            exploded = pd.to_numeric(hstr[column].apply(safe_eval).explode(), errors='coerce')
            report_column(title, label, exploded)

    # Load every table up front so a missing file fails before anything is printed.
    Bhv_test = load("train", "behaviors")
    Hstr_test = load("train", "history")
    Bhv_val = load("validation", "behaviors")
    Hstr_val = load("validation", "history")

    report_behaviors("=== Bhv_test Stats ===", Bhv_test)
    report_history("\n=== Hstr_test Stats ===", Hstr_test)
    report_behaviors("\n=== Bhv_val Stats ===", Bhv_val)
    report_history("\n=== Hstr_val Stats ===", Hstr_val)


#data_exploration('data')

=== Bhv_test Stats ===

Scroll Percentage Stats:
count    69098.000000
mean        99.566208
std          4.944662
min          7.000000
25%        100.000000
50%        100.000000
75%        100.000000
max        100.000000
Name: scroll_percentage, dtype: float64
NaN in scroll_percentage: 163789

Read Time Stats:
count    232887.000000
mean         43.901806
std          90.299965
min           0.000000
25%          11.000000
50%          21.000000
75%          45.000000
max        1799.000000
Name: read_time, dtype: float64
NaN in read_time: 0

Rows where article_id matches clicked: 368 / 232887

=== Hstr_test Stats ===

Exploded Scroll Percentage Stats:
count    2.171171e+06
mean     6.822356e+01
std      3.231243e+01
min      0.000000e+00
25%      3.500000e+01
50%      7.700000e+01
75%      1.000000e+02
max      1.000000e+02
Name: scroll_percentage, dtype: float64
NaN in scroll_percentage: 255076

Exploded Read Time Stats:
count    2.426247e+06
mean     6.137053e+01
std      1.6543

# Constructing the sparse matrix

In [22]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from tqdm import tqdm

def create_sparse(data_folder, output_file='user_item_likes_matrix_all.npz'):
    """Infer user "likes" from reading behaviour and save them as a sparse matrix.

    The train and validation behaviors/history tables below ``data_folder``
    are joined with ``articles.parquet``, combined into one interaction table,
    and every interaction is flagged as liked or not from read time, scroll
    percentage (scaled by body length) and whether the article appears in the
    row's clicked list.  Per-user thresholds are the 25th percentile of that
    user's own read times and adjusted scroll values.

    Args:
        data_folder: Directory holding ``articles.parquet`` and the ``train``
            and ``validation`` sub-folders.
        output_file: Path the matrix is written to with ``scipy.sparse.save_npz``.

    Returns:
        Tuple ``(user_item_matrix, user_to_idx, article_to_idx)``: a CSR matrix
        of shape (users, articles) with 1 for each liked pair, plus the dicts
        mapping user ids to row indices and article ids to column indices.
        Indices follow first appearance in the combined interaction table.
    """
    import ast
    import os
    # save_npz is not among the notebook's imports; bring it in here so the
    # save step does not raise NameError on a fresh kernel.
    from scipy.sparse import save_npz

    article_cols = ['article_id', 'body', 'article_type']
    history_cols = ['article_id_fixed', 'read_time_fixed', 'scroll_percentage_fixed']

    def safe_eval(x):
        """Parse a string holding a Python literal; return non-strings as-is.

        ``ast.literal_eval`` replaces ``eval`` so data-file text can never run
        code.  Unparseable strings give an empty list.
        """
        if not isinstance(x, str):
            return x
        try:
            return ast.literal_eval(x)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return []

    def load(*parts):
        """Read a parquet file below ``data_folder`` using portable separators."""
        return pd.read_parquet(os.path.join(data_folder, *parts))

    def prepare_behaviors(bhv, articles):
        """Drop rows without an article and attach body / article type."""
        bhv = bhv.dropna(subset=['article_id'])
        # assign() builds a new frame, avoiding chained-assignment warnings.
        bhv = bhv.assign(article_id=bhv['article_id'].astype(int))
        return bhv.merge(articles[article_cols], on='article_id', how='left')

    def prepare_history(hstr, articles):
        """Turn one-row-per-user list columns into one row per read article."""
        parsed = hstr.assign(**{col: hstr[col].apply(safe_eval) for col in history_cols})
        # Exploding the three list columns together keeps each article paired
        # with its own read time and scroll value.
        exploded = parsed.explode(history_cols, ignore_index=True)
        exploded = exploded.assign(
            read_time=pd.to_numeric(exploded['read_time_fixed'], errors='coerce'),
            scroll_percentage=pd.to_numeric(exploded['scroll_percentage_fixed'], errors='coerce'),
            article_id=pd.to_numeric(exploded['article_id_fixed'], errors='coerce'),
        )
        exploded = exploded.dropna(subset=['article_id'])
        exploded = exploded.assign(article_id=exploded['article_id'].astype(int))
        return exploded.merge(articles[article_cols], on='article_id', how='left')

    def is_clicked(article_id, raw_clicked):
        """True when ``article_id`` is listed in the row's clicked ids.

        Accepts string-encoded lists as well as real lists/arrays; missing
        values (rows that come from the history tables) count as not clicked.
        """
        clicked = safe_eval(raw_clicked)
        if not pd.api.types.is_list_like(clicked):
            return False
        return bool(article_id in clicked)

    # Load datasets (the "_test" names refer to the train split)
    Articles = load("articles.parquet")
    Bhv_test = load("train", "behaviors.parquet")
    Hstr_test = load("train", "history.parquet")
    Bhv_val = load("validation", "behaviors.parquet")
    Hstr_val = load("validation", "history.parquet")

    merged_test = prepare_behaviors(Bhv_test, Articles)
    Hstr_test_merged = prepare_history(Hstr_test, Articles)
    merged_val = prepare_behaviors(Bhv_val, Articles)
    Hstr_val_merged = prepare_history(Hstr_val, Articles)

    # Combine all data; the concat order fixes the row/column order of the matrix.
    combined_df = pd.concat([merged_test, Hstr_test_merged, merged_val, Hstr_val_merged], ignore_index=True)
    combined_df['body_length'] = combined_df['body'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

    # Low-text articles: videos, or bodies shorter than 500 characters.
    low_text_types = ['video']
    combined_df['is_low_text'] = (
        combined_df['article_type'].isin(low_text_types) | (combined_df['body_length'] < 500)
    )
    combined_df['scroll_percentage'] = pd.to_numeric(combined_df['scroll_percentage'], errors='coerce').fillna(0)
    combined_df['read_time'] = pd.to_numeric(combined_df['read_time'], errors='coerce').fillna(0)

    # Scroll percentage per 1000 characters of body; left unscaled for empty bodies.
    epsilon = 1e-6
    combined_df['adjusted_scroll'] = np.where(
        combined_df['body_length'] > epsilon,
        combined_df['scroll_percentage'] / ((combined_df['body_length'] + epsilon) / 1000),
        combined_df['scroll_percentage']
    )

    # Per-user thresholds: 25th percentile of the user's own values.
    user_stats = combined_df.groupby('user_id').agg({
        'adjusted_scroll': lambda x: np.percentile(x.dropna(), 25),
        'read_time': lambda x: np.percentile(x.dropna(), 25)
    }).rename(columns={'adjusted_scroll': 'scroll_threshold', 'read_time': 'read_threshold'})

    user_stats['scroll_threshold'] = user_stats['scroll_threshold'].fillna(0.05)
    user_stats['read_threshold'] = user_stats['read_threshold'].fillna(5)

    combined_df = combined_df.merge(user_stats, on='user_id', how='left')

    # Infer "Likes"
    if 'article_ids_clicked' in combined_df.columns:
        clicked = np.fromiter(
            (is_clicked(article_id, raw)
             for article_id, raw in zip(combined_df['article_id'], combined_df['article_ids_clicked'])),
            dtype=bool,
            count=len(combined_df),
        )
    else:
        clicked = np.zeros(len(combined_df), dtype=bool)

    scroll_ok = (combined_df['adjusted_scroll'] > combined_df['scroll_threshold']).to_numpy()
    read_ok = (combined_df['read_time'] > combined_df['read_threshold']).to_numpy()
    low_text = combined_df['is_low_text'].to_numpy(dtype=bool)
    long_text = (combined_df['body_length'] > 2000).to_numpy()

    # Same three-way rule as a per-row if/elif/else, evaluated column-wise:
    #   low text  -> read_ok or clicked
    #   long body -> read_ok and (scroll_ok or clicked)
    #   otherwise -> read_ok or (scroll_ok and clicked)
    combined_df['liked'] = np.select(
        [low_text, long_text],
        [read_ok | clicked, read_ok & (scroll_ok | clicked)],
        default=read_ok | (scroll_ok & clicked),
    )

    # Debug stats
    liked_rows = combined_df[combined_df['liked']]
    print(f"Total interactions: {len(combined_df)}")
    print(f"Inferred likes: {combined_df['liked'].sum()}")
    print(f"Users with likes: {liked_rows['user_id'].nunique()}")
    print(f"Articles with likes: {liked_rows['article_id'].nunique()}")

    # Build User-Item Sparse Matrix
    all_users = combined_df['user_id'].unique()
    all_articles = combined_df['article_id'].unique()
    n_users = len(all_users)
    n_articles = len(all_articles)

    user_to_idx = {uid: i for i, uid in enumerate(all_users)}
    article_to_idx = {aid: j for j, aid in enumerate(all_articles)}

    liked_interactions = liked_rows[['user_id', 'article_id']].drop_duplicates()
    rows = [user_to_idx[uid] for uid in liked_interactions['user_id']]
    cols = [article_to_idx[aid] for aid in liked_interactions['article_id']]
    data = np.ones(len(liked_interactions), dtype=np.uint8)

    user_item_matrix = csr_matrix((data, (rows, cols)), shape=(n_users, n_articles))

    # Output and Save
    print(f"Matrix shape: {user_item_matrix.shape} (users: {n_users}, articles: {n_articles})")
    print(f"Number of non-zero entries: {user_item_matrix.nnz}")
    if n_users and n_articles:
        # NOTE: the printed value is nnz / total cells, i.e. the fraction of
        # filled cells (density); the label is kept for output compatibility.
        print(f"Sparsity: {user_item_matrix.nnz / (n_users * n_articles):.6f}")

    save_npz(output_file, user_item_matrix)

    # Example for first user (skipped when there are no interactions at all)
    if n_users:
        user_idx = user_to_idx[all_users[0]]
        liked_articles = user_item_matrix[user_idx].nonzero()[1]
        print(f"Articles liked by user {all_users[0]}: {[all_articles[idx] for idx in liked_articles]}")

    # Return matrix and mappings
    return user_item_matrix, user_to_idx, article_to_idx

# Example usage

#user_item_matrix, user_to_idx, article_to_idx = create_sparse('data', 'user_item_likes_matrix_all.npz')

Total interactions: 4772143
Inferred likes: 3032215
Users with likes: 18827
Articles with likes: 10071
Matrix shape: (18827, 12068) (users: 18827, articles: 12068)
Number of non-zero entries: 1892965
Sparsity: 0.008332
Articles liked by user 151570: [np.int64(9778682), np.int64(9777492), np.int64(9778623), np.int64(9778718), np.int64(9773282), np.int64(9779269), np.int64(9779242), np.int64(9779227), np.int64(9774096), np.int64(9772830), np.int64(9772442), np.int64(9779263), np.int64(9780096), np.int64(9773297), np.int64(9776337), np.int64(9777307), np.int64(9778139), np.int64(9778168), np.int64(9773700), np.int64(9773486), np.int64(9773279), np.int64(9773887), np.int64(9774568), np.int64(9774527), np.int64(9773045), np.int64(9772502), np.int64(9772750), np.int64(9776259), np.int64(9775964), np.int64(9772545), np.int64(9772380), np.int64(9772227), np.int64(9772882), np.int64(9772099), np.int64(9774074), np.int64(9772453), np.int64(9772601), np.int64(9772517), np.int64(9773210), np.int64

# Explore the saved sparse matrix (loaded back from the .npz file)

In [23]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz

def Sparse_exploration(npz_file_path, data_folder):
    """Print like statistics for a saved user-item matrix.

    Loads the matrix from ``npz_file_path`` and the raw train/validation
    behaviors and history tables from ``data_folder``, then prints: users and
    articles without any like, the (up to) 10 users with the most likes, the
    10 with the fewest, and up to 10 randomly chosen users, each with the
    number of distinct articles the user interacted with.

    Matrix rows are matched to user ids by rebuilding the user order from the
    raw tables (order of first appearance after dropping rows without an
    article id).  NOTE(review): this assumes the matrix was built from the
    same files in the same order — confirm against the code that wrote it.

    Args:
        npz_file_path: Path of the ``.npz`` file readable by ``load_npz``.
        data_folder: Directory holding the ``train`` and ``validation`` folders.

    Returns:
        None.  Everything is reported through ``print``.

    Raises:
        ValueError: If the number of users found in the raw tables differs
            from the number of matrix rows, so rows cannot be attributed.
    """
    import os

    def load(split, table):
        """Read ``<data_folder>/<split>/<table>.parquet`` (portable separators)."""
        return pd.read_parquet(os.path.join(data_folder, split, f"{table}.parquet"))

    def history_pairs(hstr):
        """One (user_id, article_id) row per article in the user's history."""
        return (
            hstr.explode('article_id_fixed')[['user_id', 'article_id_fixed']]
            .rename(columns={'article_id_fixed': 'article_id'})
        )

    # Load the sparse matrix
    loaded_matrix = load_npz(npz_file_path)
    n_users, n_articles = loaded_matrix.shape

    # 1. Number of users without a liked article
    likes_per_user = np.asarray(loaded_matrix.sum(axis=1)).ravel()  # Likes per user
    users_without_likes = np.sum(likes_per_user == 0)
    print(f"Number of users without a liked article: {users_without_likes}")

    # 2. Number of articles without a like
    likes_per_article = np.asarray(loaded_matrix.sum(axis=0)).ravel()  # Likes per article
    articles_without_likes = np.sum(likes_per_article == 0)
    print(f"Number of articles without a like: {articles_without_likes}")

    # Load original data to get all interactions and user mappings
    Bhv_train = load("train", "behaviors")
    Hstr_train = load("train", "history")
    Bhv_val = load("validation", "behaviors")
    Hstr_val = load("validation", "history")

    # Combine all interactions (before like inference)
    combined_df = pd.concat([
        Bhv_train[['user_id', 'article_id']],
        history_pairs(Hstr_train),
        Bhv_val[['user_id', 'article_id']],
        history_pairs(Hstr_val),
    ])

    # Coerce ids to numbers, drop rows without an article, store ids as int.
    combined_df['article_id'] = pd.to_numeric(combined_df['article_id'], errors='coerce')
    combined_df = combined_df.dropna(subset=['article_id'])
    combined_df['article_id'] = combined_df['article_id'].astype(int)

    # Users in order of first appearance; position i is taken to be matrix row i.
    all_users = combined_df['user_id'].unique()
    if len(all_users) != n_users:
        raise ValueError(
            f"User count mismatch: matrix has {n_users} rows but the raw data "
            f"contains {len(all_users)} users; rows cannot be mapped to user ids."
        )

    # Calculate total unique articles interacted with per user
    user_interactions = combined_df.groupby('user_id')['article_id'].nunique()

    def report(title, indices):
        """Print one line per selected matrix row under the given title."""
        lines = []
        for idx in indices:
            user_id = all_users[idx]
            lines.append(
                f"User {user_id}: {likes_per_user[idx]} liked articles, "
                f"{user_interactions.loc[user_id]} total articles interacted with"
            )
        print(f"\n{title}")
        for line in lines:
            print(line)

    ascending = np.argsort(likes_per_user)

    # 3. Top 10 users with most likes (descending)
    report("Top 10 users with most liked articles and total interactions:", ascending[::-1][:10])

    # 4. 10 users with the smallest number of likes (ascending; zero-like users included)
    report("10 users with smallest number of liked articles and total interactions:", ascending[:10])

    # 5. Up to 10 random users; the sample size is capped so small matrices do not raise
    random_indices = np.random.choice(n_users, min(10, n_users), replace=False)
    report("10 random users with liked articles and total interactions:", random_indices)

    # Verify totals
    print(f"\nTotal users: {n_users}")
    print(f"Total articles: {n_articles}")

# Example usage: inspect the matrix file stored under create_sparse's default
# output name, using the raw parquet tables below the 'data' folder.
Sparse_exploration('user_item_likes_matrix_all.npz', 'data')

Number of users without a liked article: 0
Number of articles without a like: 1997

Top 10 users with most liked articles and total interactions:
User 1214299: 1117 liked articles, 1636 total articles interacted with
User 701722: 906 liked articles, 1327 total articles interacted with
User 892045: 820 liked articles, 1217 total articles interacted with
User 2082097: 809 liked articles, 1135 total articles interacted with
User 527084: 801 liked articles, 1263 total articles interacted with
User 26391: 786 liked articles, 1078 total articles interacted with
User 1377404: 774 liked articles, 1192 total articles interacted with
User 968619: 773 liked articles, 1148 total articles interacted with
User 151570: 765 liked articles, 1083 total articles interacted with
User 1021601: 757 liked articles, 1004 total articles interacted with

10 users with smallest number of liked articles and total interactions:
User 1163012: 1 liked articles, 1 total articles interacted with
User 1596067: 1 liked 