In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from src.main import data_normalization

## Loading datasets 

Runtime: roughly 3 minutes.

In [2]:
# Load the normalized data
data, embeddings = data_normalization()

# Extract relevant dataframes
news_df = data["news"]
behaviors_df = data["behaviors"]

behaviors_df.head()

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,1,U134050,11/15/2019 8:55:22 AM,"[N12246, N128820, N119226, N4065, N67770, N334...","[N91737-0, N30206-0, N54368-0, N117802-0, N181..."
1,2,U254959,11/15/2019 11:42:35 AM,"[N34011, N9375, N67397, N7936, N118985, N10945...","[N119999-0, N24958-0, N104054-0, N33901-0, N92..."
2,3,U499841,11/15/2019 9:08:21 AM,"[N63858, N26834, N6379, N85484, N15229, N65119...","[N18190-0, N89764-0, N91737-0, N54368-0, N4997..."
3,4,U107107,11/15/2019 5:50:31 AM,"[N12959, N8085, N18389, N3758, N9740, N90543, ...","[N122944-1, N18190-0, N55801-0, N59297-0, N128..."
4,5,U492344,11/15/2019 5:02:25 AM,"[N109183, N48453, N85005, N45706, N98923, N460...","[N64785-0, N82503-0, N32993-0, N122944-0, N291..."


The mistake i did last time was to assume "Clicked" and "News ID" was already there - I assume that i planned to make them, but wrote the draft and forgot to actually do it. 

"Impressions" will now be split on "-" into "News ID" and "Clicked". Then "User ID" will be added through the old dataset.

Runtime: 20 minutes - 1 hour

In [3]:
# Ensure required columns exist
if "Impressions" in behaviors_df.columns:
    # Split "Impressions" to extract News ID and Clicked values
    impressions_expanded = behaviors_df["Impressions"].astype(str).str.split(" ", expand=True).stack().reset_index(level=1, drop=True)
    impressions_df = impressions_expanded.astype(str).str.split("-", expand=True).rename(columns={0: "News ID", 1: "Clicked"})

    # Clean the "Clicked" column by stripping unwanted characters
    impressions_df["Clicked"] = impressions_df["Clicked"].str.replace(r"[^\d]", "", regex=True).astype(int)

    # Convert "clicked" to type int
    impressions_df["Clicked"] = impressions_df["Clicked"].astype(int)

    # Removing the ' that appears in the beginning of every News ID
    impressions_df["News ID"] = impressions_df["News ID"].str.strip("'")  # Remove leading quotes

    # Add back User ID from the original dataframe
    impressions_df = impressions_df.join(behaviors_df["User ID"], how="left")
    
    print(impressions_df.head())

else:
    print("Impressions is missing.")

    News ID  Clicked  User ID
0  ['N91737        0  U134050
0    N30206        0  U134050
0    N54368        0  U134050
0   N117802        0  U134050
0    N18190        0  U134050


In [4]:
print(f"Unique Users: {impressions_df['User ID'].nunique()}")
print(f"Unique News: {impressions_df['News ID'].nunique()}")

Unique Users: 255990
Unique News: 9988


There are alot of unique values, making the matrix incredibly large. I will make a sparse matrix instead.

## Creating matrix and applying filters

In [5]:
from scipy.sparse import coo_matrix

In [6]:
print(impressions_df.head())
print(impressions_df.dtypes)
print(impressions_df["User ID"].unique()[:10])  # Show first 10 unique User IDs
print(impressions_df["News ID"].unique()[:10])  # Show first 10 unique News IDs


    News ID  Clicked  User ID
0  ['N91737        0  U134050
0    N30206        0  U134050
0    N54368        0  U134050
0   N117802        0  U134050
0    N18190        0  U134050
News ID    object
Clicked     int32
User ID    object
dtype: object
['U134050' 'U254959' 'U499841' 'U107107' 'U492344' 'U657892' 'U441763'
 'U170615' 'U114779' 'U224919']
["['N91737" 'N30206' 'N54368' 'N117802' 'N18190' 'N122944' 'N69938'
 'N18356' 'N123209' 'N46894']


In [7]:
# Convert User ID and News ID to categorical codes (numeric indices)
impressions_df["User Index"] = impressions_df["User ID"].astype("category").cat.codes
impressions_df["News Index"] = impressions_df["News ID"].astype("category").cat.codes

In [8]:
print(impressions_df.head())

    News ID  Clicked  User ID  User Index  News Index
0  ['N91737        0  U134050       11254        9785
0    N30206        0  U134050       11254        2905
0    N54368        0  U134050       11254        4371
0   N117802        0  U134050       11254        1057
0    N18190        0  U134050       11254        2122


Then a User-Item interaction matrix will be made - first it will check for the necessary columns. A sparse matrix only stores entries where an item was interacted with, ignoring all those where the user didn't clikc on the article.

Interaction matrices: these matrices are tables that represent user behaviour - what each user has interacted with.

In [9]:
# Create User-Item Interaction matrix
if "User Index" in impressions_df.columns and "News Index" in impressions_df.columns:

    # Ensure there are no duplicate User ID and News ID pair
    impressions_df = impressions_df.groupby(["User Index", "News Index"])["Clicked"].max().reset_index()
    
    # Create sparse interaction matrix
    interaction_matrix = coo_matrix((impressions_df["Clicked"], (impressions_df["User Index"], impressions_df["News Index"])))

    print(f"Sparse Matrix Shape: {interaction_matrix.shape}")
    print(f"Non-zero interactions: {interaction_matrix.nnz}")  # Check sparsity

else:
    interaction_matrix = pd.DataFrame()
    print("Missing columns.")

Sparse Matrix Shape: (255990, 9988)
Non-zero interactions: 12722281


Collaborative filtering is applied, given that the matrix isn't empty. This uses SVD.

Collaborative filtering: finds similar users based on their history, and recommends similar items to similar users.

In [11]:
# Apply collaborative filtering
if interaction_matrix.nnz > 0:
    U, sigma, Vt = svds(interaction_matrix.astype(float), k=50) # Reduce dimensions
    sigma = np.diag(sigma)
    predicted_ratings = np.dot(np.dot(U, sigma), Vt)
    predicted_df = pd.DataFrame(predicted_ratings, index=interaction_matrix.index, columns=interaction_matrix.columns)
    print(predicted_df.head())

else:
    predicted_df = pd.DataFrame()

MemoryError: Unable to allocate 19.0 GiB for an array with shape (255990, 9988) and data type float64

Then, content-based filtering is applied. Here we will be using TF-IDF (cosine similarity), but this may be changed in the future - perhaps into a word embedding method.

Content-based filtering: recommends items by analyzing the attributes of items, and matching them with a user's preferences/past interactions. Measures similarity between items.

In [None]:
# Apply content-based filtering
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(news_df["Title"] + " " + news_df["Abstract"])
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Store results
news_similarity_df = pd.DataFrame(similarity_matrix, index=news_df["News ID"], columns=news_df["News ID"])

In [None]:
# Hybrid approach, combining scores
def hybrid_recommendations(user_id, top_n=5, alpha=0.5):
    if user_id not in predicted_df.index:
        return [] # No recommendations for uknown/new users
    
    # Normalize collaborative scores
    user_ratings = predicted_df.loc[user_id].copy()
    user_ratings = (user_ratings - user_ratings.min()) / (user_ratings.max() - user_ratings.min()) # Normalize

    # Compute content-based scores
    content_scores = news_similarity_df[user_ratings.index].dot(user_ratings.fillna(0))
    content_scores = (content_scores - content_scores.min()) / (content_scores.max() - content_scores.min()) # Normalize

    # Combination of both scores, using weights
    final_scores = alpha * user_ratings + (1 - alpha) * content_scores
    return final_scores.nlargest(top_n).index.tolist()

In [None]:
# Testing the recommender
if not interaction_matrix.empty:
    user_id = interaction_matrix.index[0] # Pick a sample user
    recommendations = hybrid_recommendations(user_id)
    print("Recommended articles: ", recommendations)

else:
    print("No valid interactions.")