#IV: Making the Graph

#### Library Imports

In [1]:
!pip install -q ipython-sql sqlalchemy prettytable==3.8.0 networkx lightfm

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for lightfm (setup.py) ... [?25l[?25hdone


In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import json
from lightfm import LightFM
from lightfm.data import Dataset
from google.colab import files

#### Upload files (udf1 and rdf1)

In [3]:
print("Upload udf1.csv and rdf1.csv")
uploaded = files.upload()
udf1 = pd.read_csv('udf1.csv')
rdf1 = pd.read_csv('rdf1.csv')

Upload udf1.csv and rdf1.csv


Saving udf1.csv to udf1.csv
Saving rdf1.csv to rdf1.csv


### Data Preprocessing

In [4]:
# Clean udf1: Handle missing values
udf1 = udf1.fillna({
    'age': udf1['age'].median(),
    'gender': 'unknown',
    'city': 'unknown',
    'most_viewed_sport': 'unknown',
    'most_viewed_entity': 'unknown',
    'TSS': 0,
    'SLI': 0
})

# Normalize TSS and SLI
udf1['TSS_norm'] = (udf1['TSS'] - udf1['TSS'].min()) / (udf1['TSS'].max() - udf1['TSS'].min() + 1e-6)
udf1['SLI_norm'] = (udf1['SLI'] - udf1['SLI'].min()) / (udf1['SLI'].max() - udf1['SLI'].min() + 1e-6)

In [5]:
# Parse dictionary columns
udf1['sports_dict'] = udf1['sports_with_visit_count_dict'].apply(
    lambda x: json.loads(x.replace("'", '"')) if isinstance(x, str) else x
)
udf1['entity_dict'] = udf1['entity_with_visit_count_dict'].apply(
    lambda x: json.loads(x.replace("'", '"')) if isinstance(x, str) else x
)

In [6]:
# Clean rdf1: Remove duplicates and filter related posts
rdf1 = rdf1[rdf1['source'].str.startswith('related_post')].drop_duplicates(
    subset=['ppid', 'slug', 'timestamp']
)

# Handle missing values in rdf1
rdf1 = rdf1.fillna({'sport_from_slug': 'unknown', 'persons': '', 'orgs': ''})

# Debug: Check data shapes and duplicates
print(f"udf1 shape: {udf1.shape}")
print(f"rdf1 shape after deduplication: {rdf1.shape}")
print(f"Duplicate rows in rdf1: {rdf1.duplicated(subset=['ppid', 'slug', 'timestamp']).sum()}")

udf1 shape: (10000, 38)
rdf1 shape after deduplication: (2184, 12)
Duplicate rows in rdf1: 0


### Graph Building

In [7]:
# Initialize directed graph
G = nx.DiGraph()

# Add user nodes
for _, row in udf1.iterrows():
    G.add_node(row['ppid'], type='user', age=row['age'], gender=row['gender'],
               visit_count=row['visit_count'], timeonpage=row['timeonpage'],
               most_viewed_sport=row['most_viewed_sport'],
               most_viewed_entity=row['most_viewed_entity'],
               TSS=row['TSS'], SLI=row['SLI'])

#### Adding article nodes

In [8]:
# Add article nodes
for _, row in rdf1.iterrows():
    G.add_node(row['slug'], type='article', sport=row['sport_from_slug'],
               persons=row['persons'], orgs=row['orgs'])

# Add sport nodes
sports = set()
for d in udf1['sports_dict']:
    sports.update(d.keys())
sports.update(rdf1['sport_from_slug'].dropna())
for sport in sports:
    G.add_node(sport, type='sport')

# Add entity nodes
entities = set()
for d in udf1['entity_dict']:
    entities.update(d.keys())
for p in rdf1['persons'].dropna().str.split(',').explode().str.strip():
    entities.add(p)
for entity in entities:
    G.add_node(entity, type='entity')

#### Adding edges

In [9]:
# Add edges: User -> Article (CLICKS)
click_counts = rdf1.groupby(['ppid', 'slug']).size().reset_index(name='click_count')
for _, row in click_counts.iterrows():
    if row['ppid'] in G.nodes and row['slug'] in G.nodes:
        user_tss = udf1[udf1['ppid'] == row['ppid']]['TSS'].iloc[0]
        user_sli = udf1[udf1['ppid'] == row['ppid']]['SLI'].iloc[0]
        weight = row['click_count'] * (user_tss + user_sli) / 2
        G.add_edge(row['ppid'], row['slug'], type='clicks', weight=max(weight, 1))

# Add edges: User -> Sport (INTERESTED_IN)
for _, row in udf1.iterrows():
    for sport, count in row['sports_dict'].items():
        if sport in G.nodes:
            G.add_edge(row['ppid'], sport, type='interested_in', weight=count)

In [10]:
# Add edges: User -> Entity (INTERESTED_IN)
for _, row in udf1.iterrows():
    for entity, count in row['entity_dict'].items():
        if entity in G.nodes:
            G.add_edge(row['ppid'], entity, type='interested_in', weight=count)

# Add edges: Article -> Sport (BELONGS_TO)
for _, row in rdf1.iterrows():
    if row['sport_from_slug'] in G.nodes:
        G.add_edge(row['slug'], row['sport_from_slug'], type='belongs_to', weight=1)

# Add edges: Article -> Entity (FEATURES)
for _, row in rdf1.iterrows():
    if pd.notna(row['persons']):
        for person in row['persons'].split(','):
            person = person.strip()
            if person in G.nodes:
                G.add_edge(row['slug'], person, type='features', weight=1)

#### Mini Debugging

In [11]:
# Debug: Check graph size and connectivity
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
user_id = 'vtpks1740535964462a17db399a96d'
if user_id in G.nodes:
    print(f"Edges from user {user_id}: {list(G.out_edges(user_id, data=True))}")
else:
    print(f"User {user_id} not in graph")

Number of nodes: 10835
Number of edges: 52807
Edges from user vtpks1740535964462a17db399a96d: [('vtpks1740535964462a17db399a96d', 'nfl-active-news-after-ditching-mike-tomlins-steelers-justin-fields-celebrates-usd-forty-m-contract-in-style', {'type': 'clicks', 'weight': np.float64(1.4444444444444444)}), ('vtpks1740535964462a17db399a96d', 'nfl', {'type': 'interested_in', 'weight': 9}), ('vtpks1740535964462a17db399a96d', 'Others', {'type': 'interested_in', 'weight': 8}), ('vtpks1740535964462a17db399a96d', ' John Harbaugh', {'type': 'interested_in', 'weight': 1})]


#### Save the dataset

In [12]:
# Save edge list and node attributes
nx.write_edgelist(G, 'graph_edgelist.txt')
node_data = pd.DataFrame({node: attr for node, attr in G.nodes(data=True)}).T
node_data.to_csv('node_attributes.csv')

# V. Recommendation Algorithms

### 1. Graph Based Recommendation Algorithm -> Personalized PageRank Algorithm (PPR)

- Personalized Recommendations: PPR ranks articles by propagating importance from a specific user node, leveraging their clicks (```rdf1```) and preferences (```sports_with_visit_count_dict```, ```most_viewed_sport in udf1```) to suggest relevant articles for the Related Posts widget.

- Graph-Based Relevance: Utilizes the heterogeneous graph of users, articles, sports, and entities, with weighted edges (e.g., click counts, TSS/SLI-scaled weights) to capture user behavior and article relationships, ensuring robust recommendations.

- Efficient and Scalable: With a tuned damping factor (e.g., ```alpha=0.3```) and personalization boosts for user preferences, PPR efficiently handles the 10K-user dataset, avoiding zero scores and supporting real-time recommendations.

#### Generate recommendations

In [13]:
# Run PPR for recommendations
user_id = 'vtpks1740535964462a17db399a96d'
if user_id in G.nodes:
    personalization = {node: 1.0 if node == user_id else 0.0 for node in G.nodes}
    ppr_scores = nx.pagerank(G, alpha=0.3, personalization=personalization,
                             weight='weight', max_iter=100, tol=1e-6)
    article_scores = {node: score for node, score in ppr_scores.items()
                      if G.nodes[node]['type'] == 'article'}
    top_articles = sorted(article_scores.items(), key=lambda x: x[1], reverse=True)[:3]

    print(f"Top 3 Recommended Articles for User {user_id}:")
    for article, score in top_articles:
        sport = G.nodes[article]['sport']
        print(f"Article: {article}, Sport: {sport}")
else:
    print(f"User {user_id} not found in graph")

Top 3 Recommended Articles for User vtpks1740535964462a17db399a96d:
Article: nfl-active-news-after-ditching-mike-tomlins-steelers-justin-fields-celebrates-usd-forty-m-contract-in-style, Sport: NFL
Article: nfl-active-news-maxx-crosby-withdraws-antonio-pierce-support-as-tom-bradys-raiders-betray-hc-to-announce-firing, Sport: NFL
Article: mlb-baseball-news-blake-snell-reveals-his-one-wholesome-request-from-andrew-friedman-regarding-a-dodgers-legend, Sport: MLB


### Regular ML-Based Recommendation Algorithm -> LightFM

In [14]:
# Compute click counts
click_counts = rdf1.groupby(['ppid', 'slug']).size().reset_index(name='click_count')

In [15]:
# Debug: Check data and overlaps
print(f"udf1 shape: {udf1.shape}, Unique ppid: {udf1['ppid'].nunique()}")
print(f"rdf1 shape: {rdf1.shape}, Unique ppid: {rdf1['ppid'].nunique()}, Unique slug: {rdf1['slug'].nunique()}")
print(f"click_counts rows: {len(click_counts)}")
common_ppids = set(rdf1['ppid']).intersection(set(udf1['ppid']))
print(f"Common ppid between udf1 and rdf1: {len(common_ppids)}")
if len(common_ppids) == 0:
    print("Warning: No common ppid. Sample rdf1 ppid:")
    print(rdf1['ppid'].head())

udf1 shape: (10000, 38), Unique ppid: 8813
rdf1 shape: (2184, 12), Unique ppid: 740, Unique slug: 1260
click_counts rows: 1325
Common ppid between udf1 and rdf1: 740


#### Initialize LightFM Dataset

In [16]:
# Initialize LightFM Dataset
dataset = Dataset()

# User and item IDs
user_ids = udf1['ppid'].unique()
item_ids = rdf1['slug'].unique()

#### Setting up user features

In [17]:
# User features: age, TSS_norm, SLI_norm, sports_dict
all_sports = set()
for d in udf1['sports_dict']:
    all_sports.update(d.keys())
all_sports = sorted(list(all_sports))
user_features = []
for _, row in udf1.iterrows():
    features = [f"age:{row['age']}", f"TSS_norm:{row['TSS_norm']:.2f}", f"SLI_norm:{row['SLI_norm']:.2f}"]
    for sport in all_sports:
        count = row['sports_dict'].get(sport, 0)
        features.append(f"sport_{sport}:{np.log1p(count):.2f}")
    user_features.append(features)

#### Setting up item features

In [18]:
# Item features: sport_from_slug, has_orgs
item_features = []
for slug in item_ids:
    sport = rdf1[rdf1['slug'] == slug]['sport_from_slug'].iloc[0]
    has_orgs = 1 if rdf1[rdf1['slug'] == slug]['orgs'].iloc[0] != '' else 0
    item_features.append([f"sport:{sport}", f"has_orgs:{has_orgs}"])

#### Fitting algorithm

In [19]:
# Fit dataset
dataset.fit(
    users=user_ids,
    items=item_ids,
    user_features=[f for features in user_features for f in features],
    item_features=[f for features in item_features for f in features]
)

#### Connect nodes, edges/weights by building interactions

In [20]:
# Build interactions
interactions, weights = dataset.build_interactions(
    [(row['ppid'], row['slug'], row['click_count']) for _, row in click_counts.iterrows()
    if row['ppid'] in user_ids and row['slug'] in item_ids]
)

# Build feature matrices
user_feature_matrix = dataset.build_user_features(
    [(uid, features) for uid, features in zip(user_ids, user_features)]
)
item_feature_matrix = dataset.build_item_features(
    [(iid, features) for iid, features in zip(item_ids, item_features)]
)

#### Matrix debugging

In [21]:
# Debug: Check matrices
print(f"Interactions shape: {interactions.shape}")
print(f"User features shape: {user_feature_matrix.shape}")
print(f"Item features shape: {item_feature_matrix.shape}")
print(f"Non-zero interactions: {interactions.nnz}")
if interactions.nnz == 0:
    print("Warning: No valid interactions. Sample click_counts:")
    print(click_counts.head())

Interactions shape: (8813, 1260)
User features shape: (8813, 10101)
Item features shape: (1260, 1284)
Non-zero interactions: 1325


#### Train the model

In [22]:
# Train LightFM model
model = LightFM(loss='warp', no_components=30, learning_rate=0.05)
model.fit(
    interactions,
    user_features=user_feature_matrix,
    item_features=item_feature_matrix,
    epochs=30,
    num_threads=2,
    verbose=True
)

# Debug: Check model fit
print("Model training completed")

Epoch: 100%|██████████| 30/30 [00:00<00:00, 55.04it/s]

Model training completed





#### Generate recommendations

In [23]:
# Generate recommendations
user_id = 'vtpks1740535964462a17db399a96d'
if user_id in user_ids:
    user_idx = dataset.mapping()[0][user_id]
    scores = model.predict(user_idx, np.arange(len(item_ids)), user_features=user_feature_matrix, item_features=item_feature_matrix)
    top_indices = np.argsort(-scores)[:3]
    top_articles = [item_ids[i] for i in top_indices]
    top_scores = scores[top_indices]

    print(f"Top 3 Recommended Articles for User {user_id}:")
    for article, score in zip(top_articles, top_scores):
        sport = rdf1[rdf1['slug'] == article]['sport_from_slug'].iloc[0] if not rdf1[rdf1['slug'] == article].empty else 'unknown'
        print(f"Article: {article}, Sport: {sport}")
else:
    print(f"User {user_id} not found")

Top 3 Recommended Articles for User vtpks1740535964462a17db399a96d:
Article: nba-active-basketball-news-nba-reporter-trish-christakis-s-cbs-outfit-turns-heads-as-miami-heat-support-soars-even-after-disappointing-playoff-exit, Sport: NBA
Article: nfl-active-news-kyle-juszczyks-wife-kristin-shares-one-word-reaction-to-husbands-release-as-49ers-decision-breaks-christian-mccaffreys-heart, Sport: NFL
Article: ncaa-college-basketball-news-todd-golden-clears-stance-on-selfish-florida-locker-room-as-auburn-exposes-major-gators-problem, Sport: NCAA


#THE END (for now)