<a href="https://colab.research.google.com/github/NeuralDataMind/Engine-X/blob/main/hybride.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torch-geometric torch-scatter torch-sparse torch-cluster \
    sentence-transformers scikit-learn pandas numpy tqdm


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-cluster
  Downloading torch_cluster-1.6.3.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading tor

In [None]:
# Engine X - Hybrid Content + Product Recommendation using GNN
# Single-file pipeline (run locally).
# Assumes the product and social data are saved as CSVs: '/content/master_product.csv' and '/content/mini_ms.csv' (see section 1)
# Key ideas implemented:
# 1. Preprocess product and social text, numeric features
# 2. Create content embeddings (SentenceTransformer)
# 3. Build a heterogeneous graph: product nodes + social nodes
# 4. Add edges: product-product (content similarity), product-social (tag match & similarity)
# 5. Train an unsupervised GraphSAGE encoder using link reconstruction (positive + negative sampling)
# 6. Produce final node embeddings and compute hybrid recommendations via weighted similarity

# NOTES:
# - Install required libs: torch, torch_geometric, sentence-transformers, scikit-learn, pandas, numpy
# - For large graphs, use NeighborSampler / mini-batching (PyG). For 1k products + 1k social it's small.
# - This script uses CPU/GPU depending on available device.

import os
import math
import random
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# PyG imports
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.utils import negative_sampling, train_test_split_edges

# Text / embedding
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ---------------------------
# 1) Load data (adjust paths)
# ---------------------------
# Input locations. The default directory is the original Colab layout ('/content');
# set the ENGINE_X_DATA_DIR environment variable to read the CSVs from elsewhere.
DATA_DIR = Path(os.environ.get('ENGINE_X_DATA_DIR', '/content'))
product_path = str(DATA_DIR / 'master_product.csv')  # your 1000-row product file
social_path = str(DATA_DIR / 'mini_ms.csv')  # your 1000-row social sample (or subset)

# Fail early with an actionable message instead of a bare pandas traceback.
for _csv_path in (product_path, social_path):
    if not os.path.isfile(_csv_path):
        raise FileNotFoundError(
            f'Input CSV not found: {_csv_path}. '
            'Upload the file or point ENGINE_X_DATA_DIR at the folder that contains it.'
        )

product_df = pd.read_csv(product_path)
social_df = pd.read_csv(social_path)

# Columns read by the later stages of this pipeline. Checking them here gives one
# clear error up front rather than a KeyError after the embeddings have been computed.
_required_product_cols = {
    'product_id', 'product_name', 'tags', 'brand', 'category',
    'mrp_inr', 'price_inr', 'discount_inr', 'discount_pct',
    'rating', 'reviews_count', 'score',
}
_required_social_cols = {'text', 'matched_tags', 'created_time'}
for _label, _frame, _required in (
    ('product', product_df, _required_product_cols),
    ('social', social_df, _required_social_cols),
):
    _missing = sorted(_required - set(_frame.columns))
    if _missing:
        raise KeyError(f'{_label} CSV is missing required columns: {_missing}')

print('product:', product_df.shape)
print('social:', social_df.shape)

# ---------------------------
# 2) Basic preprocessing
# ---------------------------
# Fill/normalize the text fields so every value is a (possibly empty) string.
product_df['tags'] = product_df['tags'].fillna('').astype(str)
product_df['product_name'] = product_df['product_name'].fillna('').astype(str)
product_df['brand'] = product_df['brand'].fillna('unknown').astype(str)

social_df['text'] = social_df['text'].fillna('').astype(str)
social_df['matched_tags'] = social_df['matched_tags'].fillna('').astype(str)

# Normalise created_time to datetimes:
#   - numeric columns are interpreted as epoch seconds
#   - anything else is parsed as a date string
# errors='coerce' is used in BOTH branches, so a single bad value becomes NaT instead
# of aborting the conversion. pd.api.types.is_numeric_dtype is used because
# np.issubdtype can raise TypeError on pandas extension dtypes.
# A missing 'created_time' column raises KeyError here rather than being silently ignored.
_created_raw = social_df['created_time']
if pd.api.types.is_numeric_dtype(_created_raw):
    social_df['created_time'] = pd.to_datetime(_created_raw, unit='s', errors='coerce')
else:
    social_df['created_time'] = pd.to_datetime(_created_raw, errors='coerce')

# Surface data-quality problems instead of hiding them.
_n_bad_times = int(social_df['created_time'].isna().sum())
if _n_bad_times:
    print(f'Warning: {_n_bad_times} social rows have a missing/unparseable created_time')

# ---------------------------
# 3) Generate content embeddings
# ---------------------------
# Use a compact sentence-transformer model (user can replace with any model).
embed_model_name = 'all-MiniLM-L6-v2'  # small & fast
embedder = SentenceTransformer(embed_model_name)

# Compose product text (name + tags + brand + category).
# 'category' is not cleaned in the preprocessing step, so normalise it here: a NaN
# category would otherwise turn the whole concatenated string into NaN, and a
# non-string column would make the .str accessor raise.
product_category = product_df['category'].fillna('').astype(str)
product_texts = (
    product_df['product_name'].str.strip() + ' | ' +
    product_df['tags'].str.strip() + ' | ' +
    product_df['brand'].str.strip() + ' | ' +
    product_category.str.strip()
).tolist()

social_texts = social_df['text'].tolist()

# Compute embeddings as NumPy arrays (they are cast to float32 when node features are built)
print('Computing product embeddings...')
product_emb = embedder.encode(product_texts, show_progress_bar=True, convert_to_numpy=True)
print('Computing social embeddings...')
social_emb = embedder.encode(social_texts, show_progress_bar=True, convert_to_numpy=True)

# Optionally reduce dimensionality using PCA/UMAP if needed for speed (not done here)

# ---------------------------
# 4) Build features for nodes
# ---------------------------
# Product numeric features, standardised to zero mean / unit variance.
num_cols = ['mrp_inr', 'price_inr', 'discount_inr', 'discount_pct', 'rating', 'reviews_count', 'score']
product_num = product_df[num_cols].fillna(0).values.astype(np.float32)
scaler = StandardScaler()
product_num_scaled = scaler.fit_transform(product_num).astype(np.float32)

# Brand as a label-encoded index (kept in product_brand_idx for lookups).
le_brand = LabelEncoder()
product_brand_idx = le_brand.fit_transform(product_df['brand'].astype(str))
product_brand_idx = product_brand_idx.reshape(-1, 1).astype(np.float32)
# The raw class id (0..n_brands-1) is on a much larger scale than the text embedding
# dimensions, so it is standardised before being used as a feature.
# NOTE(review): a label id is still an arbitrary ordinal; one-hot or a learned brand
# embedding would be a better encoding if the feature width may change.
product_brand_feat = StandardScaler().fit_transform(product_brand_idx).astype(np.float32)

# Final product feature vector: [content_emb || numeric || brand]
product_feat = np.concatenate(
    [product_emb.astype(np.float32), product_num_scaled, product_brand_feat], axis=1
)

# Social node features: text embedding + post age.
# Both sides of the subtraction are made timezone-aware (UTC) so tz-aware timestamps
# do not raise; naive timestamps are assumed to already be in UTC.
social_created = pd.to_datetime(social_df['created_time'], errors='coerce', utc=True)
social_age_days = (pd.Timestamp.now(tz='UTC') - social_created).dt.total_seconds() / 86400.0
# Age in raw seconds is many orders of magnitude larger than the embedding values and
# would dominate the network input, so compress it with log1p and standardise it.
social_age_log = np.log1p(social_age_days.clip(lower=0))
# Rows with an unknown timestamp get the median age rather than "posted just now".
_age_fill = social_age_log.median()
social_age_log = social_age_log.fillna(0.0 if pd.isna(_age_fill) else _age_fill)
social_time_delta = StandardScaler().fit_transform(
    social_age_log.to_numpy(dtype=np.float64).reshape(-1, 1)
).astype(np.float32)

social_feat = np.concatenate([social_emb.astype(np.float32), social_time_delta], axis=1)

# Convert to torch tensors (float32, matching the model parameters)
product_x = torch.from_numpy(product_feat)
social_x = torch.from_numpy(social_feat)

# ---------------------------
# 5) Create edges
# ---------------------------
# We'll create three types of edges (undirected):
#  - product-product by content similarity (kNN)
#  - product-social by exact tag match
#  - product-social by text similarity (thresholded)

def build_pp_edges_from_similarity(embeddings, top_k=10, min_sim=0.6):
    """Build undirected k-nearest-neighbour edges between the rows of `embeddings`.

    Each row is linked to at most `top_k` other rows with the highest cosine
    similarity, keeping only pairs whose similarity is >= `min_sim`.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        top_k: maximum number of neighbours considered per row.
        min_sim: minimum cosine similarity for an edge to be kept.

    Returns:
        Sorted list of unique `(a, b)` tuples of Python ints with `a < b`.
        Empty when there are fewer than two rows or `top_k <= 0`.
    """
    embeddings = np.asarray(embeddings)
    if len(embeddings) < 2 or top_k <= 0:
        return []

    sims = cosine_similarity(embeddings)
    # Mask self-similarity up front. Taking the top_k + 1 best and skipping `i`
    # afterwards is wrong when exact duplicates push `i` out of that window:
    # the row then ends up with top_k + 1 neighbours.
    np.fill_diagonal(sims, -np.inf)

    n = sims.shape[0]
    k = min(top_k, n - 1)
    edges = set()
    for i in range(n):
        # stable sort => ties are broken by index, so the result is deterministic
        neighbours = np.argsort(-sims[i], kind='stable')[:k]
        for j in neighbours:
            j = int(j)
            if sims[i, j] >= min_sim:
                edges.add((min(i, j), max(i, j)))
    # sorted() gives a reproducible edge order independent of set iteration order
    return sorted(edges)

print('Building product-product edges...')
pp_edges = build_pp_edges_from_similarity(product_emb, top_k=20, min_sim=0.65)
print(f'pp edges: {len(pp_edges)}')

# product-social edges: exact (case-insensitive) tag matches
print('Building product-social tag-match edges...')


def _split_tags(raw_tags):
    """Split a comma-separated tag string into stripped, lower-cased, non-empty tags."""
    return [piece.strip().lower() for piece in raw_tags.split(',') if piece.strip()]


# tag -> list of product row positions carrying that tag
prod_tag_map = {}
for prod_pos, raw_tags in enumerate(product_df['tags'].astype(str)):
    for tag in _split_tags(raw_tags):
        prod_tag_map.setdefault(tag, []).append(prod_pos)

# Pairs are stored as (product row, social row) using each table's own 0-based
# positions; the offset that maps social rows to graph node ids is applied later.
ps_edges = {
    (prod_pos, soc_pos)
    for soc_pos, raw_tags in enumerate(social_df['matched_tags'].astype(str))
    for tag in _split_tags(raw_tags)
    for prod_pos in prod_tag_map.get(tag, ())
}

print(f'ps edges from tags: {len(ps_edges)}')

# product-social edges by embedding similarity (thresholded)
print('Building product-social similarity edges...')
threshold = 0.7
ps_sim_matrix = cosine_similarity(product_emb, social_emb)
P, S = ps_sim_matrix.shape

# Every (product row, social row) cell at or above the threshold becomes an edge.
ps_sim_edges = {
    (int(prod_pos), int(soc_pos))
    for prod_pos, soc_pos in np.argwhere(ps_sim_matrix >= threshold)
}

print(f'ps edges from sim: {len(ps_sim_edges)}')

# Convert to homogeneous edge_index for torch_geometric
# Node indexing: 0..P-1 -> product nodes; P..P+S-1 -> social nodes
P = product_x.size(0)
S = social_x.size(0)

# Collect every undirected edge exactly once as (low, high). A product-social pair can
# be produced by BOTH the tag match and the similarity threshold; without
# de-duplication the same edge would appear twice and could end up in both the
# training and the validation split.
undirected_edges = set()
for a, b in pp_edges:
    a, b = int(a), int(b)
    if a != b:  # no self-loops
        undirected_edges.add((min(a, b), max(a, b)))
for p, s in set(ps_edges) | set(ps_sim_edges):
    # product ids are < P and social ids are >= P, so the pair is already (low, high)
    undirected_edges.add((int(p), P + int(s)))

# Emit both directions of each edge, in a reproducible order.
edge_list = []
for a, b in sorted(undirected_edges):
    edge_list.append((a, b))
    edge_list.append((b, a))

if edge_list:
    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
else:
    # torch.tensor([]) would have shape [0]; keep the [2, E] layout with E = 0
    edge_index = torch.empty((2, 0), dtype=torch.long)
print('Total edges:', edge_index.size(1))

# Node features: product rows first, then social rows, in one matrix.
# The two node types have different feature widths, so the narrower one is
# right-padded with zero columns before stacking.
print(f'product_x.shape = {product_x.shape}, social_x.shape = {social_x.shape}')
if product_x.size(1) != social_x.size(1):
    max_dim = max(product_x.size(1), social_x.size(1))

    def pad_features(tensor, target_dim):
        """Return `tensor` right-padded with zero columns up to `target_dim` columns."""
        missing_cols = target_dim - tensor.size(1)
        if missing_cols <= 0:
            return tensor
        # (0, missing_cols): pad nothing on the left, `missing_cols` zeros on the
        # right of the last dimension
        return F.pad(tensor, (0, missing_cols))

    product_x = pad_features(product_x, max_dim)
    social_x = pad_features(social_x, max_dim)
    print(f'Padded to match dims -> product_x.shape = {product_x.shape}, social_x.shape = {social_x.shape}')

# Widths now agree, so the two blocks can be stacked row-wise
x = torch.cat((product_x, social_x), dim=0)
print(f'Merged node feature shape: {x.shape}')

# ---------------------------
# 6) Create PyG Data and train/test split for link prediction
# ---------------------------
# Seed every RNG used from here on (edge split, weight init, dropout, negative
# sampling) so that a Restart & Run All reproduces the same numbers.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

data = Data(x=x, edge_index=edge_index)
# train_test_split_edges expects 'data' to be undirected and will create data.train_pos_edge_index etc.
# NOTE(review): PyG documents this helper as deprecated in favour of
# torch_geometric.transforms.RandomLinkSplit; migrating changes the attribute names
# read by the training loop, so it is left as-is here.
data = train_test_split_edges(data, val_ratio=0.05, test_ratio=0.1)

# ---------------------------
# 7) GraphSAGE encoder + Link Predictor (linear layer over concatenated node-pair embeddings)
# ---------------------------
class GraphSAGEEncoder(nn.Module):
    """Stack of SAGEConv layers that maps node features to node embeddings.

    Args:
        in_channels: width of the input node features.
        hidden_channels: width of every layer's output (also the embedding size).
        num_layers: number of SAGEConv layers; at least one layer is always created.
        dropout: dropout probability applied between layers during training.
    """

    def __init__(self, in_channels, hidden_channels, num_layers=2, dropout=0.2):
        super().__init__()
        self.convs = nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return one embedding per node.

        ReLU and dropout are applied between layers only. The last layer's output
        is returned as-is: clamping it with ReLU would force every embedding to be
        non-negative (so cosine similarities could never go below 0), and dropout
        on the output would randomly zero embedding dimensions during training.
        """
        last_layer = len(self.convs) - 1
        for layer_idx, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if layer_idx < last_layer:
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x

class LinkPredictor(nn.Module):
    """Scores candidate edges from the embeddings of their two endpoints.

    The source and destination embeddings are concatenated and passed through a
    single linear layer followed by a sigmoid, giving one probability per edge.
    Because the endpoints are concatenated in (source, destination) order, the
    score of (u, v) is not guaranteed to equal the score of (v, u).
    """

    def __init__(self, emb_dim):
        super().__init__()
        # input is [source || destination], hence twice the embedding width
        self.lin = nn.Linear(2 * emb_dim, 1)

    def forward(self, z, edge_index):
        """Return a 1-D tensor of edge probabilities for `edge_index` of shape [2, E]."""
        source_rows, target_rows = edge_index[0], edge_index[1]
        endpoint_pairs = torch.cat((z[source_rows], z[target_rows]), dim=1)
        logits = self.lin(endpoint_pairs)
        return logits.sigmoid().squeeze(-1)

# ---------------------------
# 8) Training loop for link prediction
# ---------------------------
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

# Model sizes: the input width comes from the merged node-feature matrix.
emb_dim = 128
in_dim = data.num_features

# The encoder is built before the predictor so parameter initialisation order is unchanged.
encoder = GraphSAGEEncoder(in_dim, emb_dim, num_layers=2, dropout=0.2).to(device)
predictor = LinkPredictor(emb_dim).to(device)

# A single optimizer updates both modules jointly.
trainable_params = [*encoder.parameters(), *predictor.parameters()]
optimizer = torch.optim.Adam(trainable_params, lr=1e-3, weight_decay=1e-5)

def get_link_labels(pos_edge_index, neg_edge_index):
    """Build binary targets for a batch of positive edges followed by negative edges.

    Args:
        pos_edge_index: tensor of shape [2, E_pos] holding the positive edges.
        neg_edge_index: tensor of shape [2, E_neg] holding the negative edges.

    Returns:
        float32 tensor of length E_pos + E_neg: ones for the positives, then zeros
        for the negatives. It is allocated directly on `pos_edge_index`'s device,
        so the function does not depend on a module-level `device` variable and
        avoids a CPU allocation followed by a copy.
    """
    num_pos = pos_edge_index.size(1)
    num_neg = neg_edge_index.size(1)
    labels = torch.zeros(num_pos + num_neg, dtype=torch.float, device=pos_edge_index.device)
    labels[:num_pos] = 1.0
    return labels

# Move everything that stays constant across epochs to the device ONCE.
x = data.x.to(device)
train_pos_edge_index = data.train_pos_edge_index.to(device)

# Imported once here instead of on every evaluation step inside the loop.
from sklearn.metrics import roc_auc_score, average_precision_score

# The validation edges and their labels never change, so build them up front.
val_pos = data.val_pos_edge_index.to(device)
val_neg = data.val_neg_edge_index.to(device)
val_edge = torch.cat([val_pos, val_neg], dim=1)
val_labels = get_link_labels(val_pos, val_neg).cpu()

# We'll sample negative edges each epoch using negative_sampling
epochs = 80
for epoch in range(1, epochs + 1):
    encoder.train()
    predictor.train()
    optimizer.zero_grad()

    # Message passing uses the training edges only.
    z = encoder(x, train_pos_edge_index)

    # positive edges
    pos_edge = train_pos_edge_index
    # negative edges: as many as there are positives, re-drawn every epoch
    neg_edge = negative_sampling(
        edge_index=train_pos_edge_index, num_nodes=data.num_nodes,
        num_neg_samples=pos_edge.size(1)
    ).to(device)

    # prepare edge_index for predictor (positives first, then negatives)
    edge_idx = torch.cat([pos_edge, neg_edge], dim=1)
    preds = predictor(z, edge_idx)
    labels = get_link_labels(pos_edge, neg_edge)

    # The predictor already applies a sigmoid, hence plain BCE on probabilities.
    loss = F.binary_cross_entropy(preds, labels)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0 or epoch == 1:
        # evaluate on val
        encoder.eval()
        predictor.eval()
        with torch.no_grad():
            z = encoder(x, train_pos_edge_index)
            val_preds = predictor(z, val_edge).cpu()
        try:
            auc = roc_auc_score(val_labels.numpy(), val_preds.numpy())
            ap = average_precision_score(val_labels.numpy(), val_preds.numpy())
        except ValueError as err:
            # The metrics can reject degenerate input (e.g. a single class or NaN
            # scores). Report why, and use NaN so an undefined metric is not
            # mistaken for a genuine score of 0.
            print(f'Warning: validation metrics unavailable ({err})')
            auc, ap = float('nan'), float('nan')
        print(f'Epoch {epoch:03d} | Loss {loss.item():.4f} | Val AUC {auc:.4f} | Val AP {ap:.4f}')

# ---------------------------
# 9) Produce final embeddings & hybrid recommendations
# ---------------------------
# Inference pass: dropout disabled, no gradient tracking; result moved to NumPy.
encoder.eval()
with torch.no_grad():
    final_z = encoder(x.to(device), data.train_pos_edge_index.to(device)).cpu().numpy()

# Rows 0..P-1 are products, the remaining rows are social nodes
prod_z, soc_z = final_z[:P], final_z[P:]

# Hybrid recommendation for a product p0:
#  - graph_sim: cosine similarity in GNN-embedding space between products
#  - content_sim: cosine similarity in original content embedding space (product_emb)
# final_score = alpha * graph_sim + (1-alpha) * content_sim

from sklearn.metrics.pairwise import cosine_similarity
alpha = 0.6  # weight for graph vs content

# Pairwise product similarity in each space
prod_graph_sim = cosine_similarity(prod_z)          # GNN embedding space
prod_content_sim = cosine_similarity(product_emb)   # original text-embedding space

# Convex combination of the two similarity matrices
hybrid_sim = alpha * prod_graph_sim + (1 - alpha) * prod_content_sim

# Recommendation function
import numpy as np

def recommend_for_product(prod_idx, top_k=10):
    """Return the `top_k` products most similar to product `prod_idx`.

    Similarities are read from the module-level `hybrid_sim` matrix.

    Args:
        prod_idx: row position of the query product in `hybrid_sim`.
        top_k: number of recommendations wanted; capped at the number of other
            products so the query itself is never returned.

    Returns:
        `(idxs, scores)`: product row positions ordered by decreasing similarity,
        and their hybrid similarity scores.
    """
    # Work on a copy: `hybrid_sim[prod_idx]` is a view, so writing the self-mask
    # into it would permanently overwrite the diagonal of the shared matrix.
    sims = hybrid_sim[prod_idx].copy()
    # -inf (not -1) so the query always sorts last, whatever the other scores are
    sims[prod_idx] = -np.inf
    k = max(0, min(int(top_k), sims.shape[0] - 1))
    idxs = np.argsort(-sims)[:k]
    return idxs, sims[idxs]

# Example usage: top-10 recommendations for the product in row 0
rec_idxs, rec_scores = recommend_for_product(0, top_k=10)
print('Recommended product indices:', rec_idxs)
print('Scores:', rec_scores)

# Map row positions back to product ids/names (names truncated to 80 characters)
for prod_pos, score in zip(rec_idxs, rec_scores):
    rec_row = product_df.iloc[prod_pos]
    print(f'Product: {rec_row.product_id} | {rec_row.product_name[:80]} | score {score:.4f}')

# ---------------------------
# 10) Extra suggestions & improvements
# ---------------------------
# - For larger graphs: use torch_geometric.loader.NeighborSampler or ClusterData + ClusterLoader
# - Consider heterogeneous GNNs (HeteroData) if you want type-specific convs for products vs social
# - Try other objectives: (a) supervised ranking using known product co-purchases, (b) BPR loss
# - Add features: platform_type encoding, time-decay weighting for social mentions
# - Use faiss or Annoy for fast nearest neighbour search in production
# - Persist embeddings: save npy or to vector DB for online retrieval

# Persist the GNN embeddings as .npy files in the current working directory
for npy_name, emb_matrix in (
    ('prod_embeddings.npy', prod_z),
    ('social_embeddings.npy', soc_z),
):
    np.save(npy_name, emb_matrix)
print('Saved embeddings to disk.')


product: (1000, 15)
social: (1000, 10)
Computing product embeddings...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Computing social embeddings...


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Building product-product edges...
pp edges: 10493
Building product-social tag-match edges...
ps edges from tags: 188
Building product-social similarity edges...
ps edges from sim: 0
Total edges: 21362
product_x.shape = torch.Size([1000, 392]), social_x.shape = torch.Size([1000, 385])
Padded to match dims -> product_x.shape = torch.Size([1000, 392]), social_x.shape = torch.Size([1000, 392])
Merged node feature shape: torch.Size([2000, 392])




Using device: cpu
Epoch 001 | Loss 35.8134 | Val AUC 0.1511 | Val AP 0.3327
Epoch 010 | Loss 24.5363 | Val AUC 0.6353 | Val AP 0.5192
Epoch 020 | Loss 14.9444 | Val AUC 0.8704 | Val AP 0.7826
Epoch 030 | Loss 6.1798 | Val AUC 0.8618 | Val AP 0.7621
Epoch 040 | Loss 4.1663 | Val AUC 0.8693 | Val AP 0.7728
Epoch 050 | Loss 3.4061 | Val AUC 0.8724 | Val AP 0.7763
Epoch 060 | Loss 3.2001 | Val AUC 0.8713 | Val AP 0.7755
Epoch 070 | Loss 2.7730 | Val AUC 0.8831 | Val AP 0.8005
Epoch 080 | Loss 2.4817 | Val AUC 0.8878 | Val AP 0.8098
Recommended product indices: [941 737 359 824   2 571 674 884  32 886]
Scores: [0.9421574  0.88597167 0.8806924  0.87051296 0.8697399  0.8668969
 0.8665118  0.86566365 0.86520004 0.8647807 ]
Product: c71f191c-2b89-4717-ac6b-63a2308e7ef6 | Bates Beverages | score 0.9422
Product: b57615a0-2e25-44dd-8803-56f8c70d7aa7 | Campos Beverages | score 0.8860
Product: ec0616f5-5b21-4e67-9987-96965755be70 | Day-Pace Beverages | score 0.8807
Product: 07aa8bf0-7280-43b5-905a-f