<a href="https://colab.research.google.com/github/NeuralDataMind/Engine-X/blob/main/GraphFusionPhase2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torch-geometric torch-scatter torch-sparse torch-cluster \
    sentence-transformers scikit-learn pandas numpy tqdm


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.0/210.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-cluster
  Downloading torch_cluster-1.6.3.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading tor

In [5]:
# engine_x_phase2_homo.py
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# ---------------------------
# 1) Load product and social data
# ---------------------------
product_df = pd.read_csv('/content/master_product.csv')
social_df = pd.read_csv('/content/mini_ms.csv')

# ---------------------------
# 2) Preprocess text
# ---------------------------
product_df['tags'] = product_df['tags'].fillna('').astype(str)
product_df['product_name'] = product_df['product_name'].fillna('').astype(str)
product_df['brand'] = product_df['brand'].fillna('unknown').astype(str)

social_df['text'] = social_df['text'].fillna('').astype(str)
social_df['matched_tags'] = social_df['matched_tags'].fillna('').astype(str)

# ---------------------------
# 3) Compute embeddings
# ---------------------------
embedder = SentenceTransformer('all-MiniLM-L6-v2')

product_texts = (product_df['product_name'] + ' | ' +
                 product_df['tags'] + ' | ' +
                 product_df['brand'] + ' | ' +
                 product_df['category']).tolist()
product_emb = embedder.encode(product_texts, convert_to_numpy=True)

social_texts = social_df['text'].tolist()
social_emb = embedder.encode(social_texts, convert_to_numpy=True)

# ---------------------------
# 4) Build node features
# ---------------------------
num_cols = ['mrp_inr','price_inr','discount_inr','discount_pct','rating','reviews_count','score']
product_num = StandardScaler().fit_transform(product_df[num_cols].fillna(0).values)
brand_idx = LabelEncoder().fit_transform(product_df['brand']).reshape(-1,1)
product_x = torch.from_numpy(np.concatenate([product_emb, product_num, brand_idx], axis=1).astype(np.float32))

social_time = (pd.Timestamp.now() - pd.to_datetime(social_df['created_time'], errors='coerce')).dt.total_seconds().fillna(0).values.reshape(-1,1)
social_x = torch.from_numpy(np.concatenate([social_emb, social_time], axis=1).astype(np.float32))

# Simulate users
num_users = 500
user_x = torch.rand(num_users, 64)

# Pad social and user features to match product feature dim
feat_dim = product_x.shape[1]
social_x = F.pad(social_x, (0, feat_dim - social_x.shape[1]))
user_x = F.pad(user_x, (0, feat_dim - user_x.shape[1]))

# Merge all nodes
x = torch.cat([product_x, social_x, user_x], dim=0)
num_product, num_social, num_user = product_x.size(0), social_x.size(0), user_x.size(0)

# ---------------------------
# 5) Build edges
# ---------------------------
# Product-product similarity
sims = cosine_similarity(product_emb)
edges = [(i,j) for i in range(num_product) for j in range(num_product) if i!=j and sims[i,j]>0.65]

# Product-social tag edges
prod_tag_map = {}
for i, tags in enumerate(product_df['tags']):
    for t in tags.lower().split(','):
        if t.strip(): prod_tag_map.setdefault(t.strip(), []).append(i)

for j, tags in enumerate(social_df['matched_tags']):
    for t in tags.lower().split(','):
        if t.strip() and t.strip() in prod_tag_map:
            for i in prod_tag_map[t.strip()]:
                edges.append((i, num_product+j))

# User-product interactions
interactions = []
for u in range(num_users):
    for p in np.random.choice(num_product, size=5, replace=False):
        edges.append((num_product+num_social+u, p))

# Convert edge list to tensor
edge_index = torch.tensor([[i for i,j in edges]+[j for i,j in edges],
                           [j for i,j in edges]+[i for i,j in edges]], dtype=torch.long)

# ---------------------------
# 6) Homogeneous GraphSAGE
# ---------------------------
class GraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GraphSAGE(feat_dim, 128).to(device)
x, edge_index = x.to(device), edge_index.to(device)

model.eval()
with torch.no_grad():
    z = model(x, edge_index)

# Split embeddings
prod_emb_out = z[:num_product].cpu().numpy()
social_emb_out = z[num_product:num_product+num_social].cpu().numpy()
user_emb_out = z[num_product+num_social:].cpu().numpy()

# ---------------------------
# 7) Recommendation example
# ---------------------------
from sklearn.metrics.pairwise import cosine_similarity

def recommend_for_user(user_idx, top_k=5):
    sims = cosine_similarity(user_emb_out[user_idx].reshape(1,-1), prod_emb_out).flatten()
    idxs = sims.argsort()[::-1][:top_k]
    return idxs, sims[idxs]

idxs, scores = recommend_for_user(42)
print("Top products for user 42:", idxs)
for i, sc in zip(idxs, scores):
    print(f"Product: {product_df.iloc[i].product_name} | score {sc:.4f}")


Top products for user 42: [602 246  60 486 702]
Product: Adams Restaurants | score 1.0000
Product: Adams Fast Food | score 1.0000
Product: Jackson, Desserts | score 1.0000
Product: Adams-Smith Bedding | score 0.8172
Product: Smith, Appliances | score 0.7926


In [8]:
# ---------------------------
# 8) Save model & embeddings
# ---------------------------
# Save PyTorch model
torch.save(model.state_dict(), 'graphsage_homo_phase2.pth')
print("Saved model weights to 'graphsage_homo_phase2.pth'.")

# Save embeddings
np.save('product_embeddings.npy', prod_emb_out)
np.save('social_embeddings.npy', social_emb_out)
np.save('user_embeddings.npy', user_emb_out)
print("Saved product, social, and user embeddings to disk.")


Saved model weights to 'graphsage_homo_phase2.pth'.
Saved product, social, and user embeddings to disk.
