Imports + version check

In [8]:
import sys, pandas as pd, numpy as np, torch, networkx as nx, matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, normalize
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn as nn, torch.optim as optim

print("Python:", sys.version.split()[0])
print("Pandas:", pd.__version__)
print("PyTorch:", torch.__version__)


Python: 3.13.7
Pandas: 2.3.2
PyTorch: 2.8.0+cpu


Load your CSV and verify columns

In [9]:
import pandas as pd

csv_path = "YouTube_real_Dataset_Shuffled.csv"  # change path if your file is elsewhere
df_raw = pd.read_csv(csv_path)

print("Raw shape:", df_raw.shape)
print("Columns:", list(df_raw.columns))

required_cols = [
    'YouTuber Name', 'Category', 'Audience Country',
    'Subscribers', 'Avg Views', 'Avg Likes', 'Avg Comments'
]
missing = [c for c in required_cols if c not in df_raw.columns]
print("Missing required columns:", missing)

df_raw.head(3)


Raw shape: (458, 8)
Columns: ['YouTuber Name', 'Channel Name', 'Category', 'Subscribers', 'Audience Country', 'Avg Views', 'Avg Likes', 'Avg Comments']
Missing required columns: []


Unnamed: 0,YouTuber Name,Channel Name,Category,Subscribers,Audience Country,Avg Views,Avg Likes,Avg Comments
0,Home Decor Magic,Home Decor Magic,Lifestyle,59200,IN,87693.84,2840.42,22.64
1,Spice Food,Spice Food,food,536000,IN,19785.0,484.0,59.0
2,The Food Club,The Food Club,food,732000,KR,88212.0,411.0,25.0


Clean and encode the data

In [10]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Work on a clean copy
df_clean = df_raw.copy()

# Normalize text case
df_clean['Category'] = df_clean['Category'].astype(str).str.lower()

# Drop rows that could break the graph (NaNs in required columns)
needed_cols = [
    'YouTuber Name','Category','Audience Country',
    'Subscribers','Avg Views','Avg Likes','Avg Comments'
]
df_clean = df_clean.dropna(subset=needed_cols).reset_index(drop=True)

# Scale numeric columns to 0–1
num_cols = ['Subscribers','Avg Views','Avg Likes','Avg Comments']
scaler = MinMaxScaler()
df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])

# Encode category and audience country
cat_le  = LabelEncoder()
ctry_le = LabelEncoder()
df_clean['Category Encoded'] = cat_le.fit_transform(df_clean['Category'])
df_clean['Audience Country Encoded'] = ctry_le.fit_transform(df_clean['Audience Country'])

print("Clean shape:", df_clean.shape)
df_clean.head(5)


Clean shape: (449, 10)


Unnamed: 0,YouTuber Name,Channel Name,Category,Subscribers,Audience Country,Avg Views,Avg Likes,Avg Comments,Category Encoded,Audience Country Encoded
0,Home Decor Magic,Home Decor Magic,lifestyle,0.001316,IN,0.001964,0.000751,0.000141,1,11
1,Spice Food,Spice Food,food,0.011911,IN,0.000443,0.000128,0.000367,0,11
2,The Food Club,The Food Club,food,0.016267,KR,0.001976,0.000109,0.000155,0,14
3,mahuas daily vlog,mahuas daily vlog,lifestyle,4.7e-05,IN,4.5e-05,1.2e-05,2.7e-05,1,11
4,Tech Review 360,Tech Review 360,technology,6.2e-05,IN,0.000156,6.3e-05,0.000637,2,11


Build the Graph (category-based edges, aligned with df_clean)

In [11]:
import itertools
import torch
from torch_geometric.data import Data

# Number of nodes
n = len(df_clean)
edges = []

# Build fully connected subgraphs per category
for cat_val, group in df_clean.groupby('Category Encoded'):
    idx_list = group.index.to_list()
    for u, v in itertools.combinations(idx_list, 2):
        edges.append((u, v))
        edges.append((v, u))  # undirected (add both directions)

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

print(f"✅ Graph built successfully!")
print(f"Nodes: {n}")
print(f"Edges: {edge_index.shape[1]}")
print(f"Max node index: {edge_index.max().item()}")
print(f"Min node index: {edge_index.min().item()}")


✅ Graph built successfully!
Nodes: 449
Edges: 64136
Max node index: 448
Min node index: 0


✅ Step 5 — Create the feature tensor and PyTorch Geometric data object

In [12]:
import torch

# Select numeric columns as features
num_cols = ['Subscribers', 'Avg Views', 'Avg Likes', 'Avg Comments']
X = torch.tensor(df_clean[num_cols].values, dtype=torch.float)

# Verify alignment
print("Feature tensor shape:", X.shape)
print("Row count match:", len(df_clean) == X.shape[0])

# Build PyG Data object
data = Data(x=X, edge_index=edge_index)
print("\nPyG Data object:")
print(data)


Feature tensor shape: torch.Size([449, 4])
Row count match: True

PyG Data object:
Data(x=[449, 4], edge_index=[2, 64136])


✅ Step 6 — Define and Initialize the GraphSAGE Model

In [13]:
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import SAGEConv

# Define GraphSAGE model (2 layers)
class GraphSAGEModel(nn.Module):
    def __init__(self, in_dim=4, hid_dim=16, out_dim=8):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hid_dim)
        self.conv2 = SAGEConv(hid_dim, out_dim)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return x

# Initialize model
model = GraphSAGEModel(in_dim=4, hid_dim=16, out_dim=8)
print(model)


GraphSAGEModel(
  (conv1): SAGEConv(4, 16, aggr=mean)
  (conv2): SAGEConv(16, 8, aggr=mean)
  (dropout): Dropout(p=0.5, inplace=False)
)


✅ Step 7 — Train the GraphSAGE Model

In [14]:
import torch

epochs = 100
optimizer = optim.Adam(model.parameters(), lr=0.01)

for ep in range(epochs):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = torch.norm(out, p=2)  # simple embedding regularization
    loss.backward()
    optimizer.step()
    if ep % 10 == 0:
        print(f"Epoch {ep:03d} | Loss: {loss.item():.6f}")

torch.save(model.state_dict(), "gnn_sage_model.pth")
print("\n✅ Model training complete and saved as 'gnn_sage_model.pth'")


Epoch 000 | Loss: 19.140820
Epoch 010 | Loss: 6.386575
Epoch 020 | Loss: 3.114464
Epoch 030 | Loss: 1.448270
Epoch 040 | Loss: 0.943537
Epoch 050 | Loss: 0.491180
Epoch 060 | Loss: 0.499255
Epoch 070 | Loss: 0.328183
Epoch 080 | Loss: 0.425610
Epoch 090 | Loss: 0.461933

✅ Model training complete and saved as 'gnn_sage_model.pth'


✅ Step 8 — Generate Node Embeddings (from the trained GraphSAGE model)

In [15]:
model.eval()
with torch.no_grad():
    embeddings = model(data.x, data.edge_index).cpu().numpy()

print("✅ Embeddings generated successfully!")
print("Embeddings shape:", embeddings.shape)
print("First 3 embedding vectors:\n", embeddings[:3])


✅ Embeddings generated successfully!
Embeddings shape: (449, 8)
First 3 embedding vectors:
 [[-0.00279677 -0.00529433  0.00277727 -0.00320544 -0.00906994 -0.00373463
  -0.00556497  0.00479899]
 [-0.00738054 -0.00197927  0.00313058 -0.00202019 -0.00934129 -0.00393215
  -0.00903609  0.00402637]
 [-0.00737972 -0.00198167  0.00313306 -0.002023   -0.00934437 -0.00393693
  -0.00903148  0.00402105]]


✅ Step 9 — Compute Cosine Similarity Matrix

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
import pandas as pd

# Normalize the embeddings first
embeddings_norm = normalize(embeddings, axis=1)

# Compute cosine similarity across all influencers
similarity_matrix = cosine_similarity(embeddings_norm)

print("✅ Cosine similarity matrix computed!")
print("Shape:", similarity_matrix.shape)
print("Sample (first 3x3):\n", similarity_matrix[:3, :3])

# Optional: save for inspection
pd.DataFrame(
    similarity_matrix,
    index=df_clean['YouTuber Name'],
    columns=df_clean['YouTuber Name']
).to_csv("Influencer_Similarity_GraphSAGE.csv")
print("\n💾 Saved as Influencer_Similarity_GraphSAGE.csv")


✅ Cosine similarity matrix computed!
Shape: (449, 449)
Sample (first 3x3):
 [[1.        0.9124867 0.9125863]
 [0.9124867 1.0000001 0.9999999]
 [0.9125863 0.9999999 1.0000001]]

💾 Saved as Influencer_Similarity_GraphSAGE.csv


In [20]:
import pandas as pd
import numpy as np

# Reload your cleaned dataset
df_clean = pd.read_csv("YouTube_real_Dataset_Shuffled.csv")

# Normalize column names (if not yet)
df_clean.columns = df_clean.columns.str.strip()

# Ensure Category is lowercase
df_clean['Category'] = df_clean['Category'].astype(str).str.lower()

# Reload your pre-computed similarity matrix
similarity_matrix = pd.read_csv("Influencer_Similarity_GraphSAGE.csv", index_col=0).values

print("✅ Data reloaded:")
print("df_clean shape:", df_clean.shape)
print("similarity_matrix shape:", similarity_matrix.shape)


✅ Data reloaded:
df_clean shape: (458, 8)
similarity_matrix shape: (449, 449)


In [22]:
import pandas as pd
import numpy as np

# Reload and clean same way as during training
df_clean = pd.read_csv("YouTube_real_Dataset_Shuffled.csv")

# Drop rows with missing critical values
df_clean = df_clean.dropna(subset=['Audience Country', 'Subscribers', 'Avg Views', 'Avg Likes', 'Avg Comments']).reset_index(drop=True)

# Normalize text
df_clean['Category'] = df_clean['Category'].astype(str).str.lower()

# Verify shapes
print("✅ Cleaned data shape:", df_clean.shape)

# Reload similarity matrix
similarity_matrix = pd.read_csv("Influencer_Similarity_GraphSAGE.csv", index_col=0).values
print("✅ Similarity matrix shape:", similarity_matrix.shape)


✅ Cleaned data shape: (449, 8)
✅ Similarity matrix shape: (449, 449)


In [24]:
import numpy as np

all_top5 = {}

for cat in sorted(df_clean['Category'].unique()):
    print(f"\n🔥 Top 5 Influencers in category: {cat.upper()}")

    cat_idx = df_clean.index[df_clean['Category'] == cat].to_numpy()
    sub_sim = similarity_matrix[np.ix_(cat_idx, cat_idx)]
    sim_scores = sub_sim.sum(axis=1)

    cat_table = df_clean.loc[cat_idx, ['YouTuber Name', 'Subscribers', 'Avg Views', 'Avg Likes', 'Category']].copy()
    cat_table['Similarity Score'] = sim_scores

    top5 = cat_table.sort_values(by='Similarity Score', ascending=False).head(5).reset_index(drop=True)
    all_top5[cat] = top5
    display(top5)
    top5.to_csv(f"top5_{cat.lower()}.csv", index=False)

print("\n✅ Top-5 influencers computed and saved for each category.")



🔥 Top 5 Influencers in category: FOOD


Unnamed: 0,YouTuber Name,Subscribers,Avg Views,Avg Likes,Category,Similarity Score
0,Country foods,1540000,45363.0,595.0,food,93.97247
1,Flavours Of Food,1310000,2572.0,90.0,food,93.972405
2,Food Lovers TV,1190000,0.0,0.0,food,93.972346
3,Tiny Foodkey,1030000,28949.42,101.32,food,93.972247
4,BD Best Ever Food,1010000,75560.0,1171.0,food,93.972216



🔥 Top 5 Influencers in category: LIFESTYLE


Unnamed: 0,YouTuber Name,Subscribers,Avg Views,Avg Likes,Category,Similarity Score
0,Daily Routine,380000,15689.72,809.26,lifestyle,213.945969
1,Anita Ji Ki Duniya,588000,29523.14,1378.0,lifestyle,213.945965
2,Indian Mom On Duty,682000,7407.36,268.86,lifestyle,213.945961
3,Radhika Real Vlogs,451000,33031.88,1099.22,lifestyle,213.945958
4,India Grace,505000,34764.34,1372.06,lifestyle,213.945956



🔥 Top 5 Influencers in category: TECHNOLOGY


Unnamed: 0,YouTuber Name,Subscribers,Avg Views,Avg Likes,Category,Similarity Score
0,TutorialsPoint,3620000,2455.08,37.32,technology,72.975489
1,Geekyranjit,3300000,122236.44,3430.48,technology,72.975467
2,Geekyranjit,3300000,122235.16,3430.38,technology,72.975467
3,Gupta Information Systems,1750000,97631.08,3634.18,technology,72.975432
4,Trendy Tech Review,952000,1042.8,23.32,technology,72.975138



🔥 Top 5 Influencers in category: TRAVEL


Unnamed: 0,YouTuber Name,Subscribers,Avg Views,Avg Likes,Category,Similarity Score
0,Kuga's Travel,433000,268215.0,4439.0,travel,67.997624
1,My Travel Support,449000,304292.0,2915.0,travel,67.99762
2,The New Travel,372000,79215.0,4317.0,travel,67.99762
3,harry's vlogs,297000,84717.34,3279.16,travel,67.997576
4,Travel Forever : Вячеслав и Ирина Юмабаевы,152000,110354.0,3499.0,travel,67.997569



✅ Top-5 influencers computed and saved for each category.


✅ Step 10 — Get Top 5 Influencers per Category (using GraphSAGE embeddings, no function)

✅ Step 11 — Visualize Top-5 Influencers (bar chart per category)