In [2]:
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from pyvis.network import Network
import gravis as gv
from sklearn.cluster import KMeans
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
from torch_geometric.nn import GraphConv


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
layer0 = pd.read_csv('/data/peng/MyProject/STAT_402_Project-main22/Data_Scraping/Data_collecting_v_2/layer/0_layer.csv')
layer1 = pd.read_csv('/data/peng/MyProject/STAT_402_Project-main22/Data_Scraping/Data_collecting_v_2/layer/1_layer.csv')

In [4]:
G1 = nx.DiGraph()

for df in [layer0, layer1]:
    for index, row in df.iterrows():
        aid = row['aid']
        node = row['node']
        title_length = len(row['title'])
        ctime_length = row['ctime']
        duration_length = row['duration']
        tid = row['tid']
        G1.add_node(aid, title_length=title_length, ctime_length=ctime_length, duration_length=duration_length, tid=tid)
        G1.add_edge(node, aid)
        
# Re-index nodes
node_map = {node: i for i, node in enumerate(G1.nodes())}
G1 = nx.relabel_nodes(G1, node_map, copy=True)


node_features = [(data.get('title_length', 0), data.get('ctime_length', 0), 
                  data.get('duration_length', 0), data.get('tid', 0)) 
                 for node, data in G1.nodes(data=True)]


# Convert edge list to re-indexed node indices
edge_index = torch.tensor([(node_map[u], node_map[v]) for u, v in G1.edges() if u in node_map and v in node_map], dtype=torch.long).t().contiguous()

# Sort node features according to the new node indices
sorted_node_features = [node_features[node_map[node]] for node in G1.nodes() if node in node_map]
x = torch.tensor(sorted_node_features, dtype=torch.float)


# Add self-loops to the adjacency matrix
edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

# Create the torch_geometric.data.Data object
data = Data(x=x, edge_index=edge_index)


In [5]:
pagerank = nx.pagerank(G1, alpha=0.85)

In [6]:
class GNN(nn.Module):
    def __init__(self, num_node_features, embedding_size):
        super(GNN, self).__init__()
        self.embedding_size = embedding_size
        self.conv1 = GraphConv(num_node_features, embedding_size)
        self.conv2 = GraphConv(embedding_size, embedding_size)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x
for node in G1.nodes():
    if node in pagerank:
        G1.nodes[node]['pagerank'] = pagerank[node]
    else:
        G1.nodes[node]['pagerank'] = 0.0


In [7]:
node_features = []
for node in G1.nodes():
    features = [0.0] * 64  
    features[0] = G1.nodes[node].get('title_length', 0)  
    features[1] = G1.nodes[node].get('ctime_length', 0)  
    features[2] = G1.nodes[node].get('duration_length', 0)  
    features[3] = G1.nodes[node].get('tid', 0)  
    features[4] = G1.nodes[node].get('pagerank', 0)  

    node_features.append(features)
x = torch.tensor(node_features, dtype=torch.float)
edge_index = torch.tensor(list(G1.edges()), dtype=torch.long).t().contiguous()
edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
data = Data(x=x, edge_index=edge_index)


In [8]:
embedding_size = data.x.size(1) 
model = GNN(data.x.size(1), embedding_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
for epoch in range(100):
    optimizer.zero_grad()
    embeddings = model(data.x, data.edge_index)
    loss = criterion(embeddings, data.x)
    loss.backward()
    optimizer.step()
    print('Epoch: {}, Loss: {:.4f}'.format(epoch, loss.item()))


Epoch: 0, Loss: 3207403236449320960.0000
Epoch: 1, Loss: 1040825057204502528.0000
Epoch: 2, Loss: 462139683640442880.0000
Epoch: 3, Loss: 497914940149989376.0000
Epoch: 4, Loss: 514877140391428096.0000
Epoch: 5, Loss: 415504310265184256.0000
Epoch: 6, Loss: 288123654804340736.0000
Epoch: 7, Loss: 191509602431401984.0000
Epoch: 8, Loss: 137393358099185664.0000
Epoch: 9, Loss: 113656989409607680.0000
Epoch: 10, Loss: 103803475439124480.0000
Epoch: 11, Loss: 99234583718920192.0000
Epoch: 12, Loss: 94527394051784704.0000
Epoch: 13, Loss: 87350288951803904.0000
Epoch: 14, Loss: 78439820950503424.0000
Epoch: 15, Loss: 69799940183818240.0000
Epoch: 16, Loss: 62132006221250560.0000
Epoch: 17, Loss: 55789537820934144.0000
Epoch: 18, Loss: 50834369627029504.0000
Epoch: 19, Loss: 47321253882626048.0000
Epoch: 20, Loss: 45080951696392192.0000
Epoch: 21, Loss: 43797740022398976.0000
Epoch: 22, Loss: 43072852622049280.0000
Epoch: 23, Loss: 42593955178610688.0000
Epoch: 24, Loss: 42197564056928256.00

In [9]:
embeddings = model(data.x, data.edge_index)
final_embeddings = embeddings.detach().numpy()

In [10]:
def shortest_path_between_tags(tag1, tag2):
    center1 = None
    center2 = None
    for node, attributes in G1.nodes(data=True):
        if attributes['tid'] == tag1:
            center1 = node
        elif attributes['tid'] == tag2:
            center2 = node
        if center1 is not None and center2 is not None:
            break     
            
    if center1 is None or center2 is None:
        raise ValueError("One or both of the tags are not found in the graph.")

    try:
        shortest_path = nx.shortest_path(G1, source=center1, target=center2)
    except nx.NetworkXNoPath:
#         raise ValueError("No shortest path found between the centers of the tags.")
        shortest_path=0
    if not shortest_path:
        return "No path exists between the tags."
  
    path_nodes = []
    for node in shortest_path:
        node_features = G1.nodes[node]
        path_nodes.append((node, node_features))

   
    return path_nodes


In [11]:
missing_tid_nodes = [node for node, data in G1.nodes(data=True) if 'tid' not in data]
print("Nodes missing 'tid' attribute:", missing_tid_nodes)
G1.remove_nodes_from(missing_tid_nodes)

Nodes missing 'tid' attribute: [1142, 1359]


In [12]:
tag_embeddings = {}

for node, embedding in zip(G1.nodes(), final_embeddings):
    tag = G1.nodes[node].get('tid', 0)  # default value is 0
    if tag not in tag_embeddings:
        tag_embeddings[tag] = []
    tag_embeddings[tag].append(embedding)

tag_centers = {}
for tag, embeddings in tag_embeddings.items():
    tag_centers[tag] = np.mean(embeddings, axis=0) 

# for tag, center in tag_centers.items():
#     print("Tag:", tag)
#     print("Center:", center)



In [13]:
def all_pairs_shortest_paths(tag_ids):
    paths = {}
    tags = list(tag_ids)  # Convert set to list
    for i in range(len(tags)):
        for j in range(i+1, len(tags)):
            path = shortest_path_between_tags(tags[i], tags[j])
            if path is not None and path != "No path exists between the tags.":
                paths[(tags[i], tags[j])] = (path, len(path) - 1)
    return paths


In [14]:
tag_ids = set(nx.get_node_attributes(G1, 'tid').values())
paths = all_pairs_shortest_paths(tag_ids)
for pair, (path, length) in paths.items():
    print(f"Path between {pair[0]} and {pair[1]}: {path}, length: {length}")


Path between 17 and 21: [(35, {'title_length': 7, 'ctime_length': 1680257661, 'duration_length': 86, 'tid': 17, 'pagerank': 2.8631603533541615e-05}), (86, {'title_length': 9, 'ctime_length': 1680322130, 'duration_length': 42, 'tid': 138, 'pagerank': 3.3577201950950124e-05}), (96, {'title_length': 11, 'ctime_length': 1647586992, 'duration_length': 72, 'tid': 138, 'pagerank': 3.0486202940069805e-05}), (131, {'title_length': 8, 'ctime_length': 1650023108, 'duration_length': 112, 'tid': 210, 'pagerank': 3.5843934558929026e-05}), (122, {'title_length': 7, 'ctime_length': 1680697949, 'duration_length': 114, 'tid': 138, 'pagerank': 3.9759199972710764e-05}), (147, {'title_length': 8, 'ctime_length': 1680663912, 'duration_length': 97, 'tid': 212, 'pagerank': 3.089833614152052e-05}), (39, {'title_length': 18, 'ctime_length': 1680635668, 'duration_length': 62, 'tid': 21, 'pagerank': 3.516679122107637e-05})], length: 6
Path between 17 and 75: [(35, {'title_length': 7, 'ctime_length': 1680257661, '

In [20]:
from torch_geometric.nn import GCNConv
import numpy as np
import pandas as pd
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing, GraphConv
from torch_geometric.utils import add_self_loops, degree
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.utils import negative_sampling


In [21]:
class GNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x

def contrastive_loss(embeddings, edge_index):
    pos_a = embeddings[edge_index[0]]
    pos_b = embeddings[edge_index[1]]
    pos_sim = F.cosine_similarity(pos_a, pos_b, dim=-1)
    neg_index = negative_sampling(edge_index, num_nodes=embeddings.size(0))
    neg_a = embeddings[neg_index[0]]
    neg_b = embeddings[neg_index[1]]
    neg_sim = F.cosine_similarity(neg_a, neg_b, dim=-1)
    pos_loss = F.binary_cross_entropy_with_logits(pos_sim, torch.ones_like(pos_sim))
    neg_loss = F.binary_cross_entropy_with_logits(neg_sim, torch.zeros_like(neg_sim))
    
    return pos_loss + neg_loss

model = GNN(data.num_features, 64)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    embeddings = model(data.x, data.edge_index)
    loss = contrastive_loss(embeddings, data.edge_index)
    loss.backward()
    optimizer.step()
    print('Epoch: {}, Loss: {:.4f}'.format(epoch, loss.item()))


Epoch: 0, Loss: 1.4053
Epoch: 1, Loss: 1.2968
Epoch: 2, Loss: 1.2162
Epoch: 3, Loss: 1.1863
Epoch: 4, Loss: 1.1854
Epoch: 5, Loss: 1.1895
Epoch: 6, Loss: 1.1930
Epoch: 7, Loss: 1.1923
Epoch: 8, Loss: 1.1903
Epoch: 9, Loss: 1.1850
Epoch: 10, Loss: 1.1800
Epoch: 11, Loss: 1.1754
Epoch: 12, Loss: 1.1690
Epoch: 13, Loss: 1.1695
Epoch: 14, Loss: 1.1695
Epoch: 15, Loss: 1.1724
Epoch: 16, Loss: 1.1698
Epoch: 17, Loss: 1.1707
Epoch: 18, Loss: 1.1720
Epoch: 19, Loss: 1.1705
Epoch: 20, Loss: 1.1679
Epoch: 21, Loss: 1.1666
Epoch: 22, Loss: 1.1683
Epoch: 23, Loss: 1.1648
Epoch: 24, Loss: 1.1674
Epoch: 25, Loss: 1.1658
Epoch: 26, Loss: 1.1680
Epoch: 27, Loss: 1.1658
Epoch: 28, Loss: 1.1650
Epoch: 29, Loss: 1.1673
Epoch: 30, Loss: 1.1664
Epoch: 31, Loss: 1.1642
Epoch: 32, Loss: 1.1648
Epoch: 33, Loss: 1.1663
Epoch: 34, Loss: 1.1655
Epoch: 35, Loss: 1.1657
Epoch: 36, Loss: 1.1663
Epoch: 37, Loss: 1.1668
Epoch: 38, Loss: 1.1639
Epoch: 39, Loss: 1.1636
Epoch: 40, Loss: 1.1655
Epoch: 41, Loss: 1.1665
Ep

In [44]:
def recommend_videos(model, data, target_tag, initial_video, num_recommendations=10):
    model.eval()
    with torch.no_grad():
        all_embeddings = model(data.x, data.edge_index)

#     initial_video = node_map[initial_video]
    target_videos = [v for v, data in G1.nodes(data=True) if data.get('tid') == target_tag]
    initial_embedding = all_embeddings[initial_video].unsqueeze(0)
    target_embeddings = all_embeddings[target_videos]
    distances = torch.norm(target_embeddings - initial_embedding, dim=1)
    recommendations = torch.argsort(distances)[:num_recommendations]
    return [target_videos[i] for i in recommendations]


In [52]:
# recommendations = recommend_videos(model, data, target_tag= 17, initial_video=311795251, num_recommendations=10)
# print(recommendations)


In [53]:
recommendations = recommend_videos(model, data, target_tag= 17, initial_video=30473, num_recommendations=10)
print(recommendations)


[30473, 30474, 26644, 1338, 4884, 499, 30490, 29275, 28196, 31005]


In [6]:
recommendations = [30473, 30474, 26644, 1338, 4884, 499, 30490, 29275, 28196, 31005]

In [7]:
aids = [node2aid[node2aid['Node']==recommendation].loc[:,['ID']].values[0][0] for recommendation in recommendations]

In [9]:
print(aids)

[908080221, 524707665, 484179696, 227245796, 312135400, 951984290, 226103666, 781145006, 676251373, 481789111]
