## Installing Required Packages

In [None]:
# !pip list -v | grep torch
# !pip uninstall torch torchvision torchaudio
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3


## Importing Required Libraries and Packages

In [None]:
import networkx as nx
import requests
import zipfile
import io
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import json
import torch
import torch_geometric
from torch_geometric.data import Data
from sklearn import metrics

# Seed PyTorch's random number generator.
torch.manual_seed(596)

# The dataset is fetched as a single zip archive and read entirely in memory;
# nothing is written to disk.
url = "https://snap.stanford.edu/data/facebook_large.zip"
response = requests.get(url, timeout=60)
# Fail here with a clear HTTP error instead of later with an opaque BadZipFile.
response.raise_for_status()

with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:

    with zip_ref.open('facebook_large/musae_facebook_edges.csv') as edges_file:
        # The first line of the CSV is the column header (id_1,id_2).
        # read_edgelist has no notion of a header and would add it to the graph
        # as an edge between two bogus nodes, so consume that line first.
        edges_file.readline()
        # No nodetype is given, so node labels stay strings.
        G = nx.read_edgelist(edges_file, delimiter=',')

    with zip_ref.open('facebook_large/musae_facebook_features.json') as features_file:
        # Mapping of node id (as a string key) to the list of feature ids of that node.
        features_data = json.load(features_file)

    with zip_ref.open('facebook_large/musae_facebook_target.csv') as target_file:
        # One row per node, including the page_type column used as the label.
        target_df = pd.read_csv(target_file)

    #contents of the zip file
    print(zip_ref.namelist())

['facebook_large/', 'facebook_large/musae_facebook_edges.csv', 'facebook_large/musae_facebook_features.json', 'facebook_large/musae_facebook_target.csv', 'facebook_large/citing.txt', 'facebook_large/README.txt']


## ETL

### Compiling Information in a DataFrame

In [None]:
def get_features(id):
    """Return the entry stored in features_data for a node id.

    The id is converted to a string before the lookup; an empty dict is
    returned when features_data has no entry for it.
    """
    key = str(id)
    return features_data.get(key, {})

target_df['features'] = target_df['id'].apply(get_features)

def get_edge_values(node):
    """Return G's adjacency mapping for `node`, or an empty dict if `node` is not in G."""
    return G[node] if node in G else {}

# Graph nodes are looked up by the string form of the id; the neighbours found
# are converted to integers.
target_df['edge_values'] = target_df['id'].apply(
    lambda x: [int(neighbour) for neighbour in get_edge_values(str(x))]
)

In [None]:
# Preview the first rows, including the features and edge_values columns added above.
target_df.head()

Unnamed: 0,id,facebook_id,page_name,page_type,features,edge_values
0,0,145647315578475,The Voice of China 中国好声音,tvshow,"[3133, 3825, 236, 874, 1072, 143, 1078, 901]",[18427]
1,1,191483281412,U.S. Consulate General Mumbai,government,"[3399, 597, 979, 2014]","[21708, 22208, 22171, 6829, 16590, 20135, 8894..."
2,2,144761358898518,ESET,company,"[3383, 3832, 2035, 765, 3972, 3364, 663, 2163,...","[9048, 6353, 2629, 11537, 13205, 22304, 17728,..."
3,3,568700043198473,Consulate General of Switzerland in Montreal,government,"[2710, 1960, 1940, 4514, 4339, 761, 2263, 1340]","[16742, 293, 5826, 3479, 19753, 17346, 10945, ..."
4,4,1408935539376139,Mark Bailey MP - Labor for Miller,politician,"[2873, 4518, 4535, 1602, 3500, 4457, 1910]","[13645, 20876, 11446, 16203, 2830, 2004, 20624..."


### Converting to Tensors to Load Data in the Model

In [None]:
# Collect every distinct feature id that occurs in any node's feature list.
unique_values_set = {
    feature_id
    for values_list in features_data.values()
    for feature_id in values_list
}

# The number of distinct ids is used as the width of the feature matrix.
num_features = len(unique_values_set)

print("Total number of features = ", num_features)

Total number of features =  4714


In [None]:
# Step 1: Build the node feature matrix and the edge index.
# Rows of `x` follow the row order of target_df, while edge_values holds node ids,
# so the two only line up when the ids are exactly 0..N-1 in row order.
assert (target_df['id'].to_numpy() == np.arange(len(target_df))).all(), \
    "node ids must equal their row position in target_df"

# Coordinates of the non-zero entries of the multi-hot feature matrix.
feature_rows, feature_cols = [], []
edge_index = [[], []]

for row_pos, (features, edges) in enumerate(zip(target_df['features'], target_df['edge_values'])):
    # Only feature ids inside [0, num_features) have a column in the matrix.
    active = [f for f in features if 0 <= f < num_features]
    feature_rows.extend([row_pos] * len(active))
    feature_cols.extend(active)

    # One (source, target) pair per neighbour of this node.
    edge_index[0].extend([row_pos] * len(edges))
    edge_index[1].extend(edges)

# Fill the dense matrix with a single indexed assignment instead of materialising
# a Python list of num_features ints for every node.
x = torch.zeros((len(target_df), num_features), dtype=torch.float)
x[torch.tensor(feature_rows, dtype=torch.long),
  torch.tensor(feature_cols, dtype=torch.long)] = 1.0
edge_index = torch.tensor(edge_index, dtype=torch.long)

# Step 2: Convert labels to tensor (integer category codes of page_type)
labels = torch.tensor(target_df['page_type'].astype('category').cat.codes.values, dtype=torch.long)

# Create a PyTorch Geometric Data object
data = Data(x=x, edge_index=edge_index)

# Assign labels to the data object
data.y = labels

# Print the shapes to verify
print("Node features shape:", data.x.shape)
print("Edge index shape:", data.edge_index.shape)
print("Labels shape:", data.y.shape)

Node features shape: torch.Size([22470, 4714])
Edge index shape: torch.Size([2, 341825])
Labels shape: torch.Size([22470])


### Generating Train and Test Masks for Modelling

In [None]:
# Randomly split the nodes of `data` into train and test sets, stored as
# boolean masks on the Data object.
SPLIT_SEED = 596

# Number of nodes in the graph
num_nodes = data.num_nodes

# Define the proportion of nodes to be used for training
train_ratio = 0.8

# A dedicated, seeded generator makes the split identical on every run;
# torch.manual_seed does not affect NumPy's random state.
rng = np.random.default_rng(SPLIT_SEED)
indices = rng.permutation(num_nodes)
split_at = int(train_ratio * num_nodes)
train_indices = indices[:split_at]
test_indices = indices[split_at:]

# Initialize train_mask and test_mask tensors
data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)

# Assign True to the nodes in the train_mask and test_mask tensors based on the indices
data.train_mask[train_indices] = True
data.test_mask[test_indices] = True

## EDA

In [None]:
def _count_bar(frame, x_col, title):
    """Build a bar chart of the 'Count' column against `x_col`, with the counts as bar labels."""
    return px.bar(frame, x=x_col, y='Count', title=title, text='Count',
                  color_discrete_sequence=["#ED625F"])

# Degree distribution: number of nodes per degree value, ordered by degree.
degree_sequence = sorted((degree for _, degree in G.degree()), reverse=True)
degree_counts = (
    pd.Series(degree_sequence)
    .value_counts()
    .sort_index()
    .reset_index()
)
degree_counts.columns = ['Degree', 'Count']
fig_degree = _count_bar(degree_counts[:100], 'Degree', "Degree Distribution (Top 100)")

# Class distribution: number of pages per page_type.
class_distribution = target_df['page_type'].value_counts().reset_index()
class_distribution.columns = ['Class', 'Count']
fig_class = _count_bar(class_distribution, 'Class', "Class Distribution")

# Feature distribution: how often each feature id occurs across all nodes.
all_features = []
for features_list in features_data.values():
    all_features.extend(features_list)

feature_counts = pd.Series(all_features).value_counts().reset_index()
feature_counts.columns = ['Feature', 'Count']
feature_counts = feature_counts.sort_values(by='Count', ascending=False)
# Cast the ids to strings before plotting.
feature_counts['Feature'] = feature_counts['Feature'].astype('string')

fig_features_top = _count_bar(feature_counts[:20], 'Feature', "Features Distribution (Top 20)")
fig_features_bottom = _count_bar(feature_counts[-20:], 'Feature', "Features Distribution (Bottom 20)")

# Show the plots
for figure in (fig_degree, fig_class, fig_features_top, fig_features_bottom):
    figure.show()

In [None]:
# Basic size statistics of the graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
degree_total = sum(degree for _, degree in G.degree())

print("Number of nodes:", node_count)
print("Number of edges:", edge_count)
print("Average degree:", degree_total / node_count)

# Clustering coefficient
avg_clustering = nx.average_clustering(G)
print("Average Clustering Coefficient:", avg_clustering)

# Network density
graph_density = nx.density(G)
print("Network Density:", graph_density)

# Centrality measures (disabled)
# degree_centrality = nx.degree_centrality(G)
# print("Degree centrality:", degree_centrality)
# betweenness_centrality = nx.betweenness_centrality(G)
# print("Betweenness centrality:", betweenness_centrality)
# eigenvector_centrality = nx.eigenvector_centrality(G)
# print("Eigenvector centrality:", eigenvector_centrality)

Number of nodes: 22472
Number of edges: 171003
Average degree: 15.219206123175507
Average Clustering Coefficient: 0.3597063658547231
Network Density: 0.0006772821024064575


### Considering a Subset of the Graph to Visualize

In [None]:
import random

# Define the size of the subset
subset_size = 100  # Adjust as needed

# random.sample needs a sequence: passing the NodeView directly is deprecated since
# Python 3.9 and raises TypeError from Python 3.11 on, so the nodes are listed first.
# Capping at the graph size keeps the call valid for graphs smaller than subset_size,
# and the seeded generator makes the same subset come out on every run.
all_nodes = list(G.nodes())
subset_nodes = random.Random(596).sample(all_nodes, min(subset_size, len(all_nodes)))

# Create a subgraph containing only the subset of nodes
G_subset = G.subgraph(subset_nodes)

# Get the adjacency matrix for the subset as a plain 2-D ndarray
# (todense() would return the legacy np.matrix type).
adj_matrix_subset = nx.adjacency_matrix(G_subset).toarray()

# Plot adjacency matrix using Plotly heatmap
fig_adj_matrix_subset = go.Figure(data=go.Heatmap(z=adj_matrix_subset,
                                                  colorscale='Viridis'))

fig_adj_matrix_subset.update_layout(title='Adjacency Matrix (Subset)',
                                    xaxis_title='Nodes',
                                    yaxis_title='Nodes')

fig_adj_matrix_subset.show()



Sampling from a set deprecated
since Python 3.9 and will be removed in a subsequent version.



In [None]:
# Map every node id to its page type; the mapping is used to colour the nodes
# when the graph is drawn.
# NOTE: this rebinds `labels`, which earlier held the label tensor. The tensor
# remains available as `data.y`.
labels = {}
for node_id, node_label in zip(target_df['id'], target_df['page_type']):
    # Graph nodes are strings (they were parsed from the edge list), so the id is
    # converted before it is added. Adding the raw integer would create a second,
    # isolated copy of every node instead of matching the existing one.
    G.add_node(str(node_id))
    labels[node_id] = node_label

In [None]:
# Extract unique labels. Sorting fixes the label -> colour assignment, which would
# otherwise follow set iteration order and can differ between kernel sessions.
unique_labels = sorted(set(labels.values()))

# Assign a unique color to each label
label_colors = {label: plt.cm.tab10(i) for i, label in enumerate(unique_labels)}

# Create a figure and axis object
fig, ax = plt.subplots(figsize=(10, 10))

# Drop the two non-numeric nodes (presumably created from the edge-list header row);
# remove_nodes_from silently ignores nodes that are not in the graph.
nodes_to_remove = ['id_1', 'id_2']
G.remove_nodes_from(nodes_to_remove)
# Integer form of each node, in graph order, for the lookup in `labels`.
G_nodes = [int(x) for x in list(G.nodes())]

# Draw the graph
# pos = nx.spring_layout(G_subset)  # You can use any layout algorithm you prefer
nx.draw(G, ax=ax, with_labels=False, node_color=[label_colors[labels[node]] for node in G_nodes], alpha=0.4, node_size=2)

# Empty scatter series act as legend handles so the colours can be read off the plot.
for label, colour in label_colors.items():
    ax.scatter([], [], color=colour, label=label, s=20)
ax.legend(title="page_type", loc="upper right")

plt.show()

## ML Models

In [None]:
# Rebind the two scratch names used while building the tensors to empty lists.
node_features, edge_index = [], [[], []]

# Use the GPU when one is available, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Move data to device
data = data.to(device)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Convert PyTorch tensors to numpy arrays
X = data.x.cpu().numpy()
y = data.y.cpu().numpy()
train_mask = data.train_mask.cpu().numpy()
test_mask = data.test_mask.cpu().numpy()

# Split data into training and testing sets, using the same masks as the graph models
X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]

# Standardize features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define classifiers
classifiers = {
    # With the default iteration limit lbfgs stops before converging on this data
    # (ConvergenceWarning), so the limit is raised.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC()
}

# Train and evaluate each classifier on the held-out test rows
results = {}
for name, clf in classifiers.items():
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

# Print results
for name, accuracy in results.items():
    print(f'{name}: Accuracy = {accuracy:.4f}')


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



## GCN Model

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
import ast
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        num_node_features: Width of the input node feature vectors.
        num_classes: Number of output classes.
        hidden_channels: Width of the hidden layer between the two convolutions.
        dropout: Dropout probability applied to the hidden layer during training.
            0.5 is the default of F.dropout.
    """

    def __init__(self, num_node_features, num_classes, hidden_channels=16, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        `data` must expose `x` (node features) and `edge_index`.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Initialize model.
# Both sizes are read from `data` rather than from the notebook-level `x` and `labels`:
# `labels` is rebound to a plain dict in the visualisation section, so `labels.unique()`
# raises AttributeError when the notebook is run from top to bottom.
num_classes = int(data.y.unique().numel())
model = GCN(num_node_features=data.num_features, num_classes=num_classes).to(device)

# Initialize optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [None]:
# Train the GCN for 200 full-batch epochs.
# Put the model in training mode explicitly so dropout is active even if this cell is
# re-run after the model was switched to eval mode.
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    # The loss is computed on the training nodes only.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [None]:
# Evaluation on the held-out test nodes.
# Switch to eval mode so dropout is disabled while predicting, and skip gradient
# tracking since no backward pass follows.
model.eval()
with torch.no_grad():
    pred = model(data).argmax(dim=1)

# scikit-learn cannot consume CUDA tensors, so copy the test slices to host memory first.
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()
print(metrics.classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9442    0.9393    0.9418      1334
           1     0.9282    0.9589    0.9433      1362
           2     0.9503    0.9572    0.9537      1098
           3     0.9388    0.8771    0.9069       700

    accuracy                         0.9399      4494
   macro avg     0.9404    0.9331    0.9364      4494
weighted avg     0.9400    0.9399    0.9397      4494



## GraphSAGE Model

In [None]:
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# Define the GraphSage model
class GraphSage(torch.nn.Module):
    """Two-layer SAGEConv network for node classification.

    Args:
        num_node_features: Width of the input node feature vectors.
        num_classes: Number of output classes.
        hidden_channels: Width of the hidden layer between the two convolutions.
        dropout: Dropout probability applied to the hidden layer during training.
            0.5 is the default of F.dropout.
    """

    def __init__(self, num_node_features, num_classes, hidden_channels=16, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = SAGEConv(num_node_features, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        `data` must expose `x` (node features) and `edge_index`.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

# Initialize the model.
# The class count is taken from data.y: the notebook-level `labels` is rebound to a
# plain dict in the visualisation section, so `labels.unique()` raises AttributeError
# when the notebook is run from top to bottom.
num_classes = int(data.y.unique().numel())
GraphSageModel = GraphSage(num_node_features=data.num_features, num_classes=num_classes)

# Define the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model and data to the device
GraphSageModel = GraphSageModel.to(device)
data = data.to(device)

# Initialize the optimizer
optimizer = torch.optim.Adam(GraphSageModel.parameters(), lr=0.01, weight_decay=5e-4)

In [None]:
# Full-batch training of the GraphSAGE model for 200 epochs
for epoch in range(200):
    GraphSageModel.train()
    optimizer.zero_grad()
    log_probs = GraphSageModel(data)
    # Only the training nodes contribute to the loss
    train_nodes = data.train_mask
    loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()

In [None]:
# Evaluation on the held-out test nodes, with dropout disabled and no gradient tracking.
GraphSageModel.eval()
with torch.no_grad():
    pred = GraphSageModel(data).argmax(dim=1)

# scikit-learn cannot consume CUDA tensors, so copy the test slices to host memory first.
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()
print(metrics.classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9644    0.9340    0.9490      1334
           1     0.9347    0.9670    0.9506      1362
           2     0.9584    0.9663    0.9624      1098
           3     0.9417    0.9229    0.9322       700

    accuracy                         0.9502      4494
   macro avg     0.9498    0.9475    0.9485      4494
weighted avg     0.9504    0.9502    0.9501      4494



In [None]:
# Helper that reads node embeddings off a model's first convolution layer
def extract_node_embeddings(model, data):
    """Return the output of `model.conv1` applied to the graph in `data`.

    The model is switched to eval mode and the layer is run without gradient
    tracking. `data` must expose `x` and `edge_index`.
    """
    model.eval()
    features, connectivity = data.x, data.edge_index
    with torch.no_grad():
        return model.conv1(features, connectivity)

# Run the trained GraphSAGE model's first convolution over the full graph; the
# resulting per-node embeddings are the input features of the GAT model below.
node_embeddings = extract_node_embeddings(GraphSageModel, data)

In [None]:
from torch_geometric.nn import GATConv

# Graph attention network built from two GATConv layers
class GAT(torch.nn.Module):
    """Two-layer GATConv network for node classification.

    The first layer uses `num_heads` heads of `hidden_units` channels each, so the
    second layer takes `hidden_units * num_heads` inputs. The second layer uses a
    single head with concat=False and outputs `num_classes` channels.
    """

    def __init__(self, num_features, num_classes, num_heads=8, hidden_units=8):
        super().__init__()
        attention_dropout = 0.6
        concat_width = hidden_units * num_heads
        self.conv1 = GATConv(num_features, hidden_units, heads=num_heads,
                             dropout=attention_dropout)
        self.conv2 = GATConv(concat_width, num_classes, heads=1, concat=False,
                             dropout=attention_dropout)

    def forward(self, x, edge_index):
        """Return per-node log-probabilities for features `x` and edges `edge_index`."""
        hidden = F.elu(self.conv1(x, edge_index))
        # Active only while the module is in training mode
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edge_index)
        return F.log_softmax(logits, dim=1)

# The GAT is fed the GraphSAGE embeddings, so its input width is the embedding width.
# A dedicated name is used so the notebook-level `num_features` (the width used when
# building data.x) is not overwritten.
embedding_dim = node_embeddings.size(1)

# Define the device before it is first used in this cell
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize GAT model.
# The class count is taken from data.y: the notebook-level `labels` is rebound to a
# plain dict in the visualisation section, so `labels.unique()` raises AttributeError
# when the notebook is run from top to bottom.
num_classes = int(data.y.unique().numel())
GATmodel = GAT(num_features=embedding_dim, num_classes=num_classes).to(device)

# Move the embeddings and data to the device
node_embeddings = node_embeddings.to(device)
data = data.to(device)

# Define optimizer
optimizer = torch.optim.Adam(GATmodel.parameters(), lr=0.005, weight_decay=5e-4)

In [None]:
# Full-batch training of the GAT on the fixed node embeddings for 200 epochs
GATmodel.train()
for epoch in range(200):
    optimizer.zero_grad()
    log_probs = GATmodel(node_embeddings, data.edge_index)
    # Only the training nodes contribute to the loss
    train_nodes = data.train_mask
    loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])
    loss.backward()
    optimizer.step()

In [None]:
# Evaluation on the held-out test nodes, with dropout disabled and no gradient tracking.
GATmodel.eval()
with torch.no_grad():
    pred = GATmodel(node_embeddings, data.edge_index).argmax(dim=1)

# scikit-learn cannot consume CUDA tensors, so copy the test slices to host memory first.
y_true = data.y[data.test_mask].cpu().numpy()
y_pred = pred[data.test_mask].cpu().numpy()
print(metrics.classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9542    0.9363    0.9451      1334
           1     0.9274    0.9567    0.9418      1362
           2     0.9465    0.9499    0.9482      1098
           3     0.9395    0.9100    0.9245       700

    accuracy                         0.9417      4494
   macro avg     0.9419    0.9382    0.9399      4494
weighted avg     0.9419    0.9417    0.9417      4494

