In [22]:
import networkx as nx
import random

# Load the dataset into a NetworkX graph object
G = nx.read_edgelist('soc-redditHyperlinks-body.tsv', delimiter='\t', create_using=nx.DiGraph(), nodetype=str, data=[('POST_ID', str), ('TIMESTAMP', str), ('LINK_SENTIMENT', str), ('PROPERTIES', str)])

# Print the first edge and its attributes
print(list(G.edges(data=True))[0])

# Split the graph into training and test sets
nodes = list(G.nodes())
num_train = int(0.8 * len(nodes))
train_nodes = random.sample(nodes, num_train)
test_nodes = list(set(nodes) - set(train_nodes))
train_graph = G.subgraph(train_nodes)
test_graph = G.subgraph(test_nodes)

# Create a new undirected graph and copy the nodes and edges from the subgraph of the original graph
undirected_train_graph = nx.Graph()
for node in train_graph.nodes():
    undirected_train_graph.add_node(node)
for edge in train_graph.edges():
    undirected_train_graph.add_edge(edge[0], edge[1])

# Add nodes 'node1' and 'node2' to the undirected graph
undirected_train_graph.add_node('node1', attribute='attr_value_1')


('SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', {'POST_ID': 'POST_ID', 'TIMESTAMP': 'TIMESTAMP', 'LINK_SENTIMENT': 'LINK_SENTIMENT', 'PROPERTIES': 'PROPERTIES'})


In [49]:
import networkx as nx
import matplotlib.pyplot as plt

# Calculate some basic network statistics
num_nodes = len(train_graph.nodes())
num_edges = len(train_graph.edges())
density = nx.density(train_graph)
degree_centrality = nx.degree_centrality(train_graph)

# Print the results
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Density: {density}")


Number of nodes: 28622
Number of edges: 85820
Density: 0.0001047619875149346


In [23]:
undirected_train_graph

<networkx.classes.graph.Graph at 0x21c7d6be280>

In [25]:

for u, v in train_graph.edges():
    common_successors = set(train_graph.successors(u)) & set(train_graph.successors(v))
    train_graph[u][v]['common_successors'] = len(common_successors)


In [27]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score

# Extract features from the train graph
train_features = []
train_labels = []
for u, v in train_graph.edges():
    num_common_neighbors = len(set(train_graph.neighbors(u)) & set(train_graph.neighbors(v)))
    similarity = 0.0  # compute similarity between node attributes
    distance = nx.shortest_path_length(train_graph, source=u, target=v)  # compute shortest path distance
    train_features.append([num_common_neighbors, similarity, distance])
    train_labels.append(train_graph[u][v]['LINK_SENTIMENT'])

# Train a logistic regression model
clf = LogisticRegression()
clf.fit(np.array(train_features), np.array(train_labels))

In [41]:
# Predict on the test graph and evaluate the model
test_features = []
test_labels = []
for u, v in test_graph.edges():
    num_common_neighbors = len(set(test_graph.neighbors(u)) & set(test_graph.neighbors(v)))
    similarity = 0.0  # compute similarity between node attributes
    distance = nx.shortest_path_length(test_graph, source=u, target=v)  # compute shortest path distance
    test_features.append([num_common_neighbors, similarity, distance])
    test_labels.append(1)

In [42]:
from sklearn.metrics import accuracy_score

In [43]:
# evaluate the model
y_pred = clf.predict(np.array(test_features))
accuracy = accuracy_score(test_labels, y_pred)
precision = precision_score(test_labels, y_pred, average='binary')
recall = recall_score(test_labels, y_pred, average='binary')
f1 = f1_score(test_labels, y_pred, average='binary')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0
