Importing the required libraries

In [1]:
import pandas as pd
import networkx as nx
import random
import csv

Split original dataset into rows of source -> destination pairs

In [None]:
# One-off preprocessing step, kept commented out for reference.
# It reads the tab-delimited 'train.txt', treats the first field of each row
# as the source node and every remaining field as a destination, collects the
# unique (source, destination) pairs in a set, and writes them one pair per
# line (space-separated) to 'individual_links.txt'.
# NOTE(review): the inner `enumerate` loop reuses `i`, overwriting the row
# counter incremented just above it; neither value is read afterwards.
# NOTE(review): output goes to 'individual_links.txt' in the working directory,
# while the graph-loading cell reads '../data/individual_links.txt' --
# presumably the file was moved by hand; confirm before re-running.
# seen = set()
# with open('train.txt','r') as lines:
#   i = 0
#   for line in csv.reader(lines,delimiter='\t'):
#     i += 1
#     og_num = line[0]
#     line.remove(og_num)
#     for i,number in enumerate(line,start=0):
#       combination = tuple([og_num,number])
#       seen.add(combination)
# print(seen)
# with open('individual_links.txt','w') as out:
#   for number in seen:
#     out.write(' '.join(str(s) for s in number) + '\n')

Generating NetworkX graph

In [2]:
# Load the edge list: one whitespace-separated "source destination" pair per
# line, with column names supplied via `names`.
# The separator is a raw string so that `\s` reaches the regex engine intact;
# the non-raw '\s+' is an invalid escape sequence and triggers a warning on
# modern Python versions.
train_csv = pd.read_csv(
    '../data/individual_links.txt',
    names=['source', 'destination'],
    sep=r'\s+',
)

# Build a directed graph with one edge per (source, destination) row.
g = nx.from_pandas_edgelist(
    train_csv,
    source='source',
    target='destination',
    create_using=nx.DiGraph(),
)

# Report the graph size as a quick sanity check on the load.
num_edges = g.number_of_edges()
num_nodes = g.number_of_nodes()
print(num_edges)
print(num_nodes)

23945602
4867136


In [7]:
# How many positive samples, and how many negative samples, each sampling
# cell below should produce.
num_test_edges = 50_000

# Random Sampling

Sampling Positive Edges

In [None]:
# Draw `num_test_edges` distinct existing edges uniformly at random.
# `g.edges()` is a set-like view, not a sequence: random.sample() deprecated
# set-like populations in Python 3.9 and rejects non-sequences with a
# TypeError from 3.11 on, so the edges are materialised into a list first.
edges_pos = random.sample(list(g.edges()), num_test_edges)

In [None]:
# Write the randomly sampled positive edges as "Source,Target,Label" rows.
# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" to "\r\n" produce a blank line after each row.
# This also matches how the negative-sample files are opened.
with open("../data/edges_pos_random_100k.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Source", "Target", "Label"])
    # Label 1 marks a positive sample (an edge taken from the graph).
    writer.writerows([edge[0], edge[1], 1] for edge in edges_pos)

Sampling Negative Edges

In [None]:
# Sample `num_test_edges` ordered node pairs that are NOT joined by an edge.
#
# The node list is built once, outside the loop: `g.nodes()` is a view rather
# than a sequence, which random.sample() rejects on Python 3.11+, and on older
# versions converting it on every draw costs time proportional to the number
# of nodes per iteration.
all_nodes = list(g.nodes())

i = 0
edges_neg = []
while i < num_test_edges:
    # Two distinct nodes; the order matters because the graph is directed.
    source, target = random.sample(all_nodes, 2)
    # has_edge() simply returns a bool for nodes taken from the graph, so no
    # exception handling is needed; a blanket `except` here would only turn
    # a persistent error into an endless print-and-retry loop.
    if not g.has_edge(source, target):
        edges_neg.append([source, target])
        i = i + 1

In [None]:
# Persist the randomly sampled negative pairs; every row carries label 0.
with open("../data/edges_neg_random_100k.csv","w",newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Source","Target", "Label"])
    writer.writerows([pair[0], pair[1], 0] for pair in edges_neg)

# Strategic Sampling

Sorting the nodes by degree centrality (highest first) and keeping the top N as the sampling pool

In [4]:
# Degree centrality of every node, as a {node: centrality} mapping.
node_degree_dict = nx.degree_centrality(g)

# Rank the (node, centrality) pairs from most to least central.
sorted_node_degree_list = list(node_degree_dict.items())
sorted_node_degree_list.sort(key=lambda pair: pair[1], reverse=True)

# Keep only the N highest-ranked pairs as the pool for strategic sampling.
N = 1000000
sample_sorted_node_degree_list = sorted_node_degree_list[:N]

Sampling Positive Edges

In [9]:
# Sample `num_test_edges` existing edges whose source node comes from the
# high-centrality pool (`sample_sorted_node_degree_list` holds
# (node, centrality) pairs).
#
# Only nodes with at least one outgoing edge can serve as a source. Filtering
# them up front replaces a blanket `except Exception: pass` that silently
# retried on empty neighbour lists (and on any other error), and it lets us
# fail loudly instead of looping forever when no pool node has a successor.
# The resulting distribution is unchanged: uniform over pool nodes that have
# successors, then uniform over that node's successors.
candidate_sources = [
    node
    for node, _centrality in sample_sorted_node_degree_list
    if g.out_degree(node) > 0
]
if num_test_edges > 0 and not candidate_sources:
    raise ValueError("no node in the sampling pool has an outgoing edge")

i = 0
edges_pos = []
while i < num_test_edges:
    node = random.choice(candidate_sources)
    # For a DiGraph, neighbors() yields the node's successors.
    random_neighbor = random.choice(list(g.neighbors(node)))

    edges_pos.append([node, random_neighbor])
    i = i + 1

In [10]:
# Write the strategically sampled positive edges as "Source,Target,Label" rows.
# newline="" lets the csv module control line endings itself; without it,
# platforms that translate "\n" to "\r\n" produce a blank line after each row.
# This also matches how the negative-sample files are opened.
with open("../data/edges_pos_strategic_sampling_100k.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Source", "Target", "Label"])
    # Label 1 marks a positive sample (an edge taken from the graph).
    writer.writerows([edge[0], edge[1], 1] for edge in edges_pos)

Sampling Negative Edges

In [11]:
# Sample `num_test_edges` ordered pairs of high-centrality nodes that are NOT
# joined by an edge. `sample_sorted_node_degree_list` holds
# (node, centrality) pairs, so only the node part of each pick is used.
i = 0
edges_neg = []
while i < num_test_edges:
    # Two distinct pool entries; the order matters because the graph is directed.
    (source, _), (target, _) = random.sample(sample_sorted_node_degree_list, 2)

    # has_edge() simply returns a bool for nodes taken from the graph, so no
    # exception handling is needed; a blanket `except` here would only turn
    # a persistent error into an endless print-and-retry loop.
    if not g.has_edge(source, target):
        edges_neg.append([source, target])
        i = i + 1

In [12]:
# Persist the strategically sampled negative pairs; every row carries label 0.
with open("../data/edges_neg_strategic_sampling_100k.csv","w",newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Source","Target", "Label"])
    writer.writerows([pair[0], pair[1], 0] for pair in edges_neg)