In [1]:
import csv
import networkx as nx
import numpy as np
import pandas as pd
# from tqdm import tqdm
from tqdm.notebook import tqdm
from feature_extraction import feature_extraction_pos
from feature_extraction import feature_extraction_neg
# from train import train_model
# from test import test_model
from graph_sample import preferential_uniform_random_node_sampling
from graph_sample import uniform_random_node_sampling

In [2]:
# Build the directed graph from train.csv.
# Each row is read as: first field = source node id, every remaining field = a
# destination node id. One edge (source -> destination) is added per destination.
G = nx.DiGraph()

# newline='' is what the csv module asks for when it is handed a file object.
with open('train.csv', 'r', newline='') as train_file:
    reader = csv.reader(train_file)
    for row in reader:
        # csv.reader yields [] for a blank line (e.g. a stray empty line at the
        # end of the file); skip it instead of failing on row[0].
        if not row:
            continue
        src = int(row[0])
        # A row that holds only a source id adds no edge (and therefore no node).
        for dest in row[1:]:
            G.add_edge(src, int(dest))

In [3]:
# Sanity check: report the size of the graph built from train.csv.
graph_node_total = G.number_of_nodes()
graph_edge_total = G.number_of_edges()
print("Number of nodes: ", graph_node_total)
print("Number of edges: ", graph_edge_total)

Number of nodes:  4867136
Number of edges:  23945602


In [4]:
import random

# Fixed seed so the same edges are drawn on every fresh-kernel run; without it
# the training subgraph (and every number derived from it) changes run to run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

all_edges = list(G.edges())  # random.sample needs a sequence, not an edge view
all_nodes = set(G.nodes())
# num_edges_to_sample = int(len(all_edges) * fraction)
num_edges_to_sample = 50000
# Draws num_edges_to_sample distinct edges; raises ValueError if the graph has fewer.
sampled_edges = random.sample(all_edges, num_edges_to_sample)
# The negative-example count is set equal to the number of sampled edges.
num_neg_examples = num_edges_to_sample

In [5]:
# Directed graph holding only the sampled edges; its nodes are the endpoints
# of those edges.
G_subgraph = nx.DiGraph()
G_subgraph.add_edges_from(sampled_edges)

subgraph_node_count = G_subgraph.number_of_nodes()
subgraph_edge_count = G_subgraph.number_of_edges()
print("Number of nodes in subgraph: ", subgraph_node_count)
print("Number of edges in subgraph: ", subgraph_edge_count)

Number of nodes in subgraph:  52261
Number of edges in subgraph:  50000


In [6]:
# num_edges_to_sample = 25000
# test_path = 'test.csv'
# G_subgraph = uniform_random_node_sampling(G, num_edges_to_sample)

In [7]:
# print("Number of nodes in the subgraph: ", G_subgraph.number_of_nodes())
# print("Number of edges in the subgraph: ", G_subgraph.number_of_edges())

In [8]:
# Save the sampled subgraph as a plain edge list (data=False: no edge attributes written).
subgraph_edgelist_path = "subgraph_v10.txt"
nx.write_edgelist(G_subgraph, subgraph_edgelist_path, data=False)

# load the subgraph
# G_subgraph = nx.read_edgelist("subgraph_v4.txt", create_using=nx.DiGraph(), nodetype=int)
# print("Number of nodes in the subgraph: ", G_subgraph.number_of_nodes())
# print("Number of edges in the subgraph: ", G_subgraph.number_of_edges())

In [9]:
# Positive examples: feature rows for the edges of the sampled subgraph.
data = feature_extraction_pos(G_subgraph, G_subgraph.edges())

# Negative examples: num_edges_to_sample rows are requested, using the
# subgraph's node set.
neg_examples = feature_extraction_neg(G_subgraph, num_edges_to_sample, G_subgraph.nodes())

# NOTE(review): these features are computed on G_subgraph, while the test cell
# passes the full graph G to test_model -- confirm the two feature sets are
# comparable.
# Append the negatives to the positives in place; the rows are shuffled later.
data += neg_examples

# Peek at the first few rows (these are positives, since they were added first).
print(data[:5])

Processing edges:   0%|          | 0/50000 [00:00<?, ?edge/s]

Generating negative examples:   0%|          | 0/50000 [00:00<?, ?example/s]

[(0, 0, 0.0, 0.0, 1518, 1), (0, 0, 0.0, 0.0, 1518, 1), (0, 0, 0.0, 0.0, 1518, 1), (0, 0, 0.0, 0.0, 1518, 1), (0, 0, 0.0, 0.0, 1518, 1)]


In [10]:
# Names for the six values in each feature row; 'Label' is last so that
# columns[:-1] gives the feature names only.
columns = ['common_successors', 'common_predecessors', 'jaccard_successors', 'jaccard_predecessors', 'preferential_attachment', 'Label']

# # column_names = [
# #     "common_successors", "common_predecessors", "jaccard_successors", "jaccard_predecessors",
# #     "preferential_attachment", "aa_predecessors", "aa_successors", "ra_predecessors", "ra_successors",
# #     "katz_src", "katz_dest", "pr_src", "pr_dest", "cc_src", "cc_dest", "closeness_src", "closeness_dest",
# #     "betweenness_src", "betweenness_dest", "avg_neighbor_degree_src", "avg_neighbor_degree_dest",
# #     "harmonic_src", "harmonic_dest", "eigenvector_src", "eigenvector_dest", "Label"
# # ]

# Build the frame and shuffle the rows (positives were appended before
# negatives). The seed is fixed so the row order of `df` and of the saved CSV
# is the same on every run; an unseeded shuffle gave a different order each time.
SHUFFLE_SEED = 42
df = (
    pd.DataFrame(data, columns=columns)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df.to_csv('pos_neg_samples_v10.csv', index=False)
df.head()

Unnamed: 0,common_successors,common_predecessors,jaccard_successors,jaccard_predecessors,preferential_attachment,Label
0,0,0,0.0,0.0,1,0
1,0,0,0.0,0.0,1,0
2,0,0,0.0,0.0,1,0
3,0,0,0.0,0.0,1,0
4,0,0,0.0,0.0,16,1


In [11]:
# df = pd.read_csv('pos_neg_samples_xgb.csv')
# df.head()

In [12]:
from sklearn.preprocessing import StandardScaler
from train import train_grid_model
from train import train_ensemble_model
# from train import train_model
from train import train_xgboost_model

# One StandardScaler instance, passed to the training helpers and later to
# test_model, so training and test go through the same scaler object.
# NOTE(review): presumably it is fitted inside the train.py helpers -- confirm.
scaler = StandardScaler()

# log_reg = train_model(df, scaler)
# best_clf = train_grid_model(df, scaler)
# ensemble_model = train_ensemble_model(df, scaler, best_clf)

In [13]:
# Train with the helpers imported from train.py (not shown here). Order matters:
# best_clf from the first call is an input to the ensemble trainer.
# The recorded run of this cell printed a validation AUC-ROC for each model.
best_clf = train_grid_model(df, scaler)
ensemble_model = train_ensemble_model(df, scaler, best_clf)

AUC-ROC of Logistic Regression on Validation Set: 0.945385771824451
AUC-ROC of Neural Network on Validation Set: 0.9451816602242162
AUC-ROC of SVM on Validation Set: 0.9452258555707075
AUC-ROC of KNN on Validation Set: 0.9234092095282027
AUC-ROC of Naive Bayes on Validation Set: 0.8911171113581531
AUC-ROC of XGBoost on Validation Set: 0.9454103120168462
AUC-ROC of Ensemble on Validation Set: 0.9453334814144944


In [14]:
# bst_model = train_xgboost_model(df, scaler)

In [15]:
# from graph_sample import preferential_uniform_random_node_sampling

# num_edges_to_sample = 25000
# test_path = 'test.csv'
# G_subgraph = preferential_uniform_random_node_sampling(G, num_edges_to_sample, test_path)

# # check how many nodes in the test set are in the subgraph
# test_df = pd.read_csv('test.csv')
# test_df.head()

# test_df['From'] = test_df['From'].astype(int)
# test_df['To'] = test_df['To'].astype(int)

# test_df['Source_in_subgraph'] = test_df['From'].apply(lambda x: int(x) in G_subgraph.nodes())
# test_df['Sink_in_subgraph'] = test_df['To'].apply(lambda x: int(x) in G_subgraph.nodes())

# print(test_df['Source_in_subgraph'].value_counts())
# print(test_df['Sink_in_subgraph'].value_counts())

# test_df.head()

In [16]:
# import pandas as pd

# test_df = pd.read_csv('test.csv')
# test_nodes = set(test_df['From']).union(set(test_df['To']))

# in_subgraph = {node: node in G_subgraph.nodes() for node in test_nodes}
# counts = {'in_subgraph': 0, 'not_in_subgraph': 0}
# for node, present in in_subgraph.items():
#     if present:
#         counts['in_subgraph'] += 1
#     else:
#         counts['not_in_subgraph'] += 1

# print("Nodes in subgraph:", counts['in_subgraph'])
# print("Nodes not in subgraph:", counts['not_in_subgraph'])

In [17]:
# NOTE(review): `test` here is meant to be the project's own test.py. The
# standard library also ships a package named `test`, so this import relies on
# the local module being found first on sys.path.
from test import test_model

test_df = pd.read_csv('test.csv')

# Feature names only: every column except the trailing 'Label'.
feature_columns = columns[:-1]

# test_model is given the full graph G together with the scaler and ensemble
# from the training cells.
test_probs, test_features_scaled = test_model(G, test_df, scaler, feature_columns, ensemble_model)

In [18]:
# test_df = pd.read_csv('test.csv')
# test_features_scaled, test_probs = test_model(G, test_df, scaler, log_model, columns[:-1])

# One row per prediction; ids count up from 1 in the order test_probs is given.
prediction_ids = range(1, len(test_probs) + 1)
output_df = pd.DataFrame({'Id': prediction_ids, 'Predictions': test_probs})
output_df.to_csv('test_predictions_v10.csv', index=False)
output_df.head()

Unnamed: 0,Id,Predictions
0,1,0.778175
1,2,0.982305
2,3,0.944894
3,4,0.847356
4,5,0.778178


In [19]:
# Count how often each distinct prediction value occurs, most frequent first.
output_df['Predictions'].value_counts()

Predictions
0.944842    349
0.611509    208
0.778175    171
0.883032     96
0.944894     62
           ... 
0.944711      1
0.976303      1
0.981897      1
0.780638      1
0.976813      1
Name: count, Length: 1075, dtype: int64