In [1]:
import pandas as pd
import numpy as np
import csv
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
import networkx as nx
import requests
import csv
from io import StringIO

def _load_adjacency_graph(url, graph):
    """Download an adjacency-list CSV and add its edges to ``graph``.

    Each non-empty row is read as ``source,neighbor_1,neighbor_2,...``; every
    field is converted with ``int`` and one edge ``(source, neighbor)`` is
    added per neighbor.  A row that holds only a source adds nothing.

    Parameters
    ----------
    url : str
        Location of the CSV file; fetched with ``requests.get``.
    graph : networkx graph
        Empty ``nx.Graph`` / ``nx.DiGraph`` instance to populate in place.

    Returns
    -------
    The same ``graph`` object, populated.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If a field cannot be converted to ``int``.
    """
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of trying to parse an error page
    # as CSV (which would surface as a confusing int() ValueError).
    response.raise_for_status()

    csv_reader = csv.reader(StringIO(response.text))
    for count, row_data in enumerate(csv_reader, start=1):
        # Progress indicator: print the running row number every 1000 rows.
        if count % 1000 == 0:
            print(count)
        if not row_data:
            # csv.reader yields [] for blank lines; there is no source node
            # to read, so skip the row rather than raise IndexError.
            continue
        node = int(row_data[0])
        for neighbor in row_data[1:]:
            graph.add_edge(node, int(neighbor))

    return graph


def get_directed_graph(url):
    """Build a ``nx.DiGraph`` from the adjacency-list CSV at ``url``.

    Every row ``source,n1,n2,...`` contributes the directed edges
    ``source -> n1``, ``source -> n2``, ...  See ``_load_adjacency_graph``
    for the parsing rules and the exceptions raised.
    """
    return _load_adjacency_graph(url, nx.DiGraph())


def get_undirected_graph(url):
    """Build a ``nx.Graph`` from the adjacency-list CSV at ``url``.

    Same parsing as ``get_directed_graph``, but edges are undirected.
    """
    return _load_adjacency_graph(url, nx.Graph())

# Example usage: download the training adjacency list, build the undirected
# graph from it, and report its size. `undirected_graph` is reused by later
# cells (it is aliased to `UG` before feature generation).
url = 'https://media.githubusercontent.com/media/NeerajSura/PML/main/LinkPrediction/train.csv'
undirected_graph = get_undirected_graph(url)
print("Number of nodes:", undirected_graph.number_of_nodes())
print("Number of edges:", undirected_graph.number_of_edges())



1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
Number of nodes: 4867136
Number of edges: 23415200


In [3]:
# Colab-only step: prompt for files to upload into the runtime.
# NOTE(review): presumably this is how positive_samples.npy,
# negative_samples.npy and test.csv (loaded by the next cell) are supplied;
# the recorded output `{}` shows no file was uploaded in this run — confirm.
from google.colab import files
files.upload()

{}

In [4]:
# Load the pre-computed sample arrays from the working directory and convert
# them to plain Python lists; each entry is later passed to NetworkX as one
# node pair.
# NOTE(review): presumably "positive" pairs are connected in the graph and
# "negative" pairs are not — confirm against the step that produced the files.
positive_samples=np.load("positive_samples.npy").tolist()
negative_samples=np.load("negative_samples.npy").tolist()

def get_test_samples(path='test.csv'):
    """Read the node pairs to score from a CSV file.

    Parameters
    ----------
    path : str or path-like, default 'test.csv'
        CSV file containing at least the columns ``From`` and ``To``.
        The default keeps the original behaviour.  Note that
        ``write_test_to_csv`` later writes feature columns to a file that is
        also named ``test.csv``; pass an explicit path to the original pair
        file when re-running after that step.

    Returns
    -------
    list of list
        One ``[From, To]`` pair per CSV row, in file order.

    Raises
    ------
    KeyError
        If the ``From`` or ``To`` column is missing from the file.
    """
    test_data = pd.read_csv(path)
    columns_to_convert = ['From', 'To']
    missing = [name for name in columns_to_convert if name not in test_data.columns]
    if missing:
        # Same exception type pandas would raise, but with an actionable message.
        raise KeyError(
            "missing column(s) " + str(missing) + " in " + str(path)
            + "; found " + str(list(test_data.columns))
        )
    return test_data[columns_to_convert].values.tolist()
# Materialise the pairs to score, and bind `UG`, the short alias the
# feature-generation functions read the graph from.
test_samples = get_test_samples()
UG = undirected_graph

In [8]:
def _link_prediction_scores(sample):
    """Return ``[RA, JC, AA, PA]`` for one node pair, scored on the global ``UG``.

    The four values come, in order, from NetworkX's
    ``resource_allocation_index``, ``jaccard_coefficient``,
    ``adamic_adar_index`` and ``preferential_attachment``.  Any exception
    raised by a scorer propagates to the caller, so a row is either complete
    or not produced at all.
    """
    ebunch = [sample]
    scorers = (
        nx.resource_allocation_index,
        nx.jaccard_coefficient,
        nx.adamic_adar_index,
        nx.preferential_attachment,
    )
    scores = []
    for scorer in scorers:
        for _u, _v, p in scorer(UG, ebunch):
            scores.append(p)
    return scores


def _feature_rows(samples, label=None, fallback=None):
    """Build one feature row per sample, printing progress every 1000 samples.

    Parameters
    ----------
    samples : iterable
        Node pairs to score.
    label : int or None
        When given, appended to every row as the last column.
    fallback : list or None
        Scores to use for a sample whose scoring raised.  When ``None`` the
        failing sample is skipped instead, so every returned row has the same
        number of columns.
    """
    rows = []
    for count, sample in enumerate(samples, start=1):
        # Progress indicator: prints 0, 1000, 2000, ... (samples done so far).
        if (count - 1) % 1000 == 0:
            print(count - 1)
        try:
            row = _link_prediction_scores(sample)
        except Exception as e:
            # Best-effort: report the 1-based position of the failing sample
            # and keep going rather than abort a long-running computation.
            print("one error at: " + str(count))
            print(e)
            if fallback is None:
                continue
            row = list(fallback)
        if label is not None:
            row.append(label)
        rows.append(row)
    return rows


def generate_positive_features():
    """Return ``[RA, JC, AA, PA, 1]`` rows for every pair in ``positive_samples``.

    A pair whose scoring fails is reported and left out, so the result never
    contains a partial row (previously an incomplete row was appended, which
    produced ragged lines in the training CSV).
    """
    print("Generating positive features......")
    features = _feature_rows(positive_samples, label=1)
    print("positive features: "+str(len(features)))
    return features


def generate_negative_features():
    """Return ``[RA, JC, AA, PA, 0]`` rows for every pair in ``negative_samples``.

    A pair whose scoring fails is reported and left out, so the result never
    contains a partial row.
    """
    print("Generating negative features......")
    features = _feature_rows(negative_samples, label=0)
    print("negative features: "+str(len(features)))
    return features


def generate_test_features():
    """Return ``[RA, JC, AA, PA]`` rows for every pair in ``test_samples``.

    The result always has exactly one row per test pair, in input order,
    because prediction Ids are later assigned by row position.  A pair whose
    scoring fails is reported and given all-zero scores instead of a partial
    or empty row.  (The error report previously called ``len()`` on the
    integer counter, which raised ``TypeError`` from inside the handler.)
    """
    print("Generating test features......")
    return _feature_rows(test_samples, fallback=[0, 0, 0, 0])

# Combine the labelled feature rows of both classes into one training list.
def generate_traning_data():
    """Return the positive feature rows followed by the negative feature rows.

    Positive features are generated first, then negative ones.  The rows are
    not shuffled here, so the result is ordered by class.
    """
    labelled_rows = []
    labelled_rows.extend(generate_positive_features())
    labelled_rows.extend(generate_negative_features())
    return labelled_rows


# Compute the labelled feature rows for every positive and negative sample.
traning_data = generate_traning_data()

def write_train_to_csv(traning_data):
    """Write the labelled feature rows to ``train.csv`` in the working directory.

    The file starts with the header ``RA,JC,AA,PA,Label`` followed by one line
    per row of ``traning_data``.  An existing ``train.csv`` is overwritten.
    """
    header = ["RA","JC","AA","PA","Label"]
    with open("train.csv","w",newline="") as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for row in traning_data:
            csv_out.writerow(row)

# Persist the training rows; the modelling cells reload them from train.csv.
write_train_to_csv(traning_data)

def write_test_to_csv(test_data):
    """Write the unlabeled feature rows to ``test.csv`` in the working directory.

    The file starts with the header ``RA,JC,AA,PA`` followed by one line per
    row of ``test_data``.

    NOTE: ``test.csv`` is also the file ``get_test_samples`` reads the
    ``From``/``To`` pairs from, so calling this replaces that input file.
    """
    header = ["RA","JC","AA","PA"]
    with open("test.csv","w",newline="") as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for row in test_data:
            csv_out.writerow(row)

# Score the unlabeled test pairs and persist their features.
# NOTE: this overwrites test.csv, the file get_test_samples() read the
# From/To pairs from, so that earlier cell cannot be re-run afterwards
# without restoring the original file.
test_data=generate_test_features()
write_test_to_csv(test_data)



Generating positive features......
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
positive features: 9783
Generating negative features......
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
negative features: 9995
Generating test features......
0
1000


In [10]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Reload the training table from train.csv and split it by column position:
# the first FEATURE_SIZE columns are the features, the next one is the label.
FEATURE_SIZE = 4
dataset = pd.read_csv('train.csv')

X = dataset.iloc[:, :FEATURE_SIZE].values
y = dataset.iloc[:, FEATURE_SIZE].values

In [12]:
dataset

Unnamed: 0,RA,JC,AA,PA,Label
0,0.011652,0.028571,0.453945,1760,1
1,0.000000,0.000000,0.000000,78,1
2,2.022425,0.126685,22.050308,129355,1
3,0.001422,0.005747,0.152547,5400,1
4,0.000000,0.000000,0.000000,384,1
...,...,...,...,...,...
19773,0.000000,0.000000,0.000000,77,0
19774,0.000000,0.000000,0.000000,45,0
19775,0.000000,0.000000,0.000000,614,0
19776,0.000002,0.000912,0.077517,1096,0


In [14]:
X.shape

(19778, 4)

In [16]:
y.shape

(19778,)

In [17]:
# Hold out 20% of the rows for evaluation; random_state pins the split so the
# reported metrics are reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
# Feature scaling: the scaler is fitted on the training rows only and then
# applied unchanged to the held-out rows. `sc` is reused later to scale the
# unlabeled test features.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [19]:
# Fit an L2-penalised logistic regression on the scaled training rows.
# NOTE(review): in scikit-learn, C is the inverse of the regularisation
# strength, so C=1e4 makes the penalty very weak — confirm this is intended.
from sklearn import linear_model
lr=linear_model.LogisticRegression(C=1e4,penalty='l2')
lr.fit(x_train,y_train)

In [20]:
# Evaluate on the held-out split: accuracy via lr.score, ROC AUC via the
# predicted probabilities.
from sklearn.metrics import roc_auc_score
pre=lr.predict_proba(x_test)
# Keep column 1 of each probability row. predict_proba orders its columns as
# lr.classes_, so with the 0/1 labels used here that is P(label == 1).
y_pre=[p[1] for p in pre]
acc=lr.score(x_test,y_test)
print(acc)
auc=roc_auc_score(y_test,y_pre)
print(auc)

0.8427704752275025
0.8774667182969174


In [21]:
# Score the unlabeled pairs: reload their features from test.csv (the file as
# rewritten by write_test_to_csv, with RA/JC/AA/PA columns), apply the scaler
# that was fitted on the training split, and predict class probabilities.
testdata = pd.read_csv('test.csv')
x_testing = testdata.iloc[:,0:FEATURE_SIZE].values
x_testing = sc.transform(x_testing)
predictions=lr.predict_proba(x_testing)

In [23]:
import csv
# Export one line per test row: a 1-based Id (row position) and the value in
# column 1 of that row's predicted probabilities.
with open("Predictions.csv","w",newline="") as csvfile:
    writer=csv.writer(csvfile)
    writer.writerow(["Id","Predictions"])
    for test_id, prediction in enumerate(predictions, start=1):
        writer.writerow([test_id,prediction[1]])