In [2]:
import csv
from random import randint, seed

import networkx as nx
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle


In [3]:
# Read the undirected graph from a comma-separated edge list; node ids are
# parsed as integers.
G = nx.read_edgelist(
    'datasets/edgelist.txt',
    delimiter=',',
    create_using=nx.Graph(),
    nodetype=int,
)

# Keep the node list and the graph sizes around for the later cells.
nodes = list(G.nodes())
n, m = G.number_of_nodes(), G.number_of_edges()

print('Number of nodes:', n)
print('Number of edges:', m)



Number of nodes: 138499
Number of edges: 1091955


In [4]:
# Build a mapping {node id (int) -> raw abstract text}. Every line of the file
# has the form "<node id>|--|<abstract>"; a line that does not split into
# exactly two parts raises ValueError.
with open('datasets/abstracts.txt', 'r', encoding="utf-8") as abstracts_file:
    records = (raw_line.split('|--|') for raw_line in abstracts_file)
    abstracts = {int(node_id): text for node_id, text in records}

In [5]:
# Replace each abstract string by the set of its whitespace-separated tokens.
# NOTE: this cell is not re-runnable as is -- after one execution the values
# are sets, which have no .split() method.
for node_id, text in abstracts.items():
    abstracts[node_id] = set(text.split())

In [6]:
# Allocate the training set: 2*m rows (the feature loop fills rows 0..m-1 with
# label 1 and rows m..2m-1 with label 0) and three feature columns.
n = G.number_of_nodes()
y_train = np.zeros(2 * m)
X_train = np.zeros((2 * m, 3))


In [7]:
# Quick look at how enumerate() behaves on an edge view: it yields a running
# index (starting at 0) together with each edge incident to node 0, e.g.
#   index 0 -> (0, 1)
#   index 1 -> (0, 2)
for idx, incident_edge in enumerate(G.edges(0)):
    print(idx, incident_edge, sep='\n')

0
(0, 1)
1
(0, 2)


In [8]:
# (Re)allocate the zero-filled training arrays: one row per labelled node pair
# (2*m in total) and three feature columns.
n = G.number_of_nodes()
y_train = np.zeros(2 * m)
X_train = np.zeros((2 * m, 3))



In [16]:
# Inspect the token set stored for the first endpoint of a sample edge.
edge = (0, 1)
first_endpoint = edge[0]
print(abstracts[first_endpoint])

{'and', 'luminaire,', 'This', 'record', 'grey', 'methodology', 'then', 'assessment', 'Experiments', 'descent', 'normal', 'a', 'are', 'placed', 'pattern', 'aircraft', 'match', 'luminous', 'level', 'brightness', 'calculate', 'during', 'aerodrome', 'was', 'quality', 'cockpit', 'value', 'used', 'As', 'optimum', 'The', 'on', 'determined.', 'metric', 'compared', 'standards', 'system.', 'inside', 'the', 'to', 'model-based', 'along', 'ground', 'pixel', 'luminaire', 'with', 'be', 'is', 'between', 'ascertain', 'data', 'intensity', 'such,', 'development', 'images', 'demonstrate', 'each', 'associated', 'sensor,', 'position', 'for', 'real', 'image', 'aerodrome.', 'composed', 'estimate', 'instant', 'system', 'expected', 'recommendations,', 'ensure', 'orientation', 'camera', 'actual', 'presented', 'imaged', 'order', 'of', 'required', 'operating', 'lighting', 'A', 'accordance', 'AGL', 'it', 'luminaire.', 'effectiveness', 'an', 'presented.', 'that', 'in', '(AGL),', 'application', 'automated', 'standard

In [18]:
# Build the training set.
#
# abstracts[u] is the set of words in the abstract of node u. For a pair of
# nodes (u, v) the three features are:
#   column 0: |words(u)| + |words(v)|   (combined abstract length)
#   column 1: |words(u)| - |words(v)|   (signed length difference)
#   column 2: |words(u) & words(v)|     (number of shared words)
#
# Rows 0..m-1 hold the real edges of G (label 1). Rows m..2m-1 hold randomly
# drawn node pairs (label 0).
#
# Fixes relative to the previous version of this cell:
#   * column 1 of the positive rows subtracted abstracts[edge[1]] from itself
#     and was therefore always 0;
#   * the "negative" rows were computed from the very same edge as the
#     positive row instead of from a random pair, so every example appeared
#     with both labels.

# Fixed seed so that re-running the cell draws the same negative pairs.
seed(34)

for i, edge in enumerate(G.edges()):
    # Positive example: the two endpoints of an existing edge.
    words_a, words_b = abstracts[edge[0]], abstracts[edge[1]]
    X_train[i, 0] = len(words_a) + len(words_b)
    X_train[i, 1] = len(words_a) - len(words_b)
    X_train[i, 2] = len(words_a.intersection(words_b))
    y_train[i] = 1

    # Negative example: a uniformly random pair of nodes. randint is inclusive
    # on both ends, hence n - 1. We assume the sampled pair is not connected;
    # the occasional real edge or self-pair is accepted as label noise.
    n1 = nodes[randint(0, n - 1)]
    n2 = nodes[randint(0, n - 1)]
    words_1, words_2 = abstracts[n1], abstracts[n2]
    X_train[m + i, 0] = len(words_1) + len(words_2)
    X_train[m + i, 1] = len(words_1) - len(words_2)
    X_train[m + i, 2] = len(words_1.intersection(words_2))
    y_train[m + i] = 0

In [19]:
# Sanity check: expect (2*m, 3).
train_shape = X_train.shape
print('size of training matrix=', train_shape)

size of training matrix= (2183910, 3)


In [33]:
# Read the node pairs to score. A raw line splits on ',' into something like
# ['12223', '345332\n']; int() converts both fields (it tolerates the trailing
# newline), giving a list of (int, int) tuples.
with open('datasets/test.txt', 'r') as f:
    split_lines = (line.split(',') for line in f)
    node_pairs = [(int(fields[0]), int(fields[1])) for fields in split_lines]

In [36]:
# Test matrix with the same three columns as the training matrix (np.zeros
# takes the shape as a tuple): combined abstract size, signed size difference
# and number of shared words for each candidate pair.
X_test = np.zeros((len(node_pairs), 3))
for row, (u, v) in enumerate(node_pairs):
    words_u, words_v = abstracts[u], abstracts[v]
    X_test[row, 0] = len(words_u) + len(words_v)
    X_test[row, 1] = len(words_u) - len(words_v)
    X_test[row, 2] = len(words_u & words_v)

print("size of test matrix is: ", X_test.shape)


size of test matrix is:  (106692, 3)


In [38]:
# Shuffle the rows before training. The feature loop writes all label-1 rows
# into the first m rows and all label-0 rows into the last m, so the
# unshuffled matrix is sorted by label. shuffle() applies one common
# permutation to features and labels, mixing the classes while keeping every
# row aligned with its label.
# random_state pins that permutation so a fresh run reproduces the same order
# (it was previously unseeded, unlike the classifier).
X_train, y_train = shuffle(X_train, y_train, random_state=34)

In [51]:
# Fit a logistic regression on the three text features.
classifier = LogisticRegression(random_state=34, solver='liblinear')
classifier.fit(X_train, y_train)

# predict_proba returns one row per test pair with the probabilities of
# label 0 and label 1, e.g. [0.34, 0.66]. Keep only the second column, i.e.
# the predicted probability that an edge exists between the two nodes.
y_pred = classifier.predict_proba(X_test)[:, 1]


In [53]:
# Write the submission file: a header row followed by one (id, probability)
# row per test pair, where id is the pair's position in the test set.
predictions = enumerate(y_pred)

# newline='' is required by the csv module; without it the writer's own
# '\r\n' line endings get translated again and produce blank rows on Windows.
with open("datasets/submission_text_baseline.csv", 'w', newline='') as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id', 'predicted'])
    csv_out.writerows(predictions)
