# Чтение

In [2]:
# Exclusive upper bound on the node ids kept when the edge files are read:
# a pair is kept only if both ids are below MAX_ID. -1 disables the filter.
# Value that goes with each positive-sample file:
# MAX_ID = 20000 for poss_set_small_2_dist.txt
# MAX_ID = 10000 for poss_set_small.txt
# MAX_ID = -1 for poss_set.txt
MAX_ID = 20000

In [3]:
# Load the training edges from 'train.in' (one "u w" pair per line).
# train_set keeps the stripped line text of every accepted pair, nodes keeps
# the integer ids that appear in accepted pairs.
# `with` guarantees the file is closed even if a malformed line raises.
train_set = set()
nodes = set()
with open('train.in', 'r') as fin_train:
    for line in fin_train:
        line = line.strip()
        # Parse each id once instead of calling int() repeatedly.
        u, w = map(int, line.split())
        if MAX_ID == -1 or (u < MAX_ID and w < MAX_ID):
            train_set.add(line)
            nodes.update((u, w))

In [4]:
from graph_tool.all import *

In [7]:
# Empty undirected graph; the training edges are added to it in a later cell.
g = Graph(directed=False)

In [8]:
# Create max(nodes) + 1 vertices so that the integer node ids read from the
# file can be passed straight to g.vertex(id).
g.add_vertex(max(nodes) + 1)

<generator object <genexpr> at 0x7fe65c05b910>

In [9]:
# Insert one edge into g for every "u w" pair kept from the training file.
for edge in train_set:
    u, w = (int(token) for token in edge.split())
    g.add_edge(g.vertex(u), g.vertex(w))

In [10]:
# Label the components of g. `comp` is indexed by vertex in the disabled
# same-component check inside dist(); `hist` is not used in the visible cells.
# NOTE(review): per the graph-tool docs `hist` should hold component sizes - confirm.
comp, hist = label_components(g)

In [11]:
# Compute a layout for g; pos[vertex] is indexed with [0] and [1] in dist()
# and is also passed to graph_draw.
# NOTE(review): no seed is set before this call, so the layout (and every
# distance-based score derived from it) may differ between runs - confirm.
pos = sfdp_layout(g)

In [None]:
# Render figures inline in the notebook.
%matplotlib inline

# Draw g using the vertex positions computed by sfdp_layout.
graph_draw(g, pos=pos)

In [9]:
# List the working directory to see which input files are available.
!ls

2006.png		      neg_set.txt
Clean DBLP.ipynb	      network.png
Clean DBLP-small.ipynb	      poss_set_small_2_dist.txt
dataset_DBLP_2010-2015.zip    poss_set_small.txt
dblp.xml		      poss_set.txt
Disbalance comp analys.ipynb  Predict DBPL.ipynb
Disbalance dist analys.ipynb  Predict DBPL-Small.ipynb
disbalance.pdf		      test.out
Draw.ipynb		      test.xml
forcesalgs.png		      train.in
MF.png			      train.xml
neg_set_small_2_dist.txt      Untitled.ipynb
neg_set_small.txt


# Read pos and neg

In [12]:
def _read_pairs(fin):
    """Return the set of stripped "u w" lines from the open file `fin`.

    A line is kept only when MAX_ID is -1 or both of its integer ids are
    below MAX_ID. Raises ValueError on a line that is not two integers.
    """
    pairs = set()
    for line in fin:
        line = line.strip()
        u, w = map(int, line.split())
        if MAX_ID == -1 or (u < MAX_ID and w < MAX_ID):
            pairs.add(line)
    return pairs


# `with` closes each file even if a malformed line raises while parsing.
with open('poss_set_small_2_dist.txt', 'r') as fin_poss:
    poss_set = _read_pairs(fin_poss)

with open('neg_set_small_2_dist.txt', 'r') as fin_neg:
    neg_set = _read_pairs(fin_neg)

In [13]:
# Sanity check: number of positive and negative pairs that were loaded.
# Written as a single formatted string so it prints the same "P N" text
# under both the print statement and the print function.
print("%d %d" % (len(poss_set), len(neg_set)))

1000 1000


# Предсказание

In [14]:
import numpy    
def dist(u, w):
    """Euclidean distance between the layout positions of vertices u and w.

    Looks both vertices up in the global layout `pos` and uses their first
    two coordinates. An earlier, now disabled, variant returned 100000000
    when the two vertices had different labels in `comp`.
    """
    point_u = pos[g.vertex(u)]
    point_w = pos[g.vertex(w)]
    dx = point_u[0] - point_w[0]
    dy = point_u[1] - point_w[1]
    return (dx**2 + dy**2)**0.5

def preferential_attachment(u, w):
    """Product of the out-degrees of vertices u and w in the global graph g."""
    degree_u = g.vertex(u).out_degree()
    degree_w = g.vertex(w).out_degree()
    return degree_u * degree_w

def common_neighbors(u, w):
    """Number of vertices that are out-neighbours of both u and w in g."""
    around_u = set(g.vertex(u).out_neighbours())
    around_w = set(g.vertex(w).out_neighbours())
    return len(around_u & around_w)

def union_neighbors(u, w):
    """Number of distinct vertices that are out-neighbours of u or of w in g."""
    around_u = set(g.vertex(u).out_neighbours())
    around_w = set(g.vertex(w).out_neighbours())
    return len(around_u.union(around_w))
            
def Jaccards_coefficient(u, w):
    """Jaccard coefficient of the neighbourhoods of vertices u and w.

    Returns common_neighbors(u, w) / union_neighbors(u, w) as a float.
    When neither vertex has any neighbour the union is empty; 0.0 is
    returned in that case instead of raising ZeroDivisionError.
    """
    union_size = union_neighbors(u, w)
    if union_size == 0:
        return 0.0
    # 1.0 * ... forces float division under Python 2 as well.
    return 1.0 * common_neighbors(u, w) / union_size

In [15]:
def make_dataset(poss_set, neg_set, functs):
    """Build a feature matrix and label vector from positive/negative pairs.

    poss_set, neg_set: iterables of "u w" strings with integer ids.
    functs: list of callables f(u, w); each one yields one feature column.

    Returns (X, Y) as numpy arrays. Rows for poss_set come first and are
    labelled 1, rows for neg_set follow and are labelled 0.
    """
    rows = []
    labels = []
    for label, pairs in ((1, poss_set), (0, neg_set)):
        for pair in pairs:
            u, w = map(int, pair.split())
            rows.append([func(u, w) for func in functs])
            labels.append(label)
    return numpy.array(rows), numpy.array(labels)

In [16]:
# Build the dataset with the layout distance as the only feature column.
X, Y = make_dataset(poss_set, neg_set, 
                    [dist
                    ])

In [17]:
# Peek at the first five feature rows and labels.
X[:5], Y[:5]

(array([[ 23.37474625],
        [ 33.77224531],
        [ 14.20098442],
        [ 23.55706783],
        [ 59.20489768]]), array([1, 1, 1, 1, 1]))

In [18]:
# One row per pair in X, one label per pair in Y.
X.shape, Y.shape

((2000, 1), (2000,))

In [19]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the raw distance feature used directly as a score. The feature is
# negated via 1 - X so that a smaller distance gives a larger score
# (presumably because closer pairs are expected to be the positives - confirm).
roc_auc_score(Y, 1 - X)

0.62897900000000062

In [19]:
from  sklearn.ensemble import RandomForestClassifier as RF

In [260]:
from sklearn.model_selection import cross_val_score

In [261]:
# Fix the forest's seed so the cross-validation scores are reproducible
# between runs (the NMF model in this notebook also uses random_state=0).
clf = RF(random_state=0)

In [262]:
# 5-fold cross-validation of clf on the current X, Y, scored with ROC AUC.
scores = cross_val_score(clf, X, Y, cv=5, scoring='roc_auc')

In [263]:
# The scores were computed with scoring='roc_auc', so label them as ROC AUC
# (mean and two standard deviations over the folds), not as accuracy.
print("ROC AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.89 (+/- 0.01)


# Предсказание через MF

In [20]:
from scipy.sparse import coo_matrix
def make_sparse_matrix(train_set, n):
    """Build an n x n COO matrix from "u w" edge strings.

    Every pair contributes two entries of value 1, at (u, w) and at (w, u).
    train_set: iterable of "u w" strings with integer ids below n.
    """
    rows = []
    cols = []
    for edge in train_set:
        u, w = (int(token) for token in edge.split())
        # Store both directions of the pair.
        rows.extend((u, w))
        cols.extend((w, u))
    values = [1] * len(rows)
    return coo_matrix((values, (rows, cols)), shape=(n, n))

In [21]:
# The matrix needs one row/column per node id. MAX_ID is -1 when the id filter
# is disabled, which is not a valid dimension, so in that case size the matrix
# from the largest id seen in the training edges (same rule as the graph's
# vertex count).
n_nodes = MAX_ID if MAX_ID != -1 else max(nodes) + 1
G = make_sparse_matrix(train_set, n_nodes)

In [22]:
from sklearn.decomposition import NMF

In [23]:
# Non-negative matrix factorisation with 10 components, random
# initialisation and a fixed seed.
model = NMF(n_components=10, init='random', random_state=0)

In [24]:
# Fit the factorisation to the adjacency matrix G; row i of A is used as the
# factor vector of node i in mf_score().
A = model.fit_transform(G)

In [25]:
# One row per matrix row of G, one column per NMF component.
A.shape

(20000, 10)

In [26]:
def mf_score(u, w):
    """Dot product of the rows of the global factor matrix A for nodes u and w."""
    factors_u = A[u]
    factors_w = A[w]
    return numpy.dot(factors_u, factors_w)

In [27]:
# Rebuild X, Y with the matrix-factorisation score as the only feature column
# (this overwrites the distance-based X, Y built earlier).
X, Y = make_dataset(poss_set, neg_set, [mf_score])

In [28]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the raw MF score; the single feature column is flattened to a
# 1-D vector of length X.shape[0] before scoring.
roc_auc_score(Y, X.reshape((X.shape[0],)))

0.60446999999999984

In [95]:
# 5-fold cross-validation of clf on the MF-score dataset, scored with ROC AUC.
scores = cross_val_score(clf, X, Y, cv=5, scoring='roc_auc')
# The metric is ROC AUC (see scoring= above), so do not label it as accuracy.
print("ROC AUC: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

NameError: name 'cross_val_score' is not defined