# Video network recommendation

In [38]:
import networkx as nx
import numpy as np
import json, random
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

## Data preparation

In [39]:
from read_vid2vid import *

In [40]:
# Load the video-to-video graph from the JSON dump. read_data returns the graph
# and bvid2index (presumably a video-id -> node-index mapping; confirm in read_vid2vid).
graph, bvid2index = read_data("../dataset/video2video_bfs.json")
# Build the node-pair table and a second graph object from the loaded graph.
# NOTE(review): judging by the printed log this samples negative and positive
# pairs; how graph_train differs from graph is not visible here - confirm in read_vid2vid.
dataset, graph_train = genarate_dataset(graph)


  0%|          | 0/843 [00:00<?, ?it/s][A
100%|██████████| 843/843 [00:00<00:00, 7101.11it/s]Graph with 843 nodes and 1057 edges
generating negative entries


  0%|          | 0/1057 [00:00<?, ?it/s][A
  3%|▎         | 31/1057 [00:00<00:03, 307.75it/s][A3538 negative entries
generating positive entries

  6%|▌         | 63/1057 [00:00<00:03, 309.74it/s][A
  9%|▉         | 95/1057 [00:00<00:03, 311.17it/s][A
 12%|█▏        | 129/1057 [00:00<00:02, 317.75it/s][A
 15%|█▌        | 163/1057 [00:00<00:02, 322.54it/s][A
 19%|█▊        | 197/1057 [00:00<00:02, 325.00it/s][A
 21%|██▏       | 226/1057 [00:00<00:02, 309.93it/s][A
 24%|██▍       | 256/1057 [00:00<00:02, 304.55it/s][A
 27%|██▋       | 285/1057 [00:01<00:03, 229.24it/s][A
 30%|██▉       | 315/1057 [00:01<00:03, 246.27it/s][A
 32%|███▏      | 342/1057 [00:01<00:02, 249.63it/s][A
 35%|███▌      | 372/1057 [00:01<00:02, 262.37it/s][A
 38%|███▊      | 402/1057 [00:01<00:02, 272.00it/s][A
 41%|████      | 435/1057 [00:01<

In [41]:
# Preview the first rows of the generated pair table (node_1, node_2, link).
dataset.head()

Unnamed: 0,node_1,node_2,link
0,0,1,1
1,0,2,1
2,0,3,1
3,0,4,1
4,0,5,1


In [42]:
# Alternative feature extractor, kept for reference (disabled):
# dataset = process_parameters_naive(dataset, graph_train)
# NOTE: this rebinds `dataset` to the helper's return value, so re-running this
# cell feeds the already-processed frame back in. Re-run the dataset generation
# cell first if the features need to be recomputed.
dataset = process_parameters_node2vec(dataset, graph_train)


Computing transition probabilities: 100%|██████████| 843/843 [00:00<00:00, 11270.09it/s]

Generating walks (CPU: 1):   0%|          | 0/25 [00:00<?, ?it/s][A
Generating walks (CPU: 1):   8%|▊         | 2/25 [00:00<00:05,  4.58it/s][A
Generating walks (CPU: 1):  12%|█▏        | 3/25 [00:00<00:06,  3.54it/s][A
Generating walks (CPU: 1):  16%|█▌        | 4/25 [00:01<00:06,  3.06it/s][A
Generating walks (CPU: 1):  20%|██        | 5/25 [00:01<00:07,  2.80it/s][A
Generating walks (CPU: 1):  24%|██▍       | 6/25 [00:02<00:07,  2.64it/s][A
Generating walks (CPU: 1):  28%|██▊       | 7/25 [00:02<00:07,  2.53it/s][A
Generating walks (CPU: 1):  32%|███▏      | 8/25 [00:03<00:06,  2.47it/s][A
Generating walks (CPU: 1):  36%|███▌      | 9/25 [00:03<00:06,  2.42it/s][A
Generating walks (CPU: 1):  40%|████      | 10/25 [00:03<00:06,  2.39it/s][A
Generating walks (CPU: 1):  44%|████▍     | 11/25 [00:04<00:05,  2.34it/s][A
Generating walks (CPU: 1):  48%|████▊     | 12/25 [00:04<00:05,  2.3

In [43]:
# Preview the table again after feature extraction (dim_* columns are now present).
dataset.head()

Unnamed: 0,node_1,node_2,link,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,...,dim_15,dim_16,dim_17,dim_18,dim_19,dim_20,dim_21,dim_22,dim_23,dim_24
0,0,1,1,3.677244,-7.408587,-1.149522,4.76693,2.232504,-2.800908,0.520753,...,-1.768633,-1.95066,1.571567,7.28432,8.317077,-6.887998,-1.237405,-1.572495,0.12386,1.296767
1,0,2,1,3.913168,-6.052948,-2.48416,7.417711,1.989914,-0.896275,-2.294354,...,-2.95892,-3.61506,3.166021,6.184459,7.896777,-5.650343,-1.060356,-2.884728,1.600661,2.806098
2,0,3,1,7.552043,-8.007521,-1.464558,5.554373,1.379922,1.401353,1.301248,...,-0.26052,-4.308901,1.884464,8.559779,4.687583,-2.751374,-2.081569,-2.889452,-0.879296,1.081219
3,0,4,1,3.489395,-5.346329,-3.440903,4.715023,0.297392,1.677482,-2.671068,...,-4.806254,-3.278936,-0.571291,8.300882,7.263016,-7.203069,-1.967599,-2.745599,-0.282734,-2.361041
4,0,5,1,3.682384,-6.835263,-4.939224,3.149331,0.627273,3.658145,-1.089675,...,-1.456812,-3.889001,0.963955,7.514911,5.606872,-7.628733,-2.69541,-2.488801,1.290942,0.902572


## Train model

In [44]:
from sklearn.model_selection import train_test_split
# simple models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

In [45]:
# Feature matrix: every column from position 3 onward, as a fresh NumPy array.
# NOTE(review): positional slicing assumes 'link' sits before position 3 and the
# dim_* embedding columns start there - confirm against the table preview.
predictors = dataset.iloc[:, 3:].to_numpy(copy=True)
# Binary target taken from the 'link' column.
response = dataset['link']
# Hold out 30% of the rows for evaluation; the seed is fixed so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    predictors,
    response,
    test_size=0.3,
    random_state=114514,
)

In [46]:
def judge(x, y):
    """Return 0 when ``x`` is strictly greater than ``y``; otherwise return 1.

    Ties (``x == y``) therefore yield 1.
    """
    if x > y:
        return 0
    return 1

### Logistic regression

In [47]:
# Fit a logistic regression on the training split; class_weight="balanced"
# is passed so the two classes are reweighted by the estimator.
lr = LogisticRegression(class_weight="balanced").fit(xtrain, ytrain)
# Per-class probabilities for every held-out sample.
predictions = lr.predict_proba(xtest)
# ROC AUC scored on the second probability column (index 1).
roc_auc_score(ytest, predictions[:, 1])

0.88063440779251

In [48]:
# Turn each row of class probabilities into a hard label via judge
# (0 only when the first column is strictly larger than the second).
ypred = [judge(first_prob, second_prob) for first_prob, second_prob in predictions]
# Overall accuracy on the held-out split.
print(accuracy_score(ytest, ypred))
# Confusion matrix of true labels (ytest) against predicted labels (ypred).
confusion_matrix(ytest, ypred)

0.8


array([[827, 224],
       [ 14, 125]], dtype=int64)