In [8]:
!pip install node2vec tensorflow

Collecting node2vec
  Downloading node2vec-0.4.3.tar.gz (4.6 kB)
Building wheels for collected packages: node2vec
  Building wheel for node2vec (setup.py) ... [?25l[?25hdone
  Created wheel for node2vec: filename=node2vec-0.4.3-py3-none-any.whl size=5980 sha256=d0b036812de6a0365c9a91de4dd01b39b0bf35dfc508619a7b3a69c58b7f670e
  Stored in directory: /root/.cache/pip/wheels/07/62/78/5202cb8c03cbf1593b48a8a442fca8ceec2a8c80e22318bae9
Successfully built node2vec
Installing collected packages: node2vec
Successfully installed node2vec-0.4.3


In [141]:
# optionally load copy
!7z x copy.zip -o/content


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan         1 file, 860426 bytes (841 KiB)

Extracting archive: copy.zip
--
Path = copy.zip
Type = zip
Physical Size = 860426

  0%    Everything is Ok

Folders: 3
Files: 8
Size:       3139061
Compressed: 860426


**-------------- Functions --------------**

In [66]:
# choose links from AS file
# samples_number for each link type (peer/customer)
def get_as_data(as_relation_filename, samples_number):
  as_relation_file = open(as_relation_filename, 'r')

  peers = 0
  customers = 0

  X = []
  Y = []

  for line in as_relation_file:
      if line.startswith('#'):
          continue
      data = line.split('|')

      if '0' in data[2]:
          if peers < samples_number:
            X.append([data[0], data[1]])
            Y.append(data[2])
            peers += 1
      else:
          if customers < samples_number:
            X.append([data[0], data[1]])
            Y.append(data[2])
            customers += 1
  return X, Y



import csv

# create dataset for neural network in format:
# [start_node_embedding], [end_node_embedding], [link_type]
def create_dataset(X, Y, X_embed, dataset_filename):

  out = open(dataset_filename, 'w')
  writer = csv.writer(out)

  for i in range(len(X)):
      data = []
      data.extend(X_embed[X[i][0]])
      data.extend(X_embed[X[i][1]])
      if '0' in Y[i]:
          data.append('0')
      else:
          data.append('1')
      
      writer.writerow(data)

  out.close()


# count links
def display_link_stats(Y):

  peers = 0
  customers = 0

  for y in Y:
      if '0' in y:
          peers = peers + 1
      else:
          customers = customers + 1

  print('Peer:', peers, '\n  Customer:', customers)




# embeddings methods ----------------------------------

import networkx as nx
from node2vec import Node2Vec

def node2vec_get_embeddings(as_data):

  embedding_filename = "embeddings_tmp_file"

  # Create a graph 
  graph = nx.Graph()
  for AS in as_data:
      graph.add_edge(AS[0], AS[1])

  # Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
  node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs

  # Embed nodes
  model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

  # Save embeddings for later use
  model.wv.save_word2vec_format(embedding_filename)

  embeddings_file = open(embedding_filename,'r')
  node_embeddings = {}

  i = 0
  for line in embeddings_file:
      if i == 0:
          i = i + 1
          continue
      data = line.split(' ')
      node_embeddings[data[0]] = data[1:]

  return node_embeddings


**-------------- Create datasets --------------**

In [68]:
train_samples = 1000
test_samples = 100

# Condition for as-rank.caida.* input file: (train_samples + test_samples) / 2 <= 580

X, Y = get_as_data('as-rank.caida.peercones-with-IX.txt', (train_samples + test_samples) / 2)

display_link_stats(Y)

X_train = X[0:train_samples]
Y_train = Y[0:train_samples]

X_test = X[train_samples:(train_samples + test_samples)]
Y_test = Y[train_samples:(train_samples + test_samples)]

X_train_embed = node2vec_get_embeddings(X_train)
create_dataset(X_train, Y_train, X_train_embed, 'train_dataset.csv')

X_test_embed = node2vec_get_embeddings(X_test)
create_dataset(X_test, Y_test, X_test_embed, 'test_dataset.csv')

Peer: 550 
  Customer: 550


Computing transition probabilities:   0%|          | 0/751 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/72 [00:00<?, ?it/s]

In [69]:
# evaluate model with KerasClassifier
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
# load dataset
dataframe = read_csv("train_dataset.csv", header=None)
dataset = dataframe.values
# split into input (X) and output (Y) variables
X = dataset[:,0:128].astype(float)
Y = dataset[:,128]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
#print(encoded_Y)

def create_model():
	# create model
	model = Sequential()
	model.add(Dense(128, input_dim=128, activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

# evaluate model with standardized dataset
estimator = KerasClassifier(build_fn=create_model, epochs=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))



Baseline: 98.90% (0.94%)


In [132]:
# create and train and save model
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle

# load dataset
dataframe = read_csv("train_dataset.csv", header=None)
# dataframe = shuffle(dataframe)
dataset = dataframe.values

# split into input (X) and output (Y) variables
X = dataset[:,0:128].astype(float)
Y = dataset[:,128]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

def create_model():
	# create model
	model = Sequential()
	model.add(Dense(128, input_dim=128, activation='relu'))
	model.add(Dense(64, input_dim=128, activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model
# evaluate model with standardized dataset
n_split=5

for train_index,test_index in KFold(n_split).split(X):
  x_train,x_test=X[train_index],X[test_index]
  y_train,y_test=Y[train_index],Y[test_index]
  
  model=create_model()
  model.fit(x_train, y_train, epochs=100, batch_size=10)
  
  print('Model evaluation ',model.evaluate(x_test,y_test))

model.save("model")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [143]:
# evaluate loaded model
from tensorflow import keras
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder

# load dataset
dataframe = read_csv("test_dataset.csv", header=None)
dataset = dataframe.values
# split into input (X) and output (Y) variables
X = dataset[:,0:128].astype(float)
Y = dataset[:,128]

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

model = keras.models.load_model('model')
predictions = (model.predict(X) > 0.5).astype(int)
# summarize the first 5 cases
success = 0
for i in range(len(X)):
  #print('%s => %d (expected %d)' % (X[i].tolist(), predictions[i], Y[i]))
  if predictions[i] == encoded_Y[i]:
    success += 1

print("Result:", success/len(X)*100)

# model.predict(X)
# predictions = (model.predict(X) > 0.5).astype(int)
# print(predictions)



Result: 100.0


In [139]:
!rm -rf sample_data
!7z a copy.zip /content/


7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive:
  0M Scan  /            9 folders, 25 files, 3768663 bytes (3681 KiB)

Creating archive: copy.zip

Items to compress: 34

  0%     47% 24 + content/test_dataset.csv                                   80% 25 + content/train_dataset.csv                                   
Files read from disk: 25
Archive size: 1457120 bytes (1423 KiB)
Everything is Ok
