<font size = 10>This notebook is a tool to build a graph for spatio-temporal anomaly detection in 5G networks.</font>

`First, we need these imports:`

In [1]:
import os
import random
import numpy as np
import pandas as pd
from timeit import default_timer as timer
import time
import re
import Tools_first_step as tfs
import matplotlib.pyplot as plt
import seaborn as sns
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

The threshold $T$ (variable `threashold`) decides whether an edge is created between two nodes, using this equation:

$$   
    A_{i,j}= 
    \left\{
    \begin{array}{ll}
    0 & \text{if  ~ } S_{i,j} \leq T \\
    1 & \text{if ~} S_{i,j} > T 
    \end{array}
    \right.
     $$

The variable `L` is the weight $\lambda$ used in the lambda model, given by the equation: $$  S_{i,j}=\lambda |Corr(x_i,x_j)| + (1-\lambda)|Cosim(x_i, x_j)| $$ (ref. in the article XXX)

In [2]:
# threashold: cut-off T of the edge rule (an edge is kept only when S_ij > T).
# L: lambda weight of the correlation term in the lambda score model.
threashold, L = 0.90, 0.5

These are all the file paths needed in this notebook.

In [None]:
# Pre-computed correlation matrix (optional: it can also be computed directly).
matrix_correlation_path = 'corr_matrice.csv'

# Text file holding the feature documentation / description.
doc_path = "truc.txt"

# Preprocessed dataset, in CSV format.
path_dataset = '/dataset_end_total_preprocessed.csv'

> ## I) Load the dataset

In [None]:
df=pd.read_csv(path_dataset, nrows=2) 
# nrows=2 reads only the header and the first two rows: enough to get the
# column names when the correlation matrix is imported from a file.
# Remove nrows to load the full dataset if the correlation matrix has to be computed.
columns=df.columns

In [None]:
#verify the dataset :
print(df.shape)
col_to_keep=tfs.rmv_Unnamed(df.columns)
df=df[col_to_keep]
print(df.shape)

# II) This part will generate a graph with our data

> ### For this, we need to tokenize the feature name

In [None]:
# The feature names are the column labels of the dataset.
name_columns = df.columns

# Value returned by tfs.estimate_total_number_of_word for the feature names.
# Computed once and stored, then printed, instead of calling the helper
# twice with the same argument.
number_complet = tfs.estimate_total_number_of_word(name_columns)
print(number_complet)

# Second element returned by tfs.find_frequence_of_words, shown for inspection.
print(tfs.find_frequence_of_words(name_columns)[1])

In [None]:
# Display the result of splitting every feature name
# (bare last expression, so the notebook renders the returned value).
tfs.split_all(name_columns)

In [8]:
# Call tfs.find_frequence_of_words once and pick its first two results by
# position, instead of recomputing the whole result for each element.
word_frequencies = tfs.find_frequence_of_words(name_columns)
dico_all_word = word_frequencies[1]
# dico2 is iterated with .items() further down, so it behaves like a mapping
# (presumably word -> frequency; confirm in Tools_first_step).
dico2 = word_frequencies[0]

In [9]:
# NOTE(review): nltk is imported here but not referenced directly in the
# visible cells — presumably needed by tfs.cleaning_txt_documentation; confirm,
# and consider moving this import to the first cell.
import nltk

# Process the documentation text file located at doc_path.
new_dico_all_word=tfs.cleaning_txt_documentation(doc_path)

In [10]:
# We need every word, including those that only appear in the dataset, so the
# split column names are appended after the content of the description file.
final_dictionnary=new_dico_all_word+tfs.split_all(name_columns)

> ### Using word2vec, we transform our tokens, and then the feature names, into vectors

In the future, we will probably try BERT instead: BERT seems to perform better today, but it requires context to be provided.

In [None]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# Load pretrained model (download if needed: GoogleNews-vectors-negative300.bin)
# NOTE(review): the path points at the filesystem root — adjust it to where the file was downloaded.
pretrained_model = KeyedVectors.load_word2vec_format('/GoogleNews-vectors-negative300.bin', binary=True)

# Create an empty 300-dimensional Word2Vec model, build its vocabulary from
# all pretrained keys (passed as one single "sentence"), then replace the
# model's vector table with the pretrained vectors.
# NOTE(review): this assumes build_vocab keeps the keys in the same order as
# pretrained_model.key_to_index; otherwise words and vectors are misaligned.
# TODO confirm, e.g. by comparing model.wv.index_to_key with pretrained_model.index_to_key.
model = Word2Vec(vector_size=300, min_count=1)
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=False)
model.wv.vectors = pretrained_model.vectors

In [None]:
# Extend the existing vocabulary (update=True) with the words of final_dictionnary.
model.build_vocab(final_dictionnary, update=True)

In [None]:
# Train the model on final_dictionnary for 10 epochs.
# NOTE(review): no seed/worker count is set in this notebook, so the resulting
# vectors may differ between runs — confirm whether reproducibility is required.
model.train(final_dictionnary, total_examples=len(final_dictionnary), epochs=10)

In [None]:
# Print the value returned by tfs.number_a_word_in_doc for the feature names
# and the two frequency dictionaries.
a=tfs.number_a_word_in_doc(name_columns, dico2, dico_all_word)
print(a)

In [None]:
# Score returned by tfs.TF_new_version for every word of dico2, in the
# iteration order of dico2. A comprehension is used so that the notebook-level
# variable `a` is not overwritten and no loop variable leaks out of the cell.
score_for_each = [
    tfs.TF_new_version(word, dico2, number_complet)
    for word, _freq in dico2.items()
]

# Explicit figure/axes with labels, so the plot can be read on its own.
fig, ax = plt.subplots()
ax.plot(score_for_each)
ax.set(
    title="TF_new_version score per word",
    xlabel="word index (iteration order of dico2)",
    ylabel="score",
)
plt.show()

> ### Now we need to use a similarity score to evaluate the difference between two feature names

In [None]:

# Sanity check: cosine similarity between the vectors of two example words.
sequence1 = model.wv['scaling']
sequence2 = model.wv['collector']
similarity = tfs.cosine_similarity(sequence1, sequence2)
print("Cosine Similarity:", similarity)

In [None]:
# Similarity scores between feature names, computed with the word2vec model.
# `a` is indexed by a pair of feature names in the next cell; `b` is the
# second returned value, printed here for inspection.
a,b=tfs.score_similarity_current(name_columns, model)
print(b)

In [None]:
# Value stored in `a` for one example pair of feature names.
print(a[('node_cpu_scaling_frequency_hertz{cpu="0"}_server_1','node_cpu_scaling_frequency_hertz{cpu="5"}_server_1')])  # other candidate for the second name: 'process_open_fds_server_6'

In [None]:
# Graph nodes are the feature names.
nodes=name_columns

# Similarity scores and second returned value, stored under the names used by the graph-building cells.
# NOTE(review): this repeats the tfs.score_similarity_current(name_columns, model)
# call made earlier for `a, b` — presumably only to get descriptive names; confirm.
#cosine_similarities, vectores_places=tfs.score_similarity_current(name_columns, model_light)
cosine_similarities_25d, vectores_places_25d=tfs.score_similarity_current(name_columns, model)


> ### Once this is done, we can plot the first graph

In [19]:
#tfs.plot_graph_v1(cosine_similarities,threashold,vectores_places)

In [20]:
#xe,ye,ze, all_edges=tfs.get_edges_v3(cosine_similarities,threashold, vectores_places, "server")
#print(all_edges)

In [None]:
# Edge extraction with a top-K selection (topK=100) from the similarity
# scores, the threshold and the keyword "server".
# NOTE(review): the exact meaning of the "server" argument and of xe/ye/ze
# (presumably edge coordinates for plotting) is defined in Tools_first_step — confirm.
xe,ye,ze, all_edges=tfs.get_edges_topK(cosine_similarities_25d,threashold, vectores_places_25d, "server", topK=100)

In [22]:
#print(all_edges[0][0][0])

In [23]:
#medium_node_number=tfs.get_medium_number_of_edges(all_edges)
#print(medium_node_number)

In [24]:
#nb=tfs.get_distribution_number_of_edges_per_nodes("server",cosine_similarities,threashold, vectores_places)

> ### We can even plot the part of the graph that contains a specific word (taken from the dictionary of words found in all the features)

In [25]:
#tfs.plot_only_part_data(cosine_similarities,threashold,vectores_places,"server_1")

# III) This part modifies the edge selection by also using the correlation matrix

In [26]:
# Alternative: compute the matrix directly with `corr_mat = df.corr()`.
# Here we load a pre-computed correlation matrix from a CSV file instead.
corr_values = np.genfromtxt(matrix_correlation_path, delimiter=',')

# Copy the strict upper triangle onto the lower triangle so the matrix is
# symmetric, then force the diagonal to 1.
# This is done on the NumPy array before building the DataFrame: it replaces
# a Python double loop of per-element `.iloc` writes, and it no longer depends
# on `DataFrame.values` being a writable view of the data.
upper_rows, upper_cols = np.triu_indices_from(corr_values, k=1)
corr_values[upper_cols, upper_rows] = corr_values[upper_rows, upper_cols]
np.fill_diagonal(corr_values, 1)

# Columns are labelled with the feature names of the dataset.
corr_mat = pd.DataFrame(corr_values, columns=df.columns)

In [None]:
# Label the rows with the feature names as well, then preview the first rows.
corr_mat.index=df.columns
corr_mat.head()

In [28]:
# Convert the correlation DataFrame with tfs.change_into_dico.
# NOTE(review): presumably a dictionary keyed by pairs of feature names, like
# the similarity scores — confirm in Tools_first_step.
dico_correlation=tfs.change_into_dico(corr_mat)

In [29]:
#tfs.print_corr_mat(corr_mat)

In [None]:
# Size check: number of entries in the correlation dictionary and in the
# similarity scores, then the length of one row of the correlation matrix
# (i.e. its number of columns).
print(len(dico_correlation))
print(len(cosine_similarities_25d))
print(len(corr_mat.iloc[1]))

> ### Final score:

The function `build_new_score_product` produces the score given by the equation:

$$ S_{i,j}= |Corr(x_i,x_j)|*|Cosim(x_i, x_j)| $$

If we prefer the lambda model, given by the equation:

$$  S_{i,j}=\lambda |Corr(x_i,x_j)| + (1-\lambda)|Cosim(x_i, x_j)|$$

we can use `build_score_lambda` here instead.

If we do not want to use a top-K selection, we can use `get_edges_v3`, or simply choose a `topK` value that is high enough.

In [None]:
# Final score (product model): combine the correlation dictionary with the similarity scores.
#My_dictionary=tfs.build_new_score_product(dico_correlation, cosine_similarities) #no plot in >3D warning
My_dictionary_25=tfs.build_new_score_product(dico_correlation, cosine_similarities_25d)
# Edges obtained from the final score and the threshold.
#my_edges_test= tfs.edges_dico(My_dictionary, threashold)
my_edges_test_25=tfs.edges_dico(My_dictionary_25, threashold)
# Edge extraction without top-K for the keyword "server"; only the fourth
# returned value (edges_complet_25) is used afterwards, to build the adjacency matrix.
#useless1,useless2,useless2, edges_complet=tfs.get_edges_topK(My_dictionary,threashold,vectores_places,"server", 20)
useless4,useless5,useless6, edges_complet_25=tfs.get_edges_v3(My_dictionary_25,threashold,vectores_places_25d,"server") 

In [None]:
#tfs.get_distribution_number_of_edges_per_nodes("cpu",My_dictionary, threashold, vectores_places)

In [None]:
# Distribution of the number of edges per node for the keyword "server",
# computed from the final score.
new_distrib=tfs.get_distribution_number_of_edges_per_nodes("server",My_dictionary_25,threashold, vectores_places_25d)

In [None]:
# Plot only the part of the graph related to the word "server_1".
tfs.plot_only_part_data(My_dictionary_25,threashold,vectores_places_25d,"server_1")

In [None]:
# Same edge extraction for the keyword "cpu", then print the value returned
# by tfs.get_medium_number_of_edges for these edges.
# NOTE(review): `useless2` appears twice in the unpacking target, so it ends up
# holding the third returned value; harmless as long as these placeholders are never read.
useless1,useless2,useless2, edges_complet=tfs.get_edges_v3(My_dictionary_25,threashold,vectores_places_25d,"cpu")
med2=tfs.get_medium_number_of_edges(edges_complet)
print(med2)

# IV) This section builds the adjacency matrix that we will use in the GNN

In [37]:
# Build the adjacency matrix from the "server" edges and the correlation matrix.
# NOTE(review): edges_complet_25[1:] skips the first element of the edge
# structure — presumably a header/placeholder entry; confirm in tfs.get_edges_v3.
adj_25=tfs.build_adjacency_matrix_v2(edges_complet_25[1:],corr_mat)

In [None]:
# Convert the adjacency matrix to a NumPy array (used for plotting and saving below).
adj_25np=np.array(adj_25)

In [None]:
# Visualise the adjacency matrix as a heatmap.
# A step of 1 keeps every row and column; use a larger step to downsample.
downsampled_data = adj_25np[::1, ::1]

# Draw on an explicit figure/axes pair.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(downsampled_data, cmap='viridis', ax=heatmap_ax)
heatmap_ax.set_title('Adjacency matrix 25D')
plt.show()

> ### We made 2 different word models, one with words projected in 25 dimensions and the other in 3; now we compare the two matrices

# V) Finally, we have to save/load the matrix

Modify the path so that the file is saved in the right place.

In [None]:
# Write the adjacency matrix as comma-separated text.
# NOTE(review): the path points at the filesystem root — adjust it before running.
np.savetxt('/adj25d_th09.csv',adj_25np, delimiter=',')

In [None]:
# How to load the saved adjacency matrix back afterwards (same path as used for saving).
t1=np.genfromtxt('/adj25d_th09.csv', delimiter=',')