In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

### Parameters

In [2]:
nb_utilisateurs = 2000  # number of users (graph nodes)
T = 100_000  # number of discrete time steps over which exchanges are spread
anom_percent = 0.01  # anomalous edges to inject, as a fraction of the normal interactions

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# exact same dataset. Change the value to draw a different dataset.
seed = 42
np.random.seed(seed)

### Data generation

In [3]:
# User ids are simply 0 .. nb_utilisateurs - 1.
utilisateurs = np.arange(nb_utilisateurs)

# Number of interactions initiated by each user: an exponential draw (scale 10)
# truncated to an integer, plus one so that every user initiates at least one.
# A single vectorized draw replaces a per-user loop and avoids converting a
# 1-element array to a Python int, which is deprecated since NumPy 1.25.
activity = np.random.exponential(10, size=nb_utilisateurs).astype(np.int32) + 1
nb_interactions = np.sum(activity)

# Edge anomalies: their count is a fixed fraction of the normal interactions.
nb_edge_anomalies = int(nb_interactions * anom_percent)

In [4]:
# One row per node, followed by one row per normal interaction and one row per
# anomalous edge; every row has 173 columns.
data = np.zeros((nb_utilisateurs + nb_interactions + nb_edge_anomalies, 173))

# Add node attributes: column 0 is set to -1 and the remaining 172 columns to 1.
# (Alternative kept from the original notebook for the 172 feature columns:
#  np.random.uniform(-1, 1, size=(nb_utilisateurs, 172)).)
data[:nb_utilisateurs, 0] = -1
data[:nb_utilisateurs, 1:] = 1

# For each user, draw (with replacement) the time steps at which it initiates
# an interaction. Passing the integer T samples from range(T) without
# rebuilding np.arange(T) on every iteration.
times = [np.random.choice(T, size=activity[i]) for i in range(nb_utilisateurs)]

# Invert the mapping: time2node[t] is the list of users initiating an
# interaction at time t, or None when nobody does (np.empty with dtype=object
# fills the array with None). A user drawn twice for the same t appears twice.
time2node = np.empty((T,), dtype=object)
for node, node_times in enumerate(times):
    for t in node_times:
        if time2node[t] is None:
            time2node[t] = [node]
        else:
            time2node[t].append(node)

In [5]:
ind_data = nb_utilisateurs  # next free row of `data`; edge rows come after the node rows
degrees = np.zeros(nb_utilisateurs)  # number of edges emitted so far by each user

# Draw one time step per anomalous edge (with replacement) and count how many
# anomalies fall on each distinct time step in a single pass.
time_edge_anomalies_np = np.random.choice(T, size=nb_edge_anomalies)
anom_times, anom_counts = np.unique(time_edge_anomalies_np, return_counts=True)
time_edge_anomalies_dict = dict(zip(anom_times.tolist(), anom_counts.tolist()))
time_edge_anomalies_set = set(time_edge_anomalies_dict)
edge_anomalies_indices = []  # positions of anomalous edges, counted from the first edge row

for t in range(T):
    nb_anom_t = time_edge_anomalies_dict.get(t, 0)
    if nb_anom_t:
        # Anomalous edges: each source is paired with a destination whose
        # current degree DIFFERS from the source's degree.
        src_edge_anom = np.random.choice(utilisateurs, size=nb_anom_t)
        dest_edge_anom = np.empty(nb_anom_t, dtype=utilisateurs.dtype)
        for k, src in enumerate(src_edge_anom):
            # Compare against one source at a time: comparing `degrees` with a
            # (k, 1) array broadcasts to (k, N), and taking np.where(...)[0]
            # of that yields anomaly row numbers instead of user indices.
            candidates = np.flatnonzero(degrees != degrees[src])
            if candidates.size == 0:
                # Every user currently has the same degree (e.g. before any
                # edge exists): fall back to any user other than the source
                # rather than failing on an empty candidate set.
                candidates = np.flatnonzero(utilisateurs != src)
            dest_edge_anom[k] = np.random.choice(utilisateurs[candidates])

        # Edge row layout: source, destination, time, then 170 zero columns.
        rows = slice(ind_data, ind_data + nb_anom_t)
        data[rows, 0] = src_edge_anom
        data[rows, 1] = dest_edge_anom
        data[rows, 2] = t
        data[rows, 3:] = 0

        # np.add.at accumulates repeated sources; `degrees[src] += 1` would
        # count a source drawn several times at this time step only once.
        np.add.at(degrees, src_edge_anom, 1)
        first_edge_index = ind_data - nb_utilisateurs
        edge_anomalies_indices.extend(range(first_edge_index, first_edge_index + nb_anom_t))
        ind_data += nb_anom_t

    if time2node[t] is not None:
        # Normal edges: the destination is drawn among users whose current
        # degree EQUALS the source's degree. The source itself always
        # qualifies, so the candidate set is never empty.
        # NOTE(review): this also means self-loops are possible - confirm intended.
        for node in time2node[t]:
            dest = np.random.choice(np.flatnonzero(degrees == degrees[node]))
            data[ind_data, 0] = node
            data[ind_data, 1] = dest
            data[ind_data, 2] = t
            data[ind_data, 3:] = 0
            degrees[node] += 1
            ind_data += 1

### Data saving

In [7]:
# Persist the generated dataset and the positions of the anomalous edges
# (counted from the first edge row of `data`). The output directories are
# created first so the cell also works on a fresh checkout where ./data does
# not exist yet.
synthetic_csv = Path('./data/Synthetic.csv')
anom_edges_csv = Path('./data/anom_edges/Synthetic_anom_edges.csv')
for csv_path in (synthetic_csv, anom_edges_csv):
    csv_path.parent.mkdir(parents=True, exist_ok=True)

pd.DataFrame(data).to_csv(synthetic_csv, index=False)
pd.DataFrame(edge_anomalies_indices).to_csv(anom_edges_csv, index=False)