In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

### Dataset selection and anomaly generation through TADDY

In [2]:
# Change the dataset here
data_set = "btc_alpha"

# Here too
%run ../TADDY/0_prepare_data.py --dataset btc_alpha

Generating data with anomaly for Dataset:  btc_alpha
Preprocess dataset: btc_alpha
vertex: 3783  edge:  14124
Preprocess finished! Time: 0.18 s
[#s] generating anomalous dataset...
 2022-08-31 15:18:29.573942
[#s] initial network edge percent: #.1f##, anomaly percent: #.1f##.
 2022-08-31 15:18:29.573942 50.0 1.0




70
70
Anomaly Generation finish! Time: 7.98 s
Train size:7062  7  Test size:7132 7
Training dataset contruction finish! Time: 0.01 s
Test dataset finish constructing! Time: 0.00 s
Generating data with anomaly for Dataset:  btc_alpha
[#s] generating anomalous dataset...
 2022-08-31 15:18:37.628780
[#s] initial network edge percent: #.1f##, anomaly percent: #.1f##.
 2022-08-31 15:18:37.633842 50.0 5.0




353
353
Anomaly Generation finish! Time: 7.91 s
Train size:7062  7  Test size:7415 7
Training dataset contruction finish! Time: 0.01 s
Test dataset finish constructing! Time: 0.00 s
Generating data with anomaly for Dataset:  btc_alpha
[#s] generating anomalous dataset...
 2022-08-31 15:18:45.624318
[#s] initial network edge percent: #.1f##, anomaly percent: #.1f##.
 2022-08-31 15:18:45.624318 50.0 10.0




706
706
Anomaly Generation finish! Time: 7.85 s
Train size:7062  7  Test size:7768 8
Training dataset contruction finish! Time: 0.01 s
Test dataset finish constructing! Time: 0.00 s


In [3]:
# Short dataset key used in this notebook -> base name of the raw file.
data_files = {'uci':'uci',
             'digg':'digg',
             'btc_alpha':'soc-sign-bitcoinalpha',
             'btc_otc':'soc-sign-bitcoinotc',
             'email':'email-dnc',
             'AST':'AST',
             'TGN':'TGN'}
data_file = data_files[data_set]

# Both loaders read from the same relative folder as the `%run` cell, so the
# notebook does not depend on one machine's absolute path.
if data_set in ['digg', 'uci', 'AST']:
    # Space-delimited text file with no extension; lines starting with '%' are comments.
    data = np.loadtxt(f"../TADDY/data/raw/{data_file}", dtype=float, comments='%', delimiter=' ')
elif data_set in ['btc_alpha', 'btc_otc', 'email', 'TGN']:
    # Header-less comma-separated file.
    data = pd.read_csv(f"../TADDY/data/raw/{data_file}.csv", sep=',', header=None).to_numpy()
else:
    # Only reachable if a key is added to `data_files` without choosing a loader.
    raise ValueError(f"No raw-data loader configured for dataset {data_set!r}")

# Distinct ids appearing in the first two columns (the two endpoints of each row).
nodes = np.unique(np.concatenate((data.T[0], data.T[1])))

### Functions

In [4]:
def nop_TADDY(data, num_anom_edges):
    """Turn an edge list whose trailing rows are anomalies into a node+edge table.

    Parameters
    ----------
    data : np.ndarray of shape (n_edges, 3)
        One row per edge: source id, destination id, timestamp. The last
        ``num_anom_edges`` rows are labelled anomalous, all others normal.
    num_anom_edges : int
        Number of trailing rows of ``data`` that are anomalies. ``0`` is
        allowed (the original slicing ``data[:-0]`` failed in that case).

    Returns
    -------
    data_final : pd.DataFrame of shape (n_nodes + n_edges, 173)
        First one row per distinct node id: ``-1`` followed by 172 ones.
        Then one row per edge, ordered by timestamp: re-indexed source,
        re-indexed destination, timestamp shifted so the earliest is 0,
        followed by 170 times ``-1``.
    anom_positions : list of int
        Positions of the anomalous edges among the time-ordered edge rows.

    Raises
    ------
    ValueError
        If ``num_anom_edges`` is negative or larger than the number of rows.
    """
    # NOTE(review): the fixed width of 173 columns is presumably imposed by
    # the consumer of the CSV written from this table -- confirm.
    row_width = 173

    data = np.asarray(data)
    n_edges = data.shape[0]
    if not 0 <= num_anom_edges <= n_edges:
        raise ValueError(
            f"num_anom_edges must be between 0 and {n_edges}, got {num_anom_edges}"
        )

    # Fourth column is the label: 0 for normal rows, 1 for the trailing anomalies.
    # An explicit split point (instead of negative slicing) keeps 0 anomalies valid.
    labels = np.zeros((n_edges, 1), dtype=int)
    labels[n_edges - num_anom_edges:] = 1
    edges = np.hstack((data, labels))

    # Shift timestamps so the earliest one is 0, then order the edges by time.
    if n_edges:
        edges[:, 2] -= np.min(edges[:, 2])
    edges = edges[np.argsort(edges[:, 2])]

    # Re-index node ids to 0..n_nodes-1 following their sorted order. Every id
    # is present in `node_ids`, so searchsorted returns its exact rank.
    node_ids = np.unique(edges[:, :2])
    edges[:, 0] = np.searchsorted(node_ids, edges[:, 0])
    edges[:, 1] = np.searchsorted(node_ids, edges[:, 1])

    n_nodes = len(node_ids)
    table = np.full((n_nodes + n_edges, row_width), -1.0)
    table[:n_nodes, 1:] = 1              # node rows: -1 then all ones
    table[n_nodes:, :3] = edges[:, :3]   # edge rows: src, dst, time then all -1

    data_final = pd.DataFrame(table)

    return data_final, np.flatnonzero(edges[:, 3] == 1).tolist()

In [5]:
def snaps_final_indices(data, data_nop, snap_indices, data_train):
    """Find where each snapshot boundary falls among the edge rows of ``data_nop``.

    Parameters
    ----------
    data : np.ndarray of shape (n_rows, 3)
        Edge rows whose third column is the timestamp.
    data_nop : pd.DataFrame or np.ndarray
        Table whose leading rows have ``-1`` in column 0 and whose remaining
        rows carry a (min-shifted) timestamp in column 2, as produced by
        ``nop_TADDY``.
    snap_indices : pd.DataFrame or array-like of int
        Snapshot offsets (a single column). Offset ``s`` refers to row
        ``len(data_train) + s - 1`` of ``data``.
    data_train : sized
        Only its length is used.

    Returns
    -------
    list
        For each snapshot, the index (counted from the first edge row of
        ``data_nop``) of the first edge whose timestamp equals the snapshot's
        shifted timestamp. Empty if ``snap_indices`` is empty.

    Raises
    ------
    IndexError
        If ``data_nop`` has no edge rows, or a snapshot timestamp is absent
        from it.
    """
    data = np.asarray(data)
    offsets = np.asarray(snap_indices).reshape(-1)
    if offsets.size == 0:
        return []

    # Shift by the global minimum so the values are comparable with column 2
    # of `data_nop`. np.min replaces the builtin min, which looped over rows.
    all_times = data[:, 2]
    snaps_times = all_times[len(data_train) + offsets - 1] - np.min(all_times)
    print("The last training instant for this dataset is: ", snaps_times[0])

    table = np.asarray(data_nop)
    # Skip the leading rows flagged with -1 in column 0; edges start after them.
    edge_starts = np.flatnonzero(table[:, 0] != -1)
    if edge_starts.size == 0:
        raise IndexError("data_nop contains no edge rows (column 0 is -1 everywhere)")
    edge_times = table[edge_starts[0]:, 2]

    final_indices = []
    for time in snaps_times:
        hits = np.flatnonzero(edge_times == time)
        if hits.size == 0:
            # Same exception type the bare `[0][0]` lookup used to raise.
            raise IndexError(
                f"snapshot time {time} not found among the edge timestamps of data_nop"
            )
        final_indices.append(hits[0])
    return final_indices

### Create data for TGN

In [6]:
def _load_matrix(csv_path):
    """Read one comma-separated export (first line = header) into a NumPy array."""
    return pd.read_csv(csv_path, sep=',').to_numpy()


# Files shared by all three anomaly rates.
data_train = _load_matrix(f"../TADDY/data/for_TGN/train_{data_set}_0.5_0.01.csv")
data_times = _load_matrix(f"../TADDY/data/for_TGN/{data_set}_times.csv")
vertexs = _load_matrix(f"../TADDY/data/for_TGN/{data_set}_vertexs.csv")

# One test split per anomaly rate (0.01, 0.05, 0.1).
data_test_001 = _load_matrix(f"../TADDY/data/for_TGN/test_{data_set}_0.5_0.01.csv")
data_test_005 = _load_matrix(f"../TADDY/data/for_TGN/test_{data_set}_0.5_0.05.csv")
data_test_01 = _load_matrix(f"../TADDY/data/for_TGN/test_{data_set}_0.5_0.1.csv")

# Snapshot indices are kept as DataFrames; they are only converted when used.
snap_indices_001 = pd.read_csv(f"../TADDY/data/for_TGN/{data_set}_0.5_0.01_snap_indices.csv")
snap_indices_005 = pd.read_csv(f"../TADDY/data/for_TGN/{data_set}_0.5_0.05_snap_indices.csv")
snap_indices_01 = pd.read_csv(f"../TADDY/data/for_TGN/{data_set}_0.5_0.1_snap_indices.csv")

In [7]:
# Row positions, inside each test split, whose third column (the label) equals 1.
idx_anom_001 = np.flatnonzero(data_test_001[:, 2] == 1)
idx_anom_005 = np.flatnonzero(data_test_005[:, 2] == 1)
idx_anom_01 = np.flatnonzero(data_test_01[:, 2] == 1)

In [8]:
def _anomaly_times(all_times, n_train, anom_pos):
    """Timestamp assigned to each anomalous test row.

    ``anom_pos[j] - j`` is the number of non-anomalous test rows that precede
    the j-th anomaly, so ``n_train + anom_pos[j] - j`` is the row of
    ``all_times`` belonging to the first non-anomalous row after it.

    Anomalies sitting after the last non-anomalous row would point one past
    the end of ``all_times``; they are clipped to the last timestamp. This
    replaces the former hand-written ``-= 1`` on the last two entries of the
    0.1 split, which only matched one particular generated dataset.
    NOTE(review): assumes `all_times` holds exactly one timestamp per
    non-anomalous row (train + test) -- confirm against the TADDY export.
    """
    source_rows = n_train + anom_pos - np.arange(len(anom_pos))
    return all_times[np.minimum(source_rows, len(all_times) - 1)]


def _times_with_anomalies(all_times, n_train, anom_pos, anom_times):
    """Interleave anomaly timestamps with the original ones, as an (n, 1) column.

    Rows ``[0, n_train)`` keep the training timestamps; test rows at
    ``n_train + anom_pos`` receive ``anom_times``; the remaining test rows
    receive the original test timestamps in order. A boolean mask marks the
    anomaly rows, so a genuine timestamp of 0 cannot be mistaken for an
    unfilled slot (the previous version filled every row still equal to 0).
    """
    merged = np.zeros((len(all_times) + len(anom_pos), 1))
    is_anom = np.zeros(len(merged), dtype=bool)
    is_anom[n_train + anom_pos] = True

    merged[:n_train] = all_times[:n_train]
    merged[is_anom] = anom_times

    is_normal_test = ~is_anom
    is_normal_test[:n_train] = False
    merged[is_normal_test] = all_times[n_train:]
    return merged


times_anom_001 = _anomaly_times(data_times, len(data_train), idx_anom_001)
times_anom_005 = _anomaly_times(data_times, len(data_train), idx_anom_005)
times_anom_01 = _anomaly_times(data_times, len(data_train), idx_anom_01)

data_times_with_anom_001 = _times_with_anomalies(data_times, len(data_train), idx_anom_001, times_anom_001)
data_times_with_anom_005 = _times_with_anomalies(data_times, len(data_train), idx_anom_005, times_anom_005)
data_times_with_anom_01 = _times_with_anomalies(data_times, len(data_train), idx_anom_01, times_anom_01)

# Final (first column, second column, time) rows: training rows followed by
# the first two columns of each test split, with the merged time column appended.
data_final_001 = np.hstack((np.vstack((data_train, data_test_001[:,0:2])), data_times_with_anom_001))
data_final_005 = np.hstack((np.vstack((data_train, data_test_005[:,0:2])), data_times_with_anom_005))
data_final_01 = np.hstack((np.vstack((data_train, data_test_01[:,0:2])), data_times_with_anom_01))

In [9]:
def _edges_plus_anomalies(test_split, anom_rows, anom_times):
    """Stack the anomalous rows of one test split below the raw data rows.

    The raw part keeps columns 0-1 and 3+ of `data`. The anomalous part takes
    the first two columns of the selected test rows, looks them up in
    `vertexs`, and appends `anom_times` as the last column.
    """
    raw_part = np.hstack((data[:, 0:2], data[:, 3:]))
    anom_endpoints = vertexs[test_split[anom_rows][:, 0:2]].reshape((len(anom_rows), 2))
    anom_part = np.hstack((anom_endpoints, anom_times))
    return np.vstack((raw_part, anom_part))


# Build the node+edge table and the anomaly positions for each anomaly rate.
data_final_001_nop, anom_edges_001_nop = nop_TADDY(
    _edges_plus_anomalies(data_test_001, idx_anom_001, times_anom_001), len(idx_anom_001))
data_final_005_nop, anom_edges_005_nop = nop_TADDY(
    _edges_plus_anomalies(data_test_005, idx_anom_005, times_anom_005), len(idx_anom_005))
data_final_01_nop, anom_edges_01_nop = nop_TADDY(
    _edges_plus_anomalies(data_test_01, idx_anom_01, times_anom_01), len(idx_anom_01))

# Positions of the anomalous edges, one CSV per rate.
Path('./data/anom_edges/').mkdir(parents=True, exist_ok=True)
pd.DataFrame(anom_edges_001_nop).to_csv(f'./data/anom_edges/{data_set}_TADDY_001_nop_anom_edges.csv', index=False)
pd.DataFrame(anom_edges_005_nop).to_csv(f'./data/anom_edges/{data_set}_TADDY_005_nop_anom_edges.csv', index=False)
pd.DataFrame(anom_edges_01_nop).to_csv(f'./data/anom_edges/{data_set}_TADDY_01_nop_anom_edges.csv', index=False)

# The tables themselves (nop_TADDY already returns DataFrames).
data_final_001_nop.to_csv(f'./data/{data_set}_TADDY_001_nop.csv', index=False)
data_final_005_nop.to_csv(f'./data/{data_set}_TADDY_005_nop.csv', index=False)
data_final_01_nop.to_csv(f'./data/{data_set}_TADDY_01_nop.csv', index=False)

# Indices of the test snapshots (for the TGN-formatted data)
ind_snaps_001 = snaps_final_indices(data_final_001, data_final_001_nop, snap_indices_001, data_train)
ind_snaps_005 = snaps_final_indices(data_final_005, data_final_005_nop, snap_indices_005, data_train)
ind_snaps_01 = snaps_final_indices(data_final_01, data_final_01_nop, snap_indices_01, data_train)

Path('./data/snaps/').mkdir(parents=True, exist_ok=True)
pd.DataFrame(ind_snaps_001).to_csv(f'./data/snaps/{data_set}_TADDY_001_nop_snaps.csv', index=False)
pd.DataFrame(ind_snaps_005).to_csv(f'./data/snaps/{data_set}_TADDY_005_nop_snaps.csv', index=False)
pd.DataFrame(ind_snaps_01).to_csv(f'./data/snaps/{data_set}_TADDY_01_nop_snaps.csv', index=False)

The last training instant for this dataset is:  57711600.0
The last training instant for this dataset is:  57711600.0
The last training instant for this dataset is:  57711600.0
