## Unión datasets monitores + generación fingerprint

### Carga y unión de datasets

In [24]:
# Cargamos los datasets procesados de los monitores
import pandas as pd

dir = '../0_Datasets/'

# Cargamos los datasets de los monitores (ES, DE, IE, SE, PL, IT)
df_1_ES = pd.read_csv(dir + 'Clean-Datasets/learnData_monit1_ES_clean.csv')
df_2_DE = pd.read_csv(dir + 'Clean-Datasets/learnData_monit2_DE_clean.csv')
df_3_IE = pd.read_csv(dir + 'Clean-Datasets/learnData_monit3_IE_clean.csv')
df_4_SE = pd.read_csv(dir + 'Clean-Datasets/learnData_monit4_SE_clean.csv')
df_5_PL = pd.read_csv(dir + 'Clean-Datasets/learnData_monit5_PL_clean.csv')
df_6_IT = pd.read_csv(dir + 'Clean-Datasets/learnData_monit6_IT_clean.csv')

# Tamaño de los datasets
print('Tamaño de los datasets:')
print('ES:', df_1_ES.shape)
print('DE:', df_2_DE.shape)
print('IE:', df_3_IE.shape)
print('SE:', df_4_SE.shape)
print('PL:', df_5_PL.shape)
print('IT:', df_6_IT.shape)

Tamaño de los datasets:
ES: (127501, 16)
DE: (128641, 16)
IE: (128524, 16)
SE: (127198, 16)
PL: (126663, 16)
IT: (126657, 16)


In [25]:
# Juntamos los datasets de los monitores
df = pd.concat([df_1_ES, df_2_DE, df_3_IE, df_4_SE, df_5_PL, df_6_IT], ignore_index=True)

# Tamaño del dataset final
print('Tamaño del dataset final:', df.shape)
df.head()

Tamaño del dataset final: (765184, 16)


Unnamed: 0,measure_id,monitor_id,landmark_id,landmark_type,dst_ip,ping_rtt1,ping_rtt2,ping_rtt3,ping_rtt4,ping_rtt5,ping_rtt_avg,ping_rtt_std,direct_distance_kms,init_time,4h_time_slot,6h_time_slot
0,95.217.19.69_2024-11-27 13:31:04,monitor_1_Madrid_ES,9404,ripe_anchor,95.217.19.69,58.439,58.898,58.747,58.634,58.173,58.5782,0.25197,2963.40386,2024-11-27 13:31:04,12,12
1,45.13.104.153_2024-11-27 13:31:11,monitor_1_Madrid_ES,9042,ripe_probe,45.13.104.153,31.799,31.801,31.797,31.882,31.886,31.833,0.04168,912.535931,2024-11-27 13:31:11,12,12
2,212.216.126.87_2024-11-27 13:32:12,monitor_1_Madrid_ES,11265,ripe_probe,212.216.126.87,39.699,39.044,39.275,39.292,38.981,39.2582,0.25237,1341.885074,2024-11-27 13:32:12,12,12
3,31.21.219.185_2024-11-27 13:32:28,monitor_1_Madrid_ES,13743,ripe_probe,31.21.219.185,38.281,38.71,38.269,38.244,38.24,38.3488,0.18125,1413.980155,2024-11-27 13:32:28,12,12
4,92.117.103.72_2024-11-27 13:33:06,monitor_1_Madrid_ES,19667,ripe_probe,92.117.103.72,41.44,-1.0,-1.0,-1.0,-1.0,41.44,16.976,1410.985506,2024-11-27 13:33:06,12,12


In [26]:
# Nos quedamos con las entradas que tengan repetido monitor_id 6 veces
print(f'Número de entradas antes de filtrar: {df.shape[0]}')
df_6 = df[df['measure_id'].map(df['measure_id'].value_counts()) == 6]
print(f'Número de entradas después de filtrar: {df_6.shape[0]}')

Número de entradas antes de filtrar: 765184
Número de entradas después de filtrar: 697056


### Generación dataset fingerprint

In [27]:
# Cargamos dataset ripe Anchor/Probes para extraer info geo
df_ripe = pd.read_csv(dir + 'Learning-Datasets/new_anchors_probes_europe_learning.csv')

# Imprimimos la cabecera del dataset
print('Tamaño del dataset ripe:', df_ripe.shape)
df_ripe.head()

Tamaño del dataset ripe: (6482, 6)


Unnamed: 0,id,ip_v4,country,latitude,longitude,type
0,278,213.225.160.239,FR,48.583148,7.747882,anchor
1,282,145.220.0.55,NL,52.370216,4.895168,anchor
2,291,192.65.184.54,FR,46.232473,6.045897,anchor
3,303,213.190.96.41,IS,64.139664,-21.955246,anchor
4,314,185.42.136.158,SE,59.32893,18.06491,anchor


In [28]:
# Comprobamos que no haya duplicados en el dataset ripe
print('Número de entradas antes de filtrar:', df_ripe.shape[0])
df_ripe = df_ripe.drop_duplicates(subset=['ip_v4'])
print('Número de entradas después de filtrar:', df_ripe.shape[0])

Número de entradas antes de filtrar: 6482
Número de entradas después de filtrar: 6314


In [29]:
# Renombramos la columna ip_v4 a dst_ip para poder hacer el merge
df_ripe.rename(columns={'ip_v4': 'dst_ip'}, inplace=True)

In [30]:
# Hacemos el merge de los datasets
print('Tamaño del dataset final antes del merge:', df_6.shape)
df_final = pd.merge(df_6, df_ripe[['dst_ip', 'latitude', 'longitude', 'country']], on='dst_ip', how='left')
print('Tamaño del dataset final después del merge:', df_final.shape)

Tamaño del dataset final antes del merge: (697056, 16)
Tamaño del dataset final después del merge: (697056, 19)


In [31]:
# Codificamos la columna monitor_id
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df_final['monitor_id'] = le.fit_transform(df_final['monitor_id'])
print(f'Clases de la columna monitor_id: {le.classes_}')
df_final.head()

Clases de la columna monitor_id: ['monitor_1_Madrid_ES' 'monitor_2_Frankfurt am Main_DE'
 'monitor_3_Dublin_IE' 'monitor_4_Gävle_SE' 'monitor_5_Warsaw_PL'
 'monitor_6_Milan_IT']


Unnamed: 0,measure_id,monitor_id,landmark_id,landmark_type,dst_ip,ping_rtt1,ping_rtt2,ping_rtt3,ping_rtt4,ping_rtt5,ping_rtt_avg,ping_rtt_std,direct_distance_kms,init_time,4h_time_slot,6h_time_slot,latitude,longitude,country
0,95.217.19.69_2024-11-27 13:31:04,0,9404,ripe_anchor,95.217.19.69,58.439,58.898,58.747,58.634,58.173,58.5782,0.25197,2963.40386,2024-11-27 13:31:04,12,12,60.34167,25.027666,FI
1,45.13.104.153_2024-11-27 13:31:11,0,9042,ripe_probe,45.13.104.153,31.799,31.801,31.797,31.882,31.886,31.833,0.04168,912.535931,2024-11-27 13:31:11,12,12,45.7615,4.8395,FR
2,212.216.126.87_2024-11-27 13:32:12,0,11265,ripe_probe,212.216.126.87,39.699,39.044,39.275,39.292,38.981,39.2582,0.25237,1341.885074,2024-11-27 13:32:12,12,12,44.8215,11.5995,IT
3,31.21.219.185_2024-11-27 13:32:28,0,13743,ripe_probe,31.21.219.185,38.281,38.71,38.269,38.244,38.24,38.3488,0.18125,1413.980155,2024-11-27 13:32:28,12,12,51.4505,5.4505,NL
4,194.1.149.210_2024-11-27 13:33:37,0,9117,ripe_anchor,194.1.149.210,47.252,47.505,47.279,47.019,47.005,47.212,0.18549,1974.085485,2024-11-27 13:33:37,12,12,41.324873,19.816192,AL


In [32]:
# Generación del dataset fingerprint
# Por cada measure_id, generamos el dataset fingerprint
from tqdm import tqdm

# Creamos un dataframe vacío con las columnas que vamos a necesitar
fingerprint = pd.DataFrame(columns=[
    'measure_id', 
    'landmark_id', 
    'dst_ip', 
    'init_time', 
    'latency_m1', 
    'latency_m2', 
    'latency_m3', 
    'latency_m4', 
    'latency_m5', 
    'latency_m6', 
    'latitude', 
    'longitude', 
    'gps_coord', 
    'country_code',
    '4h_time_slot',
    '6h_time_slot'
])

# Obtenemos los measure_id únicos
measure_ids = df_final['measure_id'].unique()

# Iteramos sobre los measure_id
for measure_id in tqdm(measure_ids, desc="Generando fingerprint"):
    # Obtenemos las entradas asociadas a ese measure_id
    measure_data = df_final[df_final['measure_id'] == measure_id]
    # print(measure_data)

    # Obtenemos las latencias además del id del monitor
    latencies = measure_data[['monitor_id', 'ping_rtt_avg']].values
    # print(latencies)
    
    # Obtenemos las coordenadas GPS
    gps = measure_data[['latitude', 'longitude']].values[0]
    latitudes = measure_data['latitude'].values[0]
    longitudes = measure_data['longitude'].values[0]

    # Obtenemos el country_code
    country_code = measure_data['country'].values[0]
    
    # Obtenemos el landmark_id, dst_ip y init_time
    landmark_id = measure_data['landmark_id'].values[0]
    dst_ip = measure_data['dst_ip'].values[0]
    init_time = measure_data['init_time'].values[0]

    # Obtenemos las franjas horarias
    time_slot_4h = measure_data['4h_time_slot'].values[0]
    time_slot_6h = measure_data['6h_time_slot'].values[0]
    
    # Añadimos una nueva entrada al dataframe fingerprint
    fingerprint_id = pd.DataFrame(
        {
            'measure_id': [measure_id],
            'landmark_id': [landmark_id],
            'dst_ip': [dst_ip],
            'init_time': [init_time],
            '4h_time_slot': [time_slot_4h],
            '6h_time_slot': [time_slot_6h],
            'latency_m1': [latencies[latencies[:, 0] == 0, 1][0]],
            'latency_m2': [latencies[latencies[:, 0] == 1, 1][0]],
            'latency_m3': [latencies[latencies[:, 0] == 2, 1][0]],
            'latency_m4': [latencies[latencies[:, 0] == 3, 1][0]],
            'latency_m5': [latencies[latencies[:, 0] == 4, 1][0]],
            'latency_m6': [latencies[latencies[:, 0] == 5, 1][0]],
            'latitude': [latitudes],
            'longitude': [longitudes],
            'gps_coord': [gps],
            'country_code': [country_code]
        },
        index=[0]
    )
    fingerprint = pd.concat([fingerprint, fingerprint_id], ignore_index=True)

# Mostramos las primeras filas del dataframe fingerprint
print(f'Size of fingerprint: {fingerprint.shape}')
fingerprint.head()

  fingerprint = pd.concat([fingerprint, fingerprint_id], ignore_index=True)
Generando fingerprint: 100%|██████████| 116176/116176 [1:22:40<00:00, 23.42it/s]

Size of fingerprint: (116176, 16)





Unnamed: 0,measure_id,landmark_id,dst_ip,init_time,latency_m1,latency_m2,latency_m3,latency_m4,latency_m5,latency_m6,latitude,longitude,gps_coord,country_code,4h_time_slot,6h_time_slot
0,95.217.19.69_2024-11-27 13:31:04,9404,95.217.19.69,2024-11-27 13:31:04,58.5782,29.9146,44.508,30.9156,43.3216,44.2758,60.34167,25.027666,"[60.3416703, 25.0276663]",FI,12,12
1,45.13.104.153_2024-11-27 13:31:11,9042,45.13.104.153,2024-11-27 13:31:11,31.833,24.006,31.605,40.16,41.6678,33.8332,45.7615,4.8395,"[45.7615, 4.8395]",FR,12,12
2,212.216.126.87_2024-11-27 13:32:12,11265,212.216.126.87,2024-11-27 13:32:12,39.2582,24.6598,40.5294,45.7428,38.677,37.7004,44.8215,11.5995,"[44.8215, 11.5995]",IT,12,12
3,31.21.219.185_2024-11-27 13:32:28,13743,31.21.219.185,2024-11-27 13:32:28,38.3488,16.2102,24.1406,33.097,33.7898,27.4372,51.4505,5.4505,"[51.4505, 5.4505]",NL,12,12
4,194.1.149.210_2024-11-27 13:33:37,9117,194.1.149.210,2024-11-27 13:33:37,47.212,29.2842,50.24,56.8978,62.3142,54.1528,41.324873,19.816192,"[41.324873, 19.816192]",AL,12,12


In [33]:
# Guardamos el dataset fingerprint
fingerprint.to_csv(dir + 'Fingerprint-Datasets/fingerprint.csv', index=False)