In [1]:
import pandas as pd 
import numpy as np
from collections import defaultdict
from clickhouse_driver import Client
from pathlib import Path
import os.path
# Table the rest of the notebook works on. The functions below read its
# 'src_ip' column and attach a 'label' column to it.
df = pd.read_csv("3_weeks.csv")

# ClickHouse connection. Every setting can be overridden through an environment
# variable so credentials do not have to be written into the notebook; when a
# variable is unset the original literal value is used, so behaviour is
# unchanged by default.
client = Client(
    host=os.environ.get("CLICKHOUSE_HOST", ''),
    database=os.environ.get("CLICKHOUSE_DATABASE", "february2024"),
    user=os.environ.get("CLICKHOUSE_USER", ''),
    password=os.environ.get("CLICKHOUSE_PASSWORD", ''),
    client_name="python-driver from avisoiu",
)

# Raise the server-side query size limit; presumably needed because
# get_overlaps() inlines whole IP lists into the query text -- TODO confirm.
client.execute("SET max_query_size = 10000000000")

def get_overlaps(ip_list):
    """Measure how much a set of source IPs overlap on the destinations they hit.

    Runs one ClickHouse query over ``tcppackets`` restricted to packets whose
    ``SrcIP`` is in ``ip_list``:

    * ``IpTraffic`` pairs every packet with the previous packet (by timestamp)
      that hit the same ``(DstIP, DstPort)`` on the same calendar day, and
      computes the gap in seconds between the two. A packet with no
      predecessor gets the sentinel gap 11000, which is above the 10800 s
      (3 h) window, so it can never count as an overlap.
    * ``OverlapCounter`` keeps, per ``(dst_ip, dst_port, date)``, the number of
      distinct source IPs whose packet arrived less than 10800 s after a
      packet from a *different* source IP, and retains only targets where
      that number is greater than 1.
    * The final SELECT returns the number of distinct destination IPs hit,
      the number of distinct overlapping destination IPs divided by the
      number of distinct days with overlaps (NaN mapped to 0), and the mean
      ``overlap_count`` (NaN mapped to 0).

    Args:
        ip_list: iterable of source IPs to analyse together.

    Returns:
        ``(distinct_dst_ips, overlaps)`` taken from the first two columns of
        the single result row. The third column (``avg_overlap``) is computed
        by the query but not returned. For an empty ``ip_list`` the result is
        ``(0, 0)`` and no query is issued.
    """
    ips = tuple(ip_list)
    if not ips:
        # An empty tuple would be rendered as "SrcIP IN ()"; nothing can match
        # anyway, so skip the round trip and report zero coverage / overlap.
        return (0, 0)

    result = client.execute("""
        WITH IpTraffic AS (
            SELECT SrcIP as src_ip,
                   DstIP as dst_ip,
                   DstPort as dst_port,
                   Timestamp as time,
                   any(Timestamp) OVER (PARTITION BY DstIP, DstPort, toDate(Timestamp) ORDER BY Timestamp ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) as time_prev_hit,
                   any(SrcIP) OVER (PARTITION BY DstIP, DstPort, toDate(Timestamp) ORDER BY Timestamp ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) as ip_prev_hit,
                   if(toDate(time_prev_hit) = '1970-01-01', 11000, age('s', time_prev_hit, time)) as time_diff -- 10800s = 3hrs
            FROM tcppackets
            WHERE SrcIP IN %(ips)s
        ), OverlapCounter AS (
            SELECT dst_ip, dst_port, toDate(time) as date, COUNT(DISTINCT src_ip) as overlap_count
            FROM IpTraffic
            WHERE time_diff < 10800 AND ip_prev_hit != src_ip
            GROUP BY dst_ip, dst_port, date
            HAVING COUNT(DISTINCT src_ip) > 1)
        SELECT a.distinct_dest_ips as distinct_dest_ips,
               if(isNaN(b.overlaps), 0, b.overlaps) as overlaps,
               if(isNaN(b.avg_overlap), 0, b.avg_overlap) as avg_overlap
        FROM (SELECT COUNT(DISTINCT dst_ip) AS distinct_dest_ips from IpTraffic) as a, (SELECT COUNT(DISTINCT dst_ip)/COUNT(DISTINCT date) as overlaps, avg(overlap_count) as avg_overlap FROM OverlapCounter) as b
        """,
        {'ips': ips})

    # Both sub-selects are aggregates without GROUP BY, so the cross join
    # yields one row: (distinct_dest_ips, overlaps, avg_overlap).
    distinct_dst_ips, overlaps = result[0][0], result[0][1]
    return (distinct_dst_ips, overlaps)

def extract_no_overlaps(group, group_df):
    """Greedily split one group's source IPs into overlap-free sub-groups.

    Walking the IPs in order, each still-unassigned IP seeds a new sub-group.
    Every later unassigned IP is tentatively added and kept only when
    ``get_overlaps`` reports zero overlaps for the enlarged sub-group.

    Args:
        group: identifier of the group being split; only used in the log line.
        group_df: object whose ``'src_ip'`` column (anything with ``.tolist()``)
            lists the group's source IPs.

    Returns:
        ``(new_groups, noise)`` where ``new_groups`` is a list of
        ``(ip_list, coverage)`` tuples for sub-groups of two or more IPs whose
        coverage (first value returned by ``get_overlaps`` for the final
        membership) is at least 32000, and ``noise`` lists the seed IPs that
        ended up alone. Sub-groups below the coverage threshold are reported
        on stdout and dropped; their IPs appear in neither list.
    """
    candidate_ips = group_df['src_ip'].tolist()
    print(f"Group {group}. IPs: {candidate_ips}")

    total = len(candidate_ips)
    assigned = [False] * total
    accepted_groups = []
    noise_ips = []

    for seed_pos, seed_ip in enumerate(candidate_ips):
        if assigned[seed_pos]:
            continue
        assigned[seed_pos] = True

        members = [seed_ip]
        coverage = 0  # hit count reported for the last accepted membership
        for other_pos in range(seed_pos + 1, total):
            if assigned[other_pos]:
                continue
            trial = members + [candidate_ips[other_pos]]
            hit_ips, overlaps = get_overlaps(trial)
            if overlaps > 0:
                # Candidate collides with the current members; leave it for a
                # later seed.
                continue
            members = trial
            coverage = hit_ips
            assigned[other_pos] = True

        if len(members) == 1:
            noise_ips.append(seed_ip)
        elif coverage < 32000:
            print("Found group, but coverage is not enough.")
        else:
            accepted_groups.append((members, coverage))
            print(f"Found group with {coverage} coverage")

    return (accepted_groups, noise_ips)

def process_overlap(method_name):
    """Split every overlapping group of a method into overlap-free sub-groups.

    Reads ``results/{method_name}/labels.csv`` (column ``'0'``) and
    ``analysis/{method_name}/heavy_hitters_stats.csv`` (columns ``'overlaps'``
    and ``'group'``), then, for each group with ``overlaps > 0`` in descending
    order of overlaps, runs ``extract_no_overlaps`` and writes under
    ``analysis/{method_name}/post/``:

    * ``group_{group}_{index}_{coverage}.csv`` for every accepted sub-group;
    * ``group_{group}_noise.csv`` for the IPs left on their own.

    The noise file is written last and doubles as the "done" marker: a group
    whose noise file already exists is skipped, which makes the function
    resumable after an interruption.

    Side effects: sets ``df['label']`` on the module-level ``df``, creates the
    output directory if needed, writes CSV files and prints progress.

    Args:
        method_name: name of the sub-directory under ``results/`` and
            ``analysis/`` to process.
    """
    labels = pd.read_csv(f"results/{method_name}/labels.csv")
    # Intentionally mutates the shared frame, as the original did.
    df['label'] = labels['0']
    heavy_hitters = pd.read_csv(f"analysis/{method_name}/heavy_hitters_stats.csv")
    overlaps = heavy_hitters[heavy_hitters['overlaps'] > 0].sort_values(by="overlaps", ascending=False)
    group_ids = overlaps['group'].unique().tolist()

    # Make sure the output directory exists before any expensive query runs,
    # so results cannot be lost to a missing-directory error in to_csv().
    post_dir = Path(f"analysis/{method_name}/post")
    post_dir.mkdir(parents=True, exist_ok=True)

    new_partial_covers = 0
    new_noise = 0
    for group in group_ids:
        noise_path = post_dir / f"group_{group}_noise.csv"
        # Plain existence check instead of parsing the CSV inside a bare
        # `except:`; that pattern hid every error (including
        # KeyboardInterrupt) and chained confusing tracebacks onto real
        # failures in the processing below.
        if noise_path.exists():
            print(f"Group {group} already processed.")
            continue

        group_df = df[df['label'] == group]
        groups, ns = extract_no_overlaps(group, group_df)
        print(f"Groups extracted: {groups}")
        print(f"Number of groups: {len(groups)}")
        new_partial_covers += len(groups)
        new_noise += len(ns)
        for index, (members, coverage) in enumerate(groups):
            df[df['src_ip'].isin(members)].to_csv(post_dir / f"group_{group}_{index}_{coverage}.csv")
        # Written last: its presence marks the group as fully processed.
        df[df['src_ip'].isin(ns)].to_csv(noise_path)
        print(f"Total new partial covers found until now: {new_partial_covers}")
        print(f"Total new noise: {new_noise}")
        


In [2]:
# Post-process the method whose inputs live under results/hdb_fs2/ and
# analysis/hdb_fs2/; groups that already have a noise file are skipped.
process_overlap("hdb_fs2")

Group 3175 already processed.
Group 3181 already processed.
Group 3025 already processed.
Group 3182 already processed.
Group 38 already processed.
Group 86 already processed.
Group 3584 already processed.
Group 204 already processed.
Group 119 already processed.
Group 2797 already processed.
Group 2800 already processed.
Group 3561 already processed.
Group 2677 already processed.
Group 536 already processed.
Group 126 already processed.
Group 21031 already processed.
Group 596 already processed.
Group 918 already processed.
Group 202 already processed.
Group 551 already processed.
Group 595 already processed.
Group 206 already processed.
Group 205 already processed.
Group 1738 already processed.
Group 486 already processed.
Group 981 already processed.
Group 3006 already processed.
Group 21132 already processed.
Group 1829 already processed.
Group 944 already processed.
Group 42031 already processed.
Group 3372 already processed.
Group 933 already processed.
Group 3008 already process