In [2]:
from ipwhois import IPWhois
import whois
import ipaddress
import pandas as pd
import numpy as np
import os 
import csv

def is_valid_ip(i):
    """Return True when ``i`` parses as an IP address and is not prefix-excluded.

    A value that ``ipaddress.ip_address`` rejects yields False. A value that
    parses but begins with one of the excluded prefixes listed below also
    yields False; any other parseable address yields True.
    """
    excluded_prefixes = ('192.168.', '224.', '239.', '169.254', 'multicast')
    try:
        ipaddress.ip_address(i)
    except ValueError:
        return False
    # str.startswith accepts a tuple and matches if any single prefix matches.
    return not i.startswith(excluded_prefixes)

def is_local(ip, mac_file='../devices.txt'):
    """Return True when ``ip`` is a local/multicast destination or a known device.

    The device list is re-read from ``mac_file`` on every call. Each usable
    line holds a MAC address, whitespace, then a device name; the device names
    form the set of known hosts that ``ip`` is compared against.

    Args:
        ip: Destination string to classify (an address or a host name).
        mac_file: Path of the device list. Defaults to ``'../devices.txt'``.

    Returns:
        True if ``ip`` starts with one of the hard-coded prefixes below or
        equals a device name listed in ``mac_file``; otherwise False.

    Raises:
        OSError: If ``mac_file`` cannot be opened.
    """
    local_prefixes = ('192.168.', '224.', '239.', '22:ef:03:1a:97:b9', 'multicast')

    mac_dic = {}
    with open(mac_file, 'r') as f:
        for line in f:
            # Whitespace splitting copes with a missing trailing newline and
            # repeated spaces; blank or one-field lines are skipped.
            fields = line.split(None, 1)
            if len(fields) != 2:
                continue
            tmp_mac = fields[0]
            tmp_device = fields[1].strip()
            if len(tmp_mac) != 17:
                # Left-pad short octets so the MAC has two digits per group.
                tmp_mac = ':'.join(part.zfill(2) for part in tmp_mac.split(':'))
            mac_dic[tmp_device] = tmp_mac

    if ip.startswith(local_prefixes):
        return True
    # The dictionary is keyed by device name, so this matches host names.
    return ip in mac_dic

def get_organization_ip(ip):
    """Return the organization names an RDAP lookup reports for an IP address.

    Runs ``IPWhois(ip).lookup_rdap()`` and, for every entry under the result's
    ``'objects'`` key, collects the contact's ``'organization'`` value or,
    failing that, its ``'name'``. An entry without a ``'contact'`` key
    contributes its own key instead. When the result has no ``'objects'`` key,
    the ``'entities'`` list is processed in the same way.

    Args:
        ip: IP address string to look up.

    Returns:
        A list of the distinct names found (possibly empty, in no particular
        order), or None when the result has neither ``'objects'`` nor
        ``'entities'``.
    """
    def contact_label(contact):
        # NOTE(review): the contact value appears to be allowed to be None or
        # to carry empty fields; both are treated as "no usable name" so that
        # callers never receive None inside the returned list.
        if not isinstance(contact, dict):
            return None
        return contact.get('organization') or contact.get('name')

    result = IPWhois(ip).lookup_rdap()
    organizations = set()

    if 'objects' in result:
        for handle, details in (result['objects'] or {}).items():
            if isinstance(details, dict) and 'contact' in details:
                label = contact_label(details['contact'])
                if label:
                    organizations.add(label)
            else:
                # No contact record at all: fall back to the object's key.
                organizations.add(handle)
        print(organizations)
        return list(organizations)

    if 'entities' in result:
        entities = result['entities'] or []
        print(entities)
        for entity in entities:
            if isinstance(entity, dict):
                label = contact_label(entity.get('contact'))
                if label:
                    organizations.add(label)
            elif isinstance(entity, str):
                # Plain string entries are kept verbatim.
                organizations.add(entity)
        return list(organizations)

    return None



def get_organization(domain):
    """Return the registrant organization(s) WHOIS reports for ``domain``.

    Returns a one-element list when the record's ``'org'`` field is a string,
    the field itself when it is a list, and None when the record is empty,
    has no ``'org'`` field, holds some other type there, or the list contains
    one of the privacy placeholder entries below.
    """
    record = whois.whois(domain)
    if not record or 'org' not in record:
        return None

    org_field = record['org']
    if isinstance(org_field, str):
        return [org_field]
    if not isinstance(org_field, list):
        return None

    # Membership is tested per list element, i.e. an exact match is required.
    placeholders = ('Data Protected', 'Domains By Proxy', 'REDACTED', 'Not Disclosed')
    if any(marker in org_field for marker in placeholders):
        return None
    return org_field


def read_input(file):
    """Load the ``hosts`` column of a CSV as a NumPy array.

    Missing values in the column are replaced by empty strings before the
    array is built.
    """
    frame = pd.read_csv(file)
    host_column = frame['hosts'].fillna('')
    return np.array(host_column.values)


def read_first_party(file):
    """Parse the per-device first-party list.

    Each non-blank line of ``file`` is split on whitespace: the first token is
    the device name and the remaining tokens are that device's first-party
    entries. Blank and whitespace-only lines are ignored. If a device appears
    more than once, the last line wins.

    Args:
        file: Path of the text file to read.

    Returns:
        A dict mapping each device name to its list of first-party entries
        (an empty list when the line holds only the device name).
    """
    device_first_party = {}

    with open(file, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Lines read from a file keep their newline, so an emptiness
                # test on the raw line would never catch a blank line.
                continue
            device, *first_party = tokens
            device_first_party[device] = first_party
    return device_first_party


def _classify_party(organizations, first_parties):
    """Choose the reported organization and party code from candidate names.

    ``organizations`` must be a non-empty list of strings. Returns
    ``(organization, party)`` where party is 1 if a first-party keyword is a
    substring of an organization name, 2 if a support-party keyword is, and 3
    otherwise (in which case the first organization is reported).
    """
    support_party_list = ['aws', 'cloudflare', 'amazon', 'org', 'neu.edu', 'aka', 'digicert']
    lowered = [(org, org.lower()) for org in organizations]

    # First-party keywords take precedence over support-party keywords, and
    # every keyword is always compared against the full organization names.
    for first_party in first_parties:
        keyword = first_party.strip().lower()
        if not keyword:
            # An empty keyword is a substring of everything; ignore it.
            continue
        for org, org_lower in lowered:
            if keyword in org_lower:
                return org, 1

    for org, org_lower in lowered:
        if any(s in org_lower for s in support_party_list):
            return org, 2

    return organizations[0], 3


def process_single(input_string, device_first_party):
    """Resolve a destination to an organization and classify its party type.

    IP addresses accepted by ``is_valid_ip`` are resolved with
    ``get_organization_ip``; anything else is reduced to its last two
    dot-separated labels and resolved with ``get_organization``. When no
    organization is found, two known domains map to Amazon and every other
    destination falls back to its own two-label domain string.

    Args:
        input_string: Destination IP address or host name.
        device_first_party: Iterable of first-party keywords for the device;
            matching is a case-insensitive substring test.

    Returns:
        A tuple ``(organization, party)``: ``organization`` is a single
        string and ``party`` is 1 (first-party keyword matched), 2
        (support-party keyword matched) or 3 (neither matched).
    """
    if is_valid_ip(input_string):
        organization = get_organization_ip(input_string)
    else:
        input_string = '.'.join(input_string.split('.')[-2:])
        organization = get_organization(input_string)

    # Normalise the lookup result to a list of non-empty strings so the
    # classification below never sees None or a bare string.
    if isinstance(organization, str):
        organization = [organization]
    organization = [org for org in (organization or []) if isinstance(org, str) and org]

    if not organization:
        if input_string in ['amcs-tachyon.com', 'cloudfront.net']:
            organization = ['Amazon Technologies, Inc.']
        else:
            organization = ['.'.join(input_string.split('.')[-2:])]

    return _classify_party(organization, device_first_party)

# Path of the first-party list that the cells below pass to read_first_party().
first_party_list = 'first_party_list.txt'


In [None]:
# read periodic domains from freq_period
# For every '<device>.txt' in the input directory, resolve each destination to
# an organization/party and write '<device>.csv' into out_dir.

device_first_party = read_first_party(first_party_list)

# NOTE(review): the last path component reads 'ingerprints'; it may be a typo
# for 'fingerprints' - confirm against the directory on disk before running.
idle_fingerprint_domain_dir = '../event_inference/period_detection/freq_period/ingerprints'
out_dir = 'periodic_data/destination_party'
# makedirs also creates the missing parent directory; os.mkdir would fail
# when 'periodic_data' does not exist yet.
os.makedirs(out_dir, exist_ok=True)

header = ['protocol', 'domain', 'period', 'org', 'party']
device_dic = {}
for dev_file in os.listdir(idle_fingerprint_domain_dir):
    if not dev_file.endswith('.txt'):
        continue
    device_name = dev_file.split('.')[0]

    print(device_name)
    device_dic[device_name] = dev_file
    tmp_output = []

    with open(os.path.join(idle_fingerprint_domain_dir, dev_file), 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 3:
                # Blank or incomplete line: nothing to classify.
                continue
            prot, domain, period = fields[:3]
            if prot == 'DHCP' or is_local(domain):
                continue
            organization, party = process_single(domain, device_first_party[device_name])
            tmp_output.append([prot, domain, period, organization, party])

    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(out_dir, '%s.csv' % device_name), 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(tmp_output)

In [None]:
# read activity fingerprints domains
# For every '<device>.txt' in fingerprint_dir, take each line starting with
# 'fingerprint', extract its activity label and its ';'-separated
# 'domain,protocol' entries, classify every entry not seen before in that
# file, and write '<device>.csv' into out_dir.
fingerprint_dir = '../event_inference/data/fingerprints'
out_dir = 'activity_data/destination_party'
# Create the directory tree from Python instead of shelling out to mkdir.
os.makedirs(out_dir, exist_ok=True)

header = ['protocol', 'domain', 'activity', 'org', 'party']
device_dic = {}
device_first_party = read_first_party(first_party_list)
for dev_file in os.listdir(fingerprint_dir):
    if not dev_file.endswith('.txt'):
        continue
    device_name = dev_file.split('.')[0]
    print(device_name)

    tmp_output = []
    # Entries already emitted for this device; an entry is recorded only for
    # the first activity in which it appears.
    tmp_hostname_set = set()

    with open(os.path.join(fingerprint_dir, dev_file), 'r') as f:
        for raw_line in f:
            # Strip only the line terminator; slicing off the last character
            # would damage a final line that has no trailing newline.
            line = raw_line.rstrip('\r\n')
            if not line.startswith('fingerprint'):
                continue
            activity = line.split(':')[0].split('- ')[1]
            tmp_hostname = line.split(': ')[1].split(';')

            for tmp in tmp_hostname:
                if tmp in tmp_hostname_set or tmp == '':
                    continue
                tmp_hostname_set.add(tmp)
                prot = tmp.split(',')[1]
                domain = tmp.split(',')[0]
                # Unlike the periodic cell, DHCP and local destinations are
                # not filtered out here.
                organization, party = process_single(domain, device_first_party[device_name])
                tmp_output.append([prot, domain, activity, organization, party])

    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(out_dir, '%s.csv' % device_name), 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(tmp_output)
            




In [5]:
# read aperiodic domains
# For every '<device>.csv' in aperiodic_dir, classify each row's 'Domain' and
# write the domain, its 'Flow' count, organization and party to out_dir.
# To process the parent directory instead, set aperiodic_dir = 'aperiodic_domains'.
aperiodic_dir = 'aperiodic_domains/routine'
out_dir = 'aperiodic_data/destination_party_routine'
# Create the directory tree from Python instead of shelling out to mkdir.
os.makedirs(out_dir, exist_ok=True)

header = ['domain', 'count', 'org', 'party']
device_dic = {}
device_first_party = read_first_party(first_party_list)
for dev_file in os.listdir(aperiodic_dir):
    if not dev_file.endswith('.csv'):
        continue
    device_name = dev_file.split('.')[0]

    print(device_name)

    tmp_output = []
    data = pd.read_csv(os.path.join(aperiodic_dir, dev_file))

    # Iterating the two columns directly keeps each column's own dtype.
    for domain, count in zip(data['Domain'], data['Flow']):
        if not isinstance(domain, str):
            # Empty cells are read as NaN (a float), which cannot be resolved.
            continue
        organization, party = process_single(domain, device_first_party[device_name])
        tmp_output.append([domain, count, organization, party])

    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(out_dir, '%s.csv' % device_name), 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(tmp_output)
            




switchbot-hub
amazon-plug
ring-camera
meross-dooropener
govee-led1
gosund-bulb1
echospot
tplink-plug
bulb1
magichome-strip
nest-tstat
tplink-bulb
wyze-cam
smartlife-bulb
dlink-camera
ikettle
ring-doorbell
t-wemo-plug


In [None]:
import pandas as pd
import numpy as np
# Column names of the feature CSV, in order (not referenced again in this cell).
cols_feat = [
    "meanBytes", "minBytes", "maxBytes", "medAbsDev", "skewLength", "kurtosisLength",
    "meanTBP", "varTBP", "medianTBP", "kurtosisTBP", "skewTBP",
    "network_total", "network_in", "network_out", "network_external", "network_local",
    "network_in_local", "network_out_local",
    "meanBytes_out_external", "meanBytes_in_external", "meanBytes_out_local", "meanBytes_in_local",
    "device", "state", "event", "start_time",
    "remote_ip", "remote_port", "trans_protocol", "raw_protocol", "protocol", "hosts",
]

test_file = '/home/ubuntu/Behaviot/event_inference/imc_data/idle-2021-features/meross-dooropener.csv'

test_data = pd.read_csv(test_file)

# One array per flow-descriptor column, with missing values replaced by ''.
hosts, protocol, remote_ip, remote_port, trans_protocol, raw_protocol = (
    np.array(test_data[column].fillna('').values)
    for column in ("hosts", "protocol", "remote_ip", "remote_port", "trans_protocol", "raw_protocol")
)

# Row counts per (hosts, raw_protocol, remote_port) and per hosts value.
filtered_df = test_data[["remote_ip", "remote_port", "trans_protocol", "raw_protocol", "protocol", "hosts"]]
grouped_df = (
    filtered_df
    .groupby(["hosts", "raw_protocol", 'remote_port'])
    .size()
    .reset_index(name="count")
)
grouped_host = (
    filtered_df
    .groupby(["hosts"])
    .size()
    .reset_index(name="count")
)
print(grouped_host)
