In [25]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as stats
import matplotlib.pyplot as plt
import matplotlib as mpl

import spamhaus

# Data loading
In the next few cells we load in the data we will need for the research.
For the Spamhaus ZEN database we don't have an actual database file, but 
we have an url that we can that we can query for responses on IPs

In [43]:
iot_honeypot_files = [ "./data/cmds_sequence_2016-07-01.csv", "./data/cmds_sequence_2016-07-02.csv", "./data/cmds_sequence_2016-07-03.csv", "./data/cmds_sequence_2016-07-04.csv", 
    "./data/cmds_sequence_2016-07-05.csv", "./data/cmds_sequence_2016-07-06.csv", "./data/cmds_sequence_2016-07-07.csv", "./data/cmds_sequence_2016-07-08.csv", 
    "./data/cmds_sequence_2016-07-09.csv", "./data/cmds_sequence_2016-07-10.csv", "./data/cmds_sequence_2016-07-11.csv", "./data/cmds_sequence_2016-07-12.csv", 
    "./data/cmds_sequence_2016-07-13.csv", "./data/cmds_sequence_2016-07-14.csv", "./data/cmds_sequence_2016-07-15.csv", "./data/cmds_sequence_2016-07-16.csv", 
    "./data/cmds_sequence_2016-07-17.csv", "./data/cmds_sequence_2016-07-18.csv", "./data/cmds_sequence_2016-07-19.csv", "./data/cmds_sequence_2016-07-20.csv", 
    "./data/cmds_sequence_2016-07-21.csv", "./data/cmds_sequence_2016-07-22.csv", "./data/cmds_sequence_2016-07-23.csv", "./data/cmds_sequence_2016-07-24.csv", 
    "./data/cmds_sequence_2016-07-25.csv", "./data/cmds_sequence_2016-07-26.csv", "./data/cmds_sequence_2016-07-27.csv", "./data/cmds_sequence_2016-07-28.csv", 
    "./data/cmds_sequence_2016-07-29.csv", "./data/cmds_sequence_2016-07-30.csv", "./data/cmds_sequence_2016-07-31.csv", "./data/cmds_sequence_2016-08-29.csv", 
    "./data/cmds_sequence_2016-08-30.csv", "./data/cmds_sequence_2016-08-31.csv", "./data/cmds_sequence_2016-09-01.csv", "./data/cmds_sequence_2016-09-02.csv", 
    "./data/cmds_sequence_2016-09-03.csv", "./data/cmds_sequence_2016-09-04.csv", "./data/cmds_sequence_2016-09-05.csv", "./data/cmds_sequence_2016-09-06.csv", 
    "./data/cmds_sequence_2016-09-07.csv", "./data/cmds_sequence_2016-09-08.csv", "./data/cmds_sequence_2016-09-09.csv", "./data/cmds_sequence_2016-09-10.csv", 
    "./data/cmds_sequence_2016-09-11.csv", "./data/cmds_sequence_2016-09-12.csv", "./data/cmds_sequence_2016-09-13.csv"
]

sp = spamhaus.SpamhausChecker()
spamhaus_cache = dict()

phishtank_file = "./data/phishtank-verified.csv"
malwaredomainlist_file = "./data/mdl_blacklist.txt"
malc0de_file = "./data/malc0de_blacklist.txt"

def IP_to_int(IP: str) -> int:
    parts = IP.split(".")
    assert len(parts) == 4
    return (int(parts[0]) << 24) + (int(parts[1]) << 16) + (int(parts[2]) << 8) + (int(parts[3]) << 0)

def int_to_IP(IP: int) -> str:
    part_0 = IP >> 24 & 0xff
    part_1 = IP >> 16 & 0xff
    part_2 = IP >>  8 & 0xff
    part_3 = IP >>  0 & 0xff
    return f"{part_0}.{part_1}.{part_2}.{part_3}"

def matching_values(a: pd.Series, b: pd.Series) -> int:
    values_a = set(a)
    values_b = set(b)
    return len(values_a.intersection(values_b))

def read_iot_honeypot(path: str) -> pd.Series:
    df = pd.read_csv(path, 
                   sep="\$\$",
                   engine="python",
                   header=None, 
                   names=["Timestamp", "Src IP", "Src Port", "Dest IP", "Dest Port", "Commandlist"], 
                   index_col=False)
    IPs = df['Src IP'].apply(IP_to_int)
    return IPs

def spamhaus_single(IP: int) -> dict:
    if IP not in spamhaus_cache:
        spamhaus_cache[IP] = sp.check_status(int_to_IP(IP))
    return spamhaus_cache[IP]
        

def check_spamhaus_ZEN(IPs: [int]) -> list:
    # Filter out records where status = 0, or the issue is that the IP is dynamic and you
    # Should not send mail to it.
    return list(filter(lambda x: x['status'] == '1' and x['assessment'] != 'Dynamic IP', map(spamhaus_single, IPs)))

def read_phishtank() -> pd.Series:
    df = pd.read_csv(phishtank_file, 
                   header=0, 
                   index_col=False)
    df['ip'] = df['ip'].apply(IP_to_int)
    df['submission_time'] = df['submission_time'].apply(pd.Timestamp)
    
    t = pd.Timestamp("2016-06-01T00:00:00+00:00")
    return df[df['submission_time'] > t]['ip']

def read_malc0de() -> pd.Series:
    df_malc0de = pd.read_csv(malc0de_file, header=None, names=['ip'], index_col=False)
    df_malc0de['ip'] = df_malc0de['ip'].apply(IP_to_int)
    return df_malc0de['ip']

def read_malwaredomainlist() -> pd.Series:
    df_mdl = pd.read_csv(malwaredomainlist_file, header=None, names=['ip'], index_col=False)
    df_mdl['ip'] = df_mdl['ip'].apply(IP_to_int)
    return df_mdl['ip']
    

In [3]:
hp_series = read_iot_honeypot(iot_honeypot_files[0])
print(f"Entries in {iot_honeypot_files[0]}:", len(hp_series))
hp_series.head()

Entries in ./data/cmds_sequence_2016-07-01.csv: 54857


0    1940982743
1    3699832094
2    3662531234
3     244448040
4    3717874886
Name: Src IP, dtype: int64

In [4]:
check_spamhaus_ZEN(hp_series.head().to_list())

# sp.check_status("109.237.214.112")


[{'status': '1',
  'response_code': 'IP ranges which should not be delivering unauthenticated SMTP email',
  'assessment': 'Dynamic IP',
  'url': 'http://www.spamhaus.org/query/bl?ip=218.77.202.162'},
 {'status': '1',
  'response_code': 'IP ranges which should not be delivering unauthenticated SMTP email',
  'assessment': 'Dynamic IP',
  'url': 'http://www.spamhaus.org/query/bl?ip=14.145.251.40'}]

In [5]:
phish_series = read_phishtank()
print("Entries in phishtank db:", len(phish_series))
phish_series.head()

Entries in phishtank db: 503869


0     400837049
1    1160868646
2    1160868646
3    2809291215
4    2809291215
Name: ip, dtype: int64

In [44]:
malc0de_series = read_malc0de()
print("Entries in malc0de db:", len(malc0de_series))

mdl_series = read_malwaredomainlist()
print("Entries in malwaredomainlist db:", len(mdl_series))

malc0de_mdl_series = malc0de_series.append(mdl_series, ignore_index=True)
print("Entries in malc0de + malwaredomainlist db:", len(malc0de_mdl_series))
malc0de_mdl_series.head()

Entries in malc0de db: 2345
Entries in malwaredomainlist db: 997
Entries in malc0de + malwaredomainlist db: 3342


0    3237931954
1    1581880140
2    3340646881
3    3233389945
4    3571709938
Name: ip, dtype: int64

# Statistics
Now our data is loaded in nicely, we can start calculating our statistics. To account for IP turnover in our IoT honeypot data set, we look at the percentage of IPs used for other crimes on a daily basis, instead of looking
for the whole dataset at once.

To calculate our statistics, we use the ... test.

In [13]:
print("===================================================================")
print("")

# Loop through all honeypot files
for hp_fp in iot_honeypot_files:
    # Read the honeypot file, and filter out recurring IPs
    print(f"{hp_fp}:")
    hp_series = read_iot_honeypot(hp_fp).unique()
    
    # SPAM
    ZEN_results = check_spamhaus_ZEN(list(hp_series))
    print("\tRaw number of IPs in Spamhaus ZEN:", len(ZEN_results))
    print("\tpercentage of IPs in Spamhaus ZEN: {:.3f}%".format(100 * len(ZEN_results) / len(hp_series)))
    
    # Phishing
    phishtank_results = matching_values(hp_series, phish_series)
    print("\tRaw number of IPs in Phishtank db:", phishtank_results)
    print("\tpercentage of IPs in Phishtank db: {:.3f}%".format(100 * phishtank_results / len(hp_series)))
    
    # Malware
    malware_results = matching_values(hp_series, malc0de_mdl_series)
    print("\tRaw number of IPs in Malc0de + Malwaredomainlist db:", malware_results)
    print("\tpercentage of IPs in Malc0de + Malwaredomainlist db: {:.3f}%".format(100 * malware_results / len(hp_series)))
    
    print("")
    print("===================================================================")
    print("")
    


./data/cmds_sequence_2016-07-01.csv:
	Raw number of IPs in Spamhaus ZEN: 411
	percentage of IPs in Spamhaus ZEN: 1.585%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-07-02.csv:
	Raw number of IPs in Spamhaus ZEN: 517
	percentage of IPs in Spamhaus ZEN: 1.838%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-07-03.csv:
	Raw number of IPs in Spamhaus ZEN: 433
	percentage of IPs in Spamhaus ZEN: 1.671%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-07-04.csv:
	R

	Raw number of IPs in Spamhaus ZEN: 248
	percentage of IPs in Spamhaus ZEN: 1.872%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-07-23.csv:
	Raw number of IPs in Spamhaus ZEN: 249
	percentage of IPs in Spamhaus ZEN: 1.811%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-07-24.csv:
	Raw number of IPs in Spamhaus ZEN: 277
	percentage of IPs in Spamhaus ZEN: 1.799%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-07-25.csv:
	Raw number of IPs in Spamhaus ZEN: 270


	Raw number of IPs in Spamhaus ZEN: 291
	percentage of IPs in Spamhaus ZEN: 1.695%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-09-11.csv:
	Raw number of IPs in Spamhaus ZEN: 6
	percentage of IPs in Spamhaus ZEN: 1.489%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-09-12.csv:
	Raw number of IPs in Spamhaus ZEN: 173
	percentage of IPs in Spamhaus ZEN: 1.490%
	Raw number of IPs in Phishtank db: 0
	percentage of IPs in Phishtank db: 0.000%
	Raw number of IPs in Malc0de + Malwaredomainlist db: 0
	percentage of IPs in Malc0de + Malwaredomainlist db: 0.000%


./data/cmds_sequence_2016-09-13.csv:
	Raw number of IPs in Spamhaus ZEN: 152
	p

In [69]:
# Take the human readable results from the cell above and put them in a numpy array
results = np.array([
    [ 411 , 0.0159 ],
    [ 517 , 0.0184 ],
    [ 433 , 0.0167 ],
    [ 410 , 0.0153 ],
    [ 383 , 0.0162 ],
    [ 434 , 0.0197 ],
    [ 256 , 0.0173 ],
    [ 250 , 0.0176 ],
    [ 195 , 0.0156 ],
    [ 241 , 0.0173 ],
    [ 269 , 0.0167 ],
    [ 281 , 0.0152 ],
    [ 329 , 0.0149 ],
    [ 324 , 0.0155 ],
    [ 339 , 0.0163 ],
    [ 346 , 0.0172 ],
    [ 326 , 0.0198 ],
    [ 211 , 0.0170 ],
    [ 167 , 0.0180 ],
    [ 225 , 0.0171 ],
    [ 248 , 0.0187 ],
    [ 249 , 0.0181 ],
    [ 277 , 0.0180 ],
    [ 270 , 0.0190 ],
    [ 310 , 0.0197 ],
    [ 348 , 0.0203 ],
    [ 144 , 0.0170 ],
    [ 325 , 0.0190 ],
    [ 120 , 0.0168 ],
    [ 136 , 0.0163 ],
    [ 871 , 0.0221 ],
    [ 836 , 0.0208 ],
    [ 656 , 0.0213 ],
    [ 807 , 0.0225 ],
    [ 678 , 0.0214 ],
    [ 419 , 0.0221 ],
    [ 579 , 0.0202 ],
    [ 358 , 0.0198 ],
    [ 240 , 0.0169 ],
    [ 200 , 0.0158 ],
    [ 201 , 0.0164 ],
    [ 243 , 0.0163 ],
    [ 291 , 0.0170 ],
    [   6 , 0.0149 ],
    [ 173 , 0.0149 ],
    [ 152 , 0.0153 ],
])
np.mean(results, axis=0), np.std(results, axis=0)

(array([3.36608696e+02, 1.77891304e-02]),
 array([1.85893317e+02, 2.11498055e-03]))

In [70]:
n = 40158
p = 0.005
mean = n * p
stdev = np.sqrt(n * p * (1 - p))
mean, stdev, p * (1-p)

(200.79, 14.134569324885707, 0.004975)

In [71]:
r_0 = np.random.normal(mean, stdev, len(results))
r_0 = r_0 * np.sign(r_0)
print("Value T-test: {} (p={:.3E})".format(*stats.ttest_rel(r_0, results[:,0])))

r_1 = np.random.normal(p, p*(1-p), len(results))
r_1 = r_1 * np.sign(r_1)
print("Percentage T-test: {} (p={:.3E})".format(*stats.ttest_rel(r_1, results[:,1])))

Value T-test: -4.887715624152065 (p=1.336E-05)
Percentage T-test: -16.45594470535933 (p=1.162E-20)


In [72]:
r_0 = np.random.normal(mean, stdev, len(results))
r_0 = r_0 * np.sign(r_0)
print("Value Mann Whitney U: {} (p={:.3E})".format(*stats.mannwhitneyu(r_0, results[:,0])))

r_1 = np.random.normal(p, p*(1-p), len(results))
r_1 = r_1 * np.sign(r_1)
print("Percentage Mann Whitney U: {} (p={:.3E})".format(*stats.mannwhitneyu(r_1, results[:,1])))

Value Mann Whitney U: 425.0 (p=3.923E-07)
Percentage Mann Whitney U: 0.0 (p=7.371E-17)
