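"""Computes the entropy of tokens found in sampled URLs.

URLs are grouped by shape (route length plus query keys), each field
whose value varies within a group is decoded under several candidate
encodings, and every decoding is scored by the per-byte Shannon
entropy of its values.
"""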
import urllib.parse
import base64
import struct
import binascii
import math
import argparse
import json
import os
def forgiving_b64_decode(x, pads=0):
    # URL-safe base64 decode, retrying with up to two '=' pads appended.
    # ('=' is the padding character; '_' is a data character in this alphabet.)
    padded = "{}{}".format(x, "=" * pads)
    try:
        return base64.b64decode(padded, "-_")
    except binascii.Error:
        if pads < 2:
            return forgiving_b64_decode(x, pads + 1)
        raise
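# Candidate decoders, keyed by name. try_decode() below treats any
# exception raised by a decoder as "this sample is not in that encoding".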
encodings = {
    'original': lambda x: bytearray(x, "utf-8"),  # bytearray(str) requires an encoding
'signed_int': lambda x: struct.pack("q", int(x)),
'unsigned_int': lambda x: struct.pack("Q", int(x)),
'hex': binascii.unhexlify,
'base64': forgiving_b64_decode,
'ascii': lambda x: x.encode("ascii")
}
def parse(url):
parsed_url = urllib.parse.urlparse(url)
return {
"query": urllib.parse.parse_qs(parsed_url.query),
"route": parsed_url.path.split('/'),
"url": url
}
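# e.g. parse("https://host/a/b?id=7") yields
#   {"query": {"id": ["7"]}, "route": ["", "a", "b"], "url": "https://host/a/b?id=7"}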
def cumsum(n):
    # Triangular number: sum of the integers 0..n.
    return sum(range(n + 1))
def get_urls_from(file):
    # Parse one URL per line, stripping trailing whitespace and the
    # punctuation that often clings to URLs pasted from logs or prose.
    with open(file, "r") as url_file:
        return [parse(x.rstrip("\r\n ])([.\\\'\"")) for x in url_file.readlines()]
def categorize(url):
return "{} {}".format(len(url['route']), " ".join(sorted(url['query'].keys())))
def try_decode(sample, encoding):
try:
return (sample, encoding(sample))
except Exception:
return None
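# e.g. try_decode("48656c6c6f", binascii.unhexlify) == ("48656c6c6f", b'Hello'),
# while try_decode("xyz", binascii.unhexlify) is None.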
def smart_utf(x):
    # Render bytes for display: trim leading/trailing NUL bytes, keep
    # ASCII bytes as characters, and replace high bytes with '?'.
    leading_zero = 0
    while leading_zero < len(x) and x[leading_zero] == 0:
        leading_zero += 1
    trailing_zero = len(x) - 1
    while trailing_zero >= leading_zero and x[trailing_zero] == 0:
        trailing_zero -= 1
    if trailing_zero < leading_zero:  # every byte was zero
        return "NULL"
    x = x[leading_zero:trailing_zero + 1]
    return "".join(chr(y) if y < 128 else "?" for y in x)
def extract_groups(urls):
groups = {}
for url in urls:
category = categorize(url)
group = groups.get(category, {'urls': [], 'collection': {}})
group['urls'].append(url["url"])
for x in url['query']:
values = group['collection'].get(x, [])
values.append(url['query'][x][0])
group['collection'][x] = values
        for i, segment in enumerate(url['route']):
            x = "@{}".format(i)
            values = group['collection'].get(x, [])
            values.append(segment)
group['collection'][x] = values
groups[category] = group
return groups
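# Keep only fields whose values actually vary within a group (constant fields
# carry no information), then record every encoding under which at least two
# samples decode successfully.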
def decode(groups):
decoded = {}
for group_name in groups:
group = groups[group_name]
groups[group_name] = {x: group['collection'][x] for x in group['collection'] if
len(set(group['collection'][x])) > 1}
decoded[group_name] = {}
for group_name in groups:
group = groups[group_name]
for element in group:
decoded[group_name][element] = {}
for encoding in encodings:
decodings = [try_decode(sample, encodings[encoding]) for sample in group[element]]
decodings = [x for x in decodings if x]
if len(decodings) > 1:
decoded[group_name][element][encoding] = decodings
return decoded
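# For each surviving (field, encoding) pair: left-pad the decoded values with
# NUL bytes to a common length, estimate the distribution p_j of byte values
# at every position j, and report the summed Shannon entropy
#   H = sum_j ( -sum_v p_j(v) * log2 p_j(v) )
# Higher totals indicate more random-looking tokens.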
def summarize(decoded, urls):
results = {}
for group in decoded:
results[group] = { "fields": {}, "urls": urls[group]}
for field in decoded[group]:
results[group]["fields"][field] = {}
for encoding in decoded[group][field]: # Each encoding / field combo
originals = [x[0] for x in decoded[group][field][encoding]]
elements = [x[1] for x in decoded[group][field][encoding]]
max_len = max(len(x) for x in elements)
for i in range(len(elements)):
elements[i] = b'\x00' * (max_len - len(elements[i])) + elements[i]
                pmfs = []
                for j in range(max_len):  # empirical PMF of byte values at each position
                    pmfs.append({})
for element in elements:
pmfs[j][element[j]] = pmfs[j].get(element[j], 0) + 1
for support in pmfs[j]:
pmfs[j][support] /= len(elements)
entropy = 0
for j in range(max_len):
entropy += -1 * sum(pmfs[j][support] * math.log(pmfs[j][support], 2) for support in pmfs[j])
results[group]["fields"][field][encoding] = {
"n_elements": len(elements),
"entropy": entropy,
"elements": [binascii.hexlify(x) for x in elements],
"originals": originals
}
return results
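# For each field, keep only encodings on the (decode count, entropy) Pareto
# frontier: among encodings that decoded equally many samples, keep the lowest
# entropy, then drop entries dominated by a higher count at lower entropy.
# A group's entropy is the sum of its fields' minimum token entropies.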
def make_report(summary):
report = {}
for group in summary:
report[group] = {
"urls": summary[group]["urls"],
"fields": {}
}
group_entropy = 0
for field in summary[group]["fields"]:
report[group]["fields"][field] = []
entropy_frontier = {}
for encoding in summary[group]["fields"][field]:
n_elem = summary[group]["fields"][field][encoding]['n_elements']
entro = summary[group]["fields"][field][encoding]['entropy']
if n_elem not in entropy_frontier or entro < entropy_frontier[n_elem]["entropy"]:
entropy_frontier[n_elem] = summary[group]["fields"][field][encoding]
entropy_frontier[n_elem]["encoding"] = encoding
            sorted_frontier = sorted(entropy_frontier, reverse=True)
if len(sorted_frontier) == 0:
continue
entro = entropy_frontier[sorted_frontier[0]]["entropy"]
for y in sorted_frontier[1:]:
if entropy_frontier[y]["entropy"] >= entro:
entropy_frontier.pop(y)
else:
entro = entropy_frontier[y]["entropy"]
            sorted_frontier = sorted(entropy_frontier)
min_entropy = None
for frontier_index in sorted_frontier:
frontier_element = entropy_frontier[frontier_index]
tokens = frontier_element["elements"]
originals = frontier_element["originals"]
encoded_samples = []
                for token, original in zip(tokens, originals):
                    raw = binascii.unhexlify(token)
                    encoded_samples.append({
                        "hexlified": ' '.join('{:02x}'.format(x) for x in raw),
                        "ascii": smart_utf(raw),
                        "original": original
                    })
report[group]["fields"][field].append({
"count": frontier_index,
"entropy": frontier_element["entropy"],
"encoding": frontier_element["encoding"],
"samples": encoded_samples
})
                if min_entropy is None or frontier_element["entropy"] < min_entropy:
                    min_entropy = frontier_element["entropy"]
group_entropy += min_entropy
report[group]["entropy"] = group_entropy
return report
def report_to_text(report, n_elem_to_show, n_url_to_show):
    lines = []
    for group in report:
        lines.append("==== %s ====\n Sample URLs:\n" % group)
        for url in report[group]["urls"][:n_url_to_show]:
            lines.append(" * %s\n" % url)
        lines.append("\n Entropy: %5.1f\n" % report[group]["entropy"])
        for field in report[group]["fields"]:
            for f in report[group]["fields"][field]:
                lines.append(" **** %s - %s ****\n" % (field, f["encoding"]))
                lines.append(" Token Entropy: %5.2f\n" % f["entropy"])
                lines.append(" Decode Count: %d\n" % f["count"])
                lines.append(" Sample Tokens:\n")
                for sample in f["samples"][:n_elem_to_show]:
                    lines.append(" %s\n" % sample["original"])
                    lines.append(" %s\n" % sample["hexlified"])
                    lines.append(" %s\n\n" % sample["ascii"])
    return "".join(lines)
def report_to_json(report):
return json.dumps(report)
def process(urls_path):
groups = extract_groups(get_urls_from(urls_path))
urls = { name: groups[name]['urls'] for name in groups }
decoded = decode(groups)
summary = summarize(decoded, urls)
report = make_report(summary)
return report
parser = argparse.ArgumentParser(description='Computes entropy of URLs')
parser.add_argument('-s','--search', help='Input is a directory. Write multiple outputs to this directory.', default=None)
parser.add_argument('-t','--text', help='Text output. (Default is JSON.)', action="store_true")
parser.add_argument('-e','--elem', help='Number of token samples to print (text only)', default=5, type=int)
parser.add_argument('-u','--url', help='Number of url samples to print (text only)', default=5, type=int)
parser.add_argument('file', nargs=1, help='File containing URL samples')
args = parser.parse_args()
if args.search:
    in_dir = args.file[0]
    for f in os.listdir(in_dir):
        path = os.path.join(in_dir, f)
        report = process(path)
        for group in report:
            print("%s,%s,%5.2f" % (path, group, report[group]["entropy"]))
path_prefix = os.path.join(args.search, f)
if args.text:
with open("%s.txt" % path_prefix, "w") as of:
of.write(report_to_text(report, args.elem, args.url))
else:
with open("%s.json" % path_prefix, "w") as of:
of.write(report_to_json(report))
else:
report = process(args.file[0])
if args.text:
print(report_to_text(report, args.elem, args.url))
else:
print(report_to_json(report))
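# Example invocations (script and file names here are placeholders):
#   python url_entropy.py -t urls.txt              # human-readable report to stdout
#   python url_entropy.py urls.txt                 # JSON report to stdout
#   python url_entropy.py -s reports/ url_samples  # one report per file; reports/ must exist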