Skip to content

Commit

Permalink
Cleaned up sitemap ingestion
Browse files (browse the repository at this point in the history)
  • Loading branch information
battleoverflow committed Jun 13, 2023
1 parent d91e6f1 commit 5dc79f6
Showing 1 changed file with 44 additions and 56 deletions.
100 changes: 44 additions & 56 deletions threatingestor/sources/sitemap.py
Expand Up @@ -5,81 +5,69 @@
from urllib.parse import urlparse

from threatingestor.sources import Source
import threatingestor.artifacts

class Plugin(Source):
# Sitemap source plugin: downloads the XML document at `url`, reads the <loc>
# text associated with each <url> entry, keeps the locations that pass the
# configured filter/path rules, and returns them as URL artifacts.
#
# NOTE(review): this text is a rendered commit diff (the page header reports
# "44 additions & 56 deletions") with leading indentation stripped and no +/-
# markers. Removed and added lines of run() therefore appear interleaved
# below, and the block is not runnable as shown. Recover the real file from
# the repository before changing any logic here.

# Stores the plugin configuration for later use by run():
#   name   - source name; used in the artifact description and passed to the
#            URL artifact constructor.
#   url    - address requested with requests.get() and used as reference_link.
#   filter - optional '|'-separated keyword pattern (e.g. security|threat|malware).
#   path   - optional substring tested against each <loc> value.
def __init__(self, name, url, filter=None, path=None):
self.name = name
self.url = url
self.filter = filter
self.path = path

# Fetches and parses the sitemap, then returns (saved_state, artifacts).
# The incoming `saved_state` argument is never read: it is only overwritten
# with a freshly generated UTC timestamp string.
def run(self, saved_state):
# NOTE(review): [:-7] drops the last seven characters of isoformat(), i.e.
# the ".ffffff" microsecond suffix. isoformat() omits that suffix when
# microsecond == 0, in which case this slice would cut into the time itself.
saved_state = datetime.datetime.utcnow().isoformat()[:-7] + "Z"

# Configures sitemap parsing
response = requests.get(self.url)
xml = BeautifulSoup(response.text, "lxml-xml")

sitemap = xml.find_all("urlset")

# NOTE(review): the try/except lines and the `for u in urls:` loop header
# below look like they belong to different revisions of the file (old vs.
# new side of the diff) - confirm against the repository.
try:
sitemap_db = []
# All <url> elements of the parsed document.
urls = xml.find_all("url")
# Collects the matching <loc> strings; converted to artifacts further down.
artifacts_list = []

for s in sitemap:
sitemap_db.append(s.findNext("loc").text)

except UnboundLocalError:
sitemap_db = [self.url]
for u in urls:

urls = xml.find_all("url")
artifacts = []
# Extracts only the 'loc' tag from the xml
# Note: the condition checks the whole document (`xml`), not the current
# entry `u`; the value itself comes from the next <loc> found after `u`.
if xml.find("loc"):
loc = u.findNext("loc").text
else:
loc = ""

for sitemap in sitemap_db:
for url in urls:
if self.filter is not None:
# Regex input via config.yml
# Example: security|threat|malware
# The pattern is run against the string form of its own '|'-split list,
# so xml_query holds the keyword matches found in the filter text itself.
xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))

# Extracts only the 'loc' tag from the xml
if xml.find("loc"):
loc = url.findNext("loc").text
parsed_uri = urlparse(loc)
domain = "{uri.netloc}".format(uri=parsed_uri)
else:
loc = ""
domain = ""
# Iterates over the regex output to locate all provided keywords
for x in xml_query:
# Uses a path instead of a keyword
# Note: `x` is not used in this branch, so the same loc is appended once
# per keyword whenever the path matches.
if self.path is not None:
if self.path in loc:
artifacts_list.append(loc)

# Only filters using a keyword
if self.path is None:
if x in loc:
artifacts_list.append(loc)

elif self.filter is None and self.path is not None:
# Filters only by path in XML loc, no set filter
# Default: /path/name/*

row = {
"domain": domain,
"loc": loc
}
if self.path in loc:
artifacts_list.append(loc)

else:
# Locates all blog links within the sitemap
if "blog" in loc:
artifacts_list.append(loc)

if self.filter is not None:
# Regex input via config.yml
# Example: security|threat|malware
xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))
artifacts = []

# Iterates over the regex output to locate all provided keywords
for x in xml_query:
# Uses a path instead of a keyword
if self.path is not None:
# Wraps every collected location in a URL artifact whose reference link is
# the sitemap address and whose reference text is the description string.
for a in artifacts_list:
description = 'URL: {u}\nTask autogenerated by ThreatIngestor from source: {s}'.format(u=a, s=self.name)
artifact = threatingestor.artifacts.URL(a, self.name, reference_link=self.url, reference_text=description)
artifacts.append(artifact)

# NOTE(review): the row["loc"] / self.process_element(...) lines from here on
# use the `row` dict built above; process_element is not defined in the
# visible text - presumably inherited from Source, verify.
if self.path in row["loc"]:
artifacts += self.process_element(row["loc"], self.url)

# Only filters using a keyword
if self.path is None:
if x in row["loc"]:
artifacts += self.process_element(row["loc"], self.url)

elif self.filter is None and self.path is not None:
# Filters only by path in XML loc, no set filter
# Default: /path/name/*
# print(artifacts)
saved_state = datetime.datetime.utcnow().isoformat()[:-7] + "Z"

if self.path in row["loc"]:
artifacts += self.process_element(row["loc"], self.url)

else:
# Locates all blog links within the sitemap
if "blog" in row["loc"]:
artifacts += self.process_element(row["loc"], self.url)

# NOTE(review): two identical return lines appear back to back; in a diff
# rendering this is most likely one removed and one added line - confirm.
return saved_state, artifacts
return saved_state, artifacts

0 comments on commit 5dc79f6

Please sign in to comment.