Skip to content

Commit

Permalink
Now accepts raw regex for sitemap and rss exclusion
Browse files Browse the repository at this point in the history
  • Loading branch information
battleoverflow committed Nov 1, 2023
1 parent 6ffff46 commit 8c8481b
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 31 deletions.
4 changes: 2 additions & 2 deletions config.example.yml
Expand Up @@ -94,7 +94,7 @@ sources:
module: rss
url: https://inquest.net/blog/rss
feed_type: messy
exclude: security|threat|research
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?

# Sitemap examples

Expand All @@ -108,7 +108,7 @@ sources:
- name: inquest-sitemap-articles
module: sitemap
url: https://www.inquest.net/sitemap.xml
exclude: security|threat|research
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?

# Defaults to "blog" keyword
- name: inquest-sitemap-blog
Expand Down
6 changes: 4 additions & 2 deletions docs/sources/rss.rst
Expand Up @@ -21,7 +21,8 @@ Configuration Options
* ``module`` (required): ``rss``
* ``url`` (required): URL to the RSS or Atom feed.
* ``feed_type`` (required): see above; if unsure, use ``messy``.
* ``filter`` (optional): Regex filtering for RSS feed.
* ``include`` (optional): Include filter using simplified regex.
* ``exclude`` (optional): Exclude filter using raw regex.

Example Configuration
~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -34,7 +35,8 @@ Inside the ``sources`` section of your configuration file:
module: rss
url: https://example.com/rss.xml
feed_type: messy
filter: security|threat
include: security|threat
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
.. _sqs-source:

4 changes: 4 additions & 0 deletions docs/sources/sitemap.rst
Expand Up @@ -10,6 +10,8 @@ Configuration Options

* ``module`` (required): ``sitemap``
* ``url`` (required): URL of the website with the sitemap path.
* ``include`` (optional): Include filter using simplified regex.
* ``exclude`` (optional): Exclude filter using raw regex.

Example Configuration
~~~~~~~~~~~~~~~~~~~~~
Expand All @@ -21,3 +23,5 @@ Quick setup for sitemap parsing:
- name: inquest-blog
module: sitemap
url: https://inquest.net/sitemap.xml
include: security|threat|research
exclude: https:\/.inquest\.net\/blog[\/]?inquest-[\/]?
27 changes: 13 additions & 14 deletions threatingestor/sources/rss.py
Expand Up @@ -51,20 +51,19 @@ def run(self, saved_state):
text = ""

if self.exclude is not None:
rss_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))

for rss_e in rss_exclude:
if rss_e not in item.get('link'):
if self.feed_type == "afterioc":
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
elif self.feed_type == "clean":
text = soup.get_text(separator=' ')
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
else:
# Default: self.feed_type == 'messy'.
text = soup.get_text(separator=' ')
artifacts += self.process_element(text, item.get('link'))
rss_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(item.get('link')))

if rss_exclude:
if self.feed_type == "afterioc":
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
elif self.feed_type == "clean":
text = soup.get_text(separator=' ')
artifacts += self.process_element(text, item.get('link'), include_nonobfuscated=True)
else:
# Default: self.feed_type == 'messy'.
text = soup.get_text(separator=' ')
artifacts += self.process_element(text, item.get('link'))

if self.include is not None:
rss_include = re.compile(r"{0}".format(self.include)).findall(str(self.include.split('|')))
Expand Down
22 changes: 9 additions & 13 deletions threatingestor/sources/sitemap.py
Expand Up @@ -50,20 +50,16 @@ def run(self, saved_state):

if self.exclude is not None:
# Regex input via config.yml
# Example: security|threat|malware
xml_exclude = re.compile(r"{0}".format(self.exclude)).findall(str(self.exclude.split('|')))
xml_exclude = re.sub(re.compile(fr"{self.exclude}", re.IGNORECASE), "", str(loc))

if xml_exclude:
if self.path is None and "http" in xml_exclude:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

# Iterates over the regex output to locate all provided keywords
for xe in xml_exclude:
# Uses a path instead of a keyword
if self.path is not None:
if self.path in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

# Only filters using a keyword
if self.path is None:
if xe not in loc:
if self.path in xml_exclude:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

Expand All @@ -86,15 +82,15 @@ def run(self, saved_state):
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

if self.include is None or self.exclude is None and self.path is not None:
if self.include is None and self.exclude is None and self.path is not None:
# Filters only by path in XML loc, no set include
# Default: /path/name/*

if self.path in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
artifacts += self.process_element(content=text, reference_link=str(loc), include_nonobfuscated=True)

if self.include is None and self.path is None and self.exclude is None:
if self.include is None and self.exclude is None and self.path is None:
# Locates all blog links within the sitemap
if "blog" in loc:
text = soup.get_text(separator=' ').split('Indicators of Compromise')[-1]
Expand Down

0 comments on commit 8c8481b

Please sign in to comment.