Cleaned up sitemap ingestion

InQuest · Jun 13, 2023 · 5dc79f6
1 parent d91e6f1
commit 5dc79f6
Showing 1 changed file with 44 additions and 56 deletions.
diff --git a/threatingestor/sources/sitemap.py b/threatingestor/sources/sitemap.py
@@ -5,81 +5,69 @@
 from urllib.parse import urlparse
 
 from threatingestor.sources import Source
+import threatingestor.artifacts
 
 class Plugin(Source):
-    
+
     def __init__(self, name, url, filter=None, path=None):
         self.name = name
         self.url = url
         self.filter = filter
         self.path = path
 
     def run(self, saved_state):
-        saved_state = datetime.datetime.utcnow().isoformat()[:-7] + "Z"
-
         # Configures sitemap parsing
         response = requests.get(self.url)
         xml = BeautifulSoup(response.text, "lxml-xml")
-
-        sitemap = xml.find_all("urlset")
 
-        try:
-            sitemap_db = []
+        urls = xml.find_all("url")
+        artifacts_list = []
 
-            for s in sitemap:
-                sitemap_db.append(s.findNext("loc").text)
-
-        except UnboundLocalError:
-            sitemap_db = [self.url]
+        for u in urls:
 
-        urls = xml.find_all("url")
-        artifacts = []
+            # Extracts only the 'loc' tag from the xml
+            if xml.find("loc"):
+                loc = u.findNext("loc").text
+            else:
+                loc = ""
 
-        for sitemap in sitemap_db:
-            for url in urls:
+            if self.filter is not None:
+                # Regex input via config.yml
+                # Example: security|threat|malware
+                xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))
 
-                # Extracts only the 'loc' tag from the xml
-                if xml.find("loc"):
-                    loc = url.findNext("loc").text
-                    parsed_uri = urlparse(loc)
-                    domain = "{uri.netloc}".format(uri=parsed_uri)
-                else:
-                    loc = ""
-                    domain = ""
+                # Iterates over the regex output to locate all provided keywords
+                for x in xml_query:
+                    # Uses a path instead of a keyword
+                    if self.path is not None:
+                        if self.path in loc:
+                            artifacts_list.append(loc)
+
+                    # Only filters using a keyword
+                    if self.path is None:
+                        if x in loc:
+                            artifacts_list.append(loc)
+
+            elif self.filter is None and self.path is not None:
+                # Filters only by path in XML loc, no set filter
+                # Default: /path/name/*
 
-                row = {
-                    "domain": domain,
-                    "loc": loc
-                }
+                if self.path in loc:
+                    artifacts_list.append(loc)
+
+            else:
+                # Locates all blog links within the sitemap
+                if "blog" in loc:
+                    artifacts_list.append(loc)
 
-                if self.filter is not None:
-                    # Regex input via config.yml
-                    # Example: security|threat|malware
-                    xml_query = re.compile(r"{0}".format(self.filter)).findall(str(self.filter.split('|')))
+        artifacts = []
 
-                    # Iterates over the regex output to locate all provided keywords
-                    for x in xml_query:
-                        # Uses a path instead of a keyword
-                        if self.path is not None:
+        for a in artifacts_list:
+            description = 'URL: {u}\nTask autogenerated by ThreatIngestor from source: {s}'.format(u=a, s=self.name)
+            artifact = threatingestor.artifacts.URL(a, self.name, reference_link=self.url, reference_text=description)
+            artifacts.append(artifact)
 
-                            if self.path in row["loc"]:
-                                artifacts += self.process_element(row["loc"], self.url)
-
-                        # Only filters using a keyword
-                        if self.path is None:
-                            if x in row["loc"]:
-                                artifacts += self.process_element(row["loc"], self.url)
-
-                elif self.filter is None and self.path is not None:
-                    # Filters only by path in XML loc, no set filter
-                    # Default: /path/name/*
+        # print(artifacts)
+        saved_state = datetime.datetime.utcnow().isoformat()[:-7] + "Z"
 
-                    if self.path in row["loc"]:
-                        artifacts += self.process_element(row["loc"], self.url)
-
-                else:
-                    # Locates all blog links within the sitemap
-                    if "blog" in row["loc"]:
-                        artifacts += self.process_element(row["loc"], self.url)
-
-        return saved_state, artifacts
+        return saved_state, artifacts