In [1]:
import requests
import re
import time
import random
from zipfile import ZipFile
from bs4 import BeautifulSoup

# 0 Collect links and file names

In [2]:
# Build the URL of every advisory listing page.
# The page count (57) was read off the CERT site by hand.
urls = [
  "https://www.us-cert.gov/ics/advisories?page={}".format(page_no)
  for page_no in range(57)
]

In [3]:
# Collect all links to reports
docs=[]
for listing_url in urls:
  page = requests.get(listing_url)
  # page.text is already decoded to str, so from_encoding is not passed:
  # Beautiful Soup ignores it for str markup and emits a warning.
  soup = BeautifulSoup(page.text, "html.parser")
  # Get links to reports on each page
  for div in soup.find_all(name='span', attrs={'class':'field-content'}, href=False):
    for subdiv in div.find_all(name='a',href=True):
      # Stored as `href`, not `name`: `name` is also the module-level tag
      # list defined with the text-collection utilities, and rebinding it
      # here would break those helpers if this cell is re-run later.
      href = subdiv['href']
      docs.append('https://www.us-cert.gov{}'.format(href))



In [4]:
# Derive one output file name per report: last URL segment plus '.txt'
file_names = [report_url.split('/')[-1]+'.txt' for report_url in docs]
# Persist the names, one per line, in a .txt file
with open("FILES.txt", "w") as f:
  f.writelines(entry+'\n' for entry in file_names)

# 1 Collect Text

In [5]:
# Tag-name lists shared by the text-collection helpers
# tags whose presence inside an element marks it as containing nested sections
nest_layer_2 = ['h2', 'h3', 'h4', 'p', 'ol', 'div', 'ul']
# tags visited by remove_nest: the nested-section tags preceded by list items
nest_layer_1 = ['li'] + nest_layer_2
# tags handed to remove_tag; an independent list object, not an alias
name = list(nest_layer_2)
# tags scanned inside the report body by collect_doc
collect_tag = ['h2', 'h3', 'h4', 'p', 'ol', 'ul', 'div']

In [6]:
# Strip footnote markers out of an element and summarise them in one trailing tag
def remove_foot(cont,append_flag,soup):
  """Return the text of `cont` after rewriting footnote links, <sup> and <br>.

  `cont` is modified in place:
    * every <a class="see-footnote"> becomes an empty <b> tag,
    * every <sup> becomes an empty <c> tag,
    * every <br> becomes a <bk> tag whose content is a newline.

  Args:
    cont: element to clean; must offer find_all() and get_text().
    append_flag: accepted for call compatibility; not used here.
    soup: object whose new_tag() creates the replacement tags.

  Returns:
    The text of the cleaned element. When at least one footnote link was
    found, " [Footnote:i,j,...]" listing the link texts is appended.
  """
  def swap_for(old_tag, tag_name, text):
    # put a fresh tag holding only `text` where `old_tag` used to be
    replacement = soup.new_tag(tag_name)
    replacement.string = text
    old_tag.replace_with(replacement)

  # remember each footnote index, then blank out its link
  indexes = []
  for link in cont.find_all(name='a', attrs = {'class':"see-footnote"}):
    indexes.append(link.get_text())
    swap_for(link, "b", "")

  # drop the symbols that sat between footnotes
  for superscript in cont.find_all(name='sup'):
    swap_for(superscript, "c", "")

  # turn every line break into an actual newline character
  for line_break in cont.find_all(name='br'):
    swap_for(line_break, "bk", "\n")

  body = cont.get_text()
  if not indexes:
    return body
  return body + " [Footnote:" + ','.join(indexes) + ']'

In [7]:
# Blank out every occurrence of the given tags inside an element
def remove_tag(elem,tags,soup):
  """Replace each descendant of `elem` named in `tags` with an empty <clr> tag.

  Args:
    elem: element to clean; it is modified in place.
    tags: iterable of tag names, handled one after another in the given order.
    soup: object whose new_tag() creates the placeholder tags.

  Returns:
    The same `elem` object that was passed in.
  """
  for tag_name in tags:
    matches = elem.find_all(name=tag_name)
    for match in matches:
      # an empty placeholder takes the position of the removed tag
      placeholder = soup.new_tag("clr")
      placeholder.string = ""
      match.replace_with(placeholder)

  return elem

In [8]:
# A function to remove nested sections to avoid duplicates
def remove_nest(cont,append_flag,soup):
  """Split `cont` into its own text plus one line per nested element.

  Every descendant whose tag is in `nest_layer_1` is visited. Nested section
  tags are stripped from it first, so its text is not repeated by its own
  children. Its cleaned text then becomes one line; <li> lines are prefixed
  with '·' and <ul> elements contribute no line of their own. Finally all
  nested tags are stripped from `cont` itself to obtain its own text.
  `cont` is modified in place.

  Args:
    cont: element to process.
    append_flag: forwarded unchanged to remove_foot.
    soup: passed to remove_tag / remove_foot to build placeholder tags.

  Returns:
    A list: the text of `cont` itself (preceded by an extra '' entry when
    that text is non-empty, or '' when it holds nothing but newlines), then
    the lines of the nested elements, and as LAST item an int counting the
    visited non-<li> elements (the caller uses it to skip that many tags).
  """
  # Local tag lists. The earlier version called `name.append('li')` on the
  # module-level `name` list at every call, so the list kept growing and
  # every call after the first also stripped <li> inside the loop below:
  # the output depended on how often the function had already run. The
  # fresh-kernel behaviour is kept here. `nest_layer_2` has the same tags
  # `name` is defined with, and other cells never rebind it.
  nested_tags = list(nest_layer_2)
  all_tags = nested_tags + ['li']

  content = ['']

  ul_flag = 0

  for elem in cont.find_all(name=nest_layer_1, href=False):

    if elem.name != 'li':
      ul_flag += 1

    # strip nested sections so this element's text excludes its children
    if len(elem.find_all(name=nest_layer_2, href=False)):
      remove_tag(elem,nested_tags,soup)

    # a <ul> is only counted; its items are visited on their own
    if elem.name == 'ul':
      continue

    line = remove_foot(elem, append_flag, soup)

    if elem.name == 'li':
      line = '·' + line
    content.append(line)

  # text belonging to `cont` itself, with every nested tag (and <li>) removed
  cont = remove_tag(cont,all_tags,soup)
  content[0] = remove_foot(cont, append_flag, soup)
  # text made only of newlines counts as empty
  if re.sub(r'\n','',content[0]) == '':
    content[0] = ''
  # non-empty own text gets an empty entry in front of it
  if content[0] != '':
    content = ['']+content
  content.append(ul_flag)

  return content

In [9]:
# Define collect_doc that derive reports from each link
def collect_doc(link):
  """Download one report page and write its text to '<last URL segment>.txt'.

  The output starts with the text of every <h1>/<h2> inside the
  'ncas-header' div (the first one followed by an extra newline), then the
  lines remove_nest produces for the report body. Lines are separated by a
  single newline; no newline follows the last line.

  Args:
    link: URL of the report page.

  Returns:
    None. The result is the file written to the working directory.
  """
  # use div_count to only collect text from report part
  div_count = 1
  # all text lines of this report, in output order
  content = []

  page = requests.get(link)
  # page.text is already a str, so from_encoding is not passed: Beautiful
  # Soup ignores it for str markup and emits a warning.
  soup = BeautifulSoup(page.text, "html.parser")
  file_name = link.split("/")[-1]

  # The release-date check that used to sit here was removed. It looked up
  # the div by id (the 'Record the time of reports' cell uses class),
  # compared the str token with the int 2011, and called .append() on
  # files_in_range, which that later cell defines as a dict. It could
  # therefore only do nothing or raise. Release years are recorded by the
  # 'Record the time of reports' cell.

  # collect the title of the report
  for div_f in soup.find_all(name='div', attrs={'id':'ncas-header'}):
    # find the section that contains title information
    for title in div_f.find_all(name=['h1','h2'], href=False):
      content.append(title.get_text()+'\n'*(len(content)==0))

  # collect the whole report section
  for div in soup.find_all(name='div', attrs={'class':'field field--name-body field--type-text-with-summary field--label-hidden field--item'}):
    # use ul_flag to avoid duplicate texts
    ul_flag = 0
    # only the fifth matching div is collected; all others are skipped
    append_flag = 1
    if div_count != 5:
      append_flag = 0
    div_count += 1
    # collect lines with specific tags that corresponds with content
    for cont in div.find_all(name=collect_tag, href=False):
      # decrease ul_flag and skip the line to avoid duplicates
      if ul_flag:
        ul_flag -= 1
        continue

      if append_flag:
        paragraph = remove_nest(cont, append_flag, soup)
        # last item = number of following tags remove_nest already covered
        if paragraph[-1]:
          ul_flag += paragraph[-1]
        # add paragraph to content
        content = content + paragraph[:-1]

  # write the txt file
  # join() places the newline by position. The earlier content.index(i)
  # test used the FIRST occurrence, so a last line equal to an earlier one
  # got a stray trailing newline, and each lookup was a linear scan.
  lines = [item for item in content if type(item) != int]
  # explicit utf-8 so the result does not depend on the platform default
  with open("{}.txt".format(file_name), "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

# 2 Run the process

In [None]:
# Download every report, echoing each URL as progress output
for report_url in docs:
  print(report_url)
  collect_doc(report_url)

In [20]:
# Record the time of reports
files_in_range = {}
for link in docs:
  page = requests.get(link)
  # randomised pause of 0.4-0.8 s after each request
  time.sleep(0.3+random.randint(1,5)/10)
  # page.text is already a str, so from_encoding is not passed: Beautiful
  # Soup ignores it for str markup and emits a warning.
  soup = BeautifulSoup(page.text, "html.parser")
  file_name = link.split("/")[-1]
  for div_t in soup.find_all(name='div', attrs={'class':'submitted meta-text'}):
    tokens = div_t.get_text().split()
    # the sixth whitespace-separated token is stored as the date value
    # (presumably the release year - confirm against the page layout);
    # a shorter text is reported and skipped rather than raising IndexError
    # and aborting the whole crawl
    if len(tokens) < 6:
      print("no release date found for {}".format(link))
      continue
    files_in_range["{}.txt".format(file_name)] = tokens[5]



In [21]:
# Keep only the reports whose recorded value is greater than 2011
# (entries equal to 2011 are dropped as well)
with open("valid_files.txt", "w") as f:
  for report_file, recorded in files_in_range.items():
    if int(recorded)>2011:
      f.write(report_file+'\n')

In [14]:
# Zip all the reports
with ZipFile("reports.zip","w") as newZip:
  for report_url in docs:
    # Nothing is assigned to `name` here: that identifier is the module-level
    # tag list defined with the text-collection utilities, and rebinding it
    # to a str would break a later re-run of the collection cells.
    newZip.write("{}.txt".format(report_url.split("/")[-1]))