# Scraping EDGAR - specific filling (adsh)
#### adsh Accession Number. The 20-character string formed from the 18-digit number assigned by the SEC to each EDGAR submission.
#### can be compared to https://www.sec.gov/dera/data/financial-statement-data-sets.html

## Step 1. Extracting ahef links from page

In [1]:
from urllib.request import Request, urlopen

In [2]:
from bs4 import BeautifulSoup as bs

In [3]:
import requests

In [4]:
def get_data(link):
    hdr = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36'}

    req = requests.get(link,headers=hdr)
    content = req.content
  
    return content

In [5]:
# check specific filling

In [6]:
url = 'https://www.sec.gov/Archives/edgar/data/0000831001/000110465921050513/0001104659-21-050513-index.html'

## Step 2. Extract link for each document

In [7]:
data_list = []

In [8]:
html = get_data(url)
soup = bs(html, "html.parser")
rows = soup.find_all("tr")
for row in rows:
    row_td = row.find_all("td")
    for file in row_td:
        main_links = file.find_all('a')
        for data in main_links:
            data_list.append(data['href'])

In [9]:
len(data_list)

13

In [10]:
data_list[:]

['/ix?doc=/Archives/edgar/data/831001/000110465921050513/c-20210210x8k.htm',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d1charter.htm',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d2bylaws.htm',
 '/ix?doc=/Archives/edgar/data/831001/000110465921050513/c-20210210xex993voti.htm',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d2bylaws001.jpg',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d1charter001.jpg',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d1charter002.jpg',
 '/Archives/edgar/data/831001/000110465921050513/0001104659-21-050513.txt',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210.xsd',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210_def.xml',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210_lab.xml',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210_pre.xml',
 '/Archives/edgar/data/831001/000110465921050513/c-20210210x8k_htm.xml']

In [11]:
#data_list

## Step 3. Separate links with different file extensions

In [12]:
import os

In [13]:
file_htm = []
file_xml = []
file_html = []
file_gif = []
file_txt = []
file_jpg = []

In [14]:
os.path.splitext(data_list[1])[1]

'.htm'

In [15]:
for extensions in data_list:
    case = os.path.splitext(extensions)[1]
    if case == '.htm':
        file_htm.append(extensions)
    elif case == '.html':
        file_html.append(extensions)
    elif case == '.xml':
        file_xml.append(extensions) 
    elif case == '.gif':
        file_gif.append(extensions)
    elif case ==  '.txt':
        file_txt.append(extensions)
    elif case ==  '.jpg':
        file_jpg.append(extensions)

In [16]:
len(file_xml)

4

In [17]:
file_xml[1]

'/Archives/edgar/data/831001/000110465921050513/c-20210210_lab.xml'

## Step 4. Converting htm links to text files

In [18]:
if(os.path.isdir('../txt_files_main')):
    print("directory already exists")
else:
    os.mkdir('../txt_files_main')

In [19]:
txt_no = 1

In [20]:
for htmlfiles in file_htm:
    html = get_data("https://www.sec.gov" + htmlfiles)
    print("https://www.sec.gov" + htmlfiles) # for debugging to see which htm will be saved as text file 
    soup = bs(html, features = "html.parser")
    for script in soup(["script","style"]):
        script.extract()
    text = soup.get_text()
        # break into lines
    lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
    chunks = (pharse.strip() for line in lines for pharse in line.split("  "))
        # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    f = open("../txt_files_main/test"+str(txt_no)+".txt","w",encoding="utf-8")
    f.write(text)
    txt_no+=1
    f.close()

https://www.sec.gov/ix?doc=/Archives/edgar/data/831001/000110465921050513/c-20210210x8k.htm
https://www.sec.gov/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d1charter.htm
https://www.sec.gov/Archives/edgar/data/831001/000110465921050513/c-20210210xex99d2bylaws.htm
https://www.sec.gov/ix?doc=/Archives/edgar/data/831001/000110465921050513/c-20210210xex993voti.htm


## Step 5. Converting xml links to text files

In [21]:
if(os.path.isdir('../txt_files_main_xml')):
    print("directory already exists")
else:
    os.mkdir('../txt_files_main_xml')

In [22]:
txt_no = 1000

In [23]:
for htmlfiles in file_xml:
    html = get_data("https://www.sec.gov" + htmlfiles)
    #print("https://www.sec.gov" + htmlfiles) # for debugging to see which htm will be saved as text file 
    soup = bs(html, features = "html.parser")
    for script in soup(["script","style"]):
        script.extract()
    text = soup.get_text()
        # break into lines
    lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
    chunks = (pharse.strip() for line in lines for pharse in line.split("  "))
        # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    f = open("../txt_files_main_xml/test"+str(txt_no)+".txt","w",encoding="utf-8")
    f.write(text)
    txt_no+=1
    f.close()

## Step 6. Saving text files

In [24]:
if(os.path.isdir('../txt_files_main_txt')):
    print("directory already exists")
else:
    os.mkdir('../txt_files_main_txt')

In [25]:
txt_no = 100

In [26]:
for htmlfiles in file_txt:
    html = get_data("https://www.sec.gov" + htmlfiles)
    #print("https://www.sec.gov" + htmlfiles) # for debugging to see which htm will be saved as text file 
    soup = bs(html, features = "html.parser")
    for script in soup(["script","style"]):
        script.extract()
    text = soup.get_text()
        # break into lines
    lines = (line.strip() for line in text.splitlines())
        # break multi-headlines into a line each
    chunks = (pharse.strip() for line in lines for pharse in line.split("  "))
        # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    f = open("../txt_files_main_txt/test"+str(txt_no)+".txt","w",encoding="utf-8")
    f.write(text)
    txt_no+=1
    f.close()