## Imports

In [0]:
!pip install sec-edgar-downloader



In [0]:
import sec_edgar_downloader
import google.colab as colab
import shutil
import os
import pandas as pd
import random
import numpy as np

import re
import requests
import unicodedata
from bs4 import BeautifulSoup

### Mount Google Drive

In [0]:
def mount_google_drive():
	'''
	# Functionality
		Mount google drive. Since colab does not save files, we want to make it easier to directly access files in google drive.
	# Arguments
		Nothing
	# Returns
		drive_root: the working directory mounted
	'''
	mount_directory = "/content/gdrive"
	drive = colab.drive
	drive.mount(mount_directory, force_remount=True)
	drive_root = mount_directory + "/" + list(filter(lambda x: x[0] != '.', os.listdir(mount_directory)))[0]
	return drive_root

def get_all_files_from_dir(directory):

    '''
    # Functionality
    Get the whole list of files in a folder
	# Arguments
		directory: string.  (e.g.,Users/laowang/~vgg/data/flowers/102/102flowers.tgz)
	# Returns
		None
	'''
    file_paths = []
    for root, dirs, files in os.walk(directory):
        file_paths += [os.path.join(root, x) for x in files]
    return sorted(file_paths)

In [0]:
ROOT_DIR =  mount_google_drive() + "/ECE457B-Project/"
DATASET_DIR = ROOT_DIR + "sec8ks/"
print(DATASET_DIR)

Mounted at /content/gdrive
/content/gdrive/My Drive/ECE457B-Project/sec8ks/


## Retrieval from Internet

In [0]:
downloader = sec_edgar_downloader.Downloader(DATASET_DIR)

In [0]:
downloader.get("8-K", "CRM", after_date="20100101", before_date="20200220")

142

In [0]:
len(get_all_files_from_dir(DATASET_DIR))

93

## Parse 8-k reports

In [0]:
def extract_fin_info(file_name):
  with open(file_name, "r") as f:
    soup = BeautifulSoup(f.read(), "html")
    found = False

  date = str(soup.find("acceptance-datetime"))[21:29]

  for bold in soup.findAll('b'):
    if str(bold).find("Financial Statements and Exhibits") != -1:
      found = True
      break
  if found:
    for table in soup.findAll('table'):
      if str(table).find("Description") != -1: # Normally Item description
        sample = pd.read_html(str(table))[0]
        return list(sample[2].dropna()[1:]), date
  else:
    return "", date

In [0]:
results = []

for file_name in get_all_files_from_dir(DATASET_DIR + "sec_edgar_filings/CRM/"):
  try:
    sentences, date = extract_fin_info(file_name)
  except:
    pass
  if sentences:
    results.append([date, ". ".join(sentences)])
 

In [0]:
len(results)

47

In [0]:
results

[['20100119',
  'Indenture between salesforce.com, inc. and U.S. Bank National Association, dated as of January 19, 2010.. Purchase Agreement dated January 12, 2010 between salesforce.com, inc. and Merrill Lynch, Pierce, Fenner & Smith Incorporated, as representative of the several initial purchasers named in Schedule\xa0A thereto.. Form of Convertible Bond Hedge Confirmation. Form of Warrant Confirmation. Press Release, dated January 19, 2010, Announcing the Closing of the Offering of the Notes.'],
 ['20100119',
  'Indenture between salesforce.com, inc. and U.S. Bank National Association, dated as of January 19, 2010.. Purchase Agreement dated January 12, 2010 between salesforce.com, inc. and Merrill Lynch, Pierce, Fenner & Smith Incorporated, as representative of the several initial purchasers named in Schedule\xa0A thereto.. Form of Convertible Bond Hedge Confirmation. Form of Warrant Confirmation. Press Release, dated January 19, 2010, Announcing the Closing of the Offering of the 

In [0]:
np.save(ROOT_DIR + "sec.npy", np.array(results))