## Imports

In [0]:
!pip install sec-edgar-downloader



In [0]:
import sec_edgar_downloader
import google.colab as colab
import shutil
import os
import pandas as pd
import random
from IPython.display import display_html

import re
import requests
import unicodedata
from bs4 import BeautifulSoup
random.seed(716) 

### Mount Google Drive

In [0]:
def mount_google_drive():
    '''
    # Functionality
        Mount Google Drive inside Colab. Colab does not persist files, so the
        notebook reads and writes its data through the mounted drive instead.
    # Arguments
        Nothing
    # Returns
        drive_root: string. "/content/gdrive/" followed by the alphabetically
        first non-hidden entry found under the mount point.
    # Raises
        FileNotFoundError: if the mount point contains no non-hidden entry.
    '''
    mount_directory = "/content/gdrive"
    colab.drive.mount(mount_directory, force_remount=True)

    # os.listdir returns names in arbitrary order; sorting makes the chosen
    # folder the same on every run. Dot-prefixed entries are skipped.
    visible_entries = sorted(
        entry for entry in os.listdir(mount_directory) if not entry.startswith(".")
    )
    if not visible_entries:
        raise FileNotFoundError(
            "No visible folder found under " + mount_directory
        )

    drive_root = mount_directory + "/" + visible_entries[0]
    return drive_root

def get_all_files_from_dir(directory):
    '''
    # Functionality
        Recursively collect the path of every file below a folder.
    # Arguments
        directory: string. Folder to walk (sub-folders are included).
    # Returns
        Sorted list of file paths, each one built by joining the folder that
        os.walk reports with the file name. Empty list if nothing is found.
    '''
    return sorted(
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    )

In [3]:
# Project folders on the mounted drive. The trailing "/" is kept on purpose:
# later cells build file paths from DATASET_DIR by plain string concatenation.
ROOT_DIR = "{}/ECE457B-Project/".format(mount_google_drive())
DATASET_DIR = "{}sec8ks/".format(ROOT_DIR)
print(DATASET_DIR)

Mounted at /content/gdrive
/content/gdrive/My Drive/ECE457B-Project/sec8ks/


## Retrieval from Internet

In [0]:
# Downloader object configured with DATASET_DIR.
# NOTE(review): presumably filings are saved beneath this folder (a later cell
# reads them from DATASET_DIR + "sec_edgar_filings/...") — confirm against the
# installed sec-edgar-downloader version.
downloader = sec_edgar_downloader.Downloader(DATASET_DIR)

In [0]:
# downloader.get("8-K", "CRM", after_date="20120103", before_date="20191230")

## Parse 8-K reports

In [0]:
# file_name = DATASET_DIR + "sec_edgar_filings/GOOGL/8-K/0001652044-16-000035.txt"

In [0]:
# with open(file_name, "r") as f:
#     soup = BeautifulSoup(f.read(), "html")
#     found = False
# date = str(soup.find("acceptance-datetime"))[21:29]
# date

In [0]:
# for table in soup.findAll('table'):
#   if str(table).find("Financial Statements and Exhibits") != -1:
#     found = True

#   if found:
#     if str(table).find("Description") != -1: # Normally Item description
#       sample = pd.read_html(str(table))[0]
#       print(list(sample[1].dropna()[1:]), date)
#       break

In [0]:
# def extract_fin_info(file_name):
#   with open(file_name, "r") as f:
#     soup = BeautifulSoup(f.read(), "html")
#     found = False

#   date = str(soup.find("acceptance-datetime"))[21:29]

#   for table in soup.findAll('table'):
#     if str(table).find("Financial Statements and Exhibits") != -1:
#       found = True

#     if found:
#       if str(table).find("Description") != -1: # Normally Item description
#         sample = pd.read_html(str(table))[0]
#         return list(sample[1].dropna()[1:]), date

def extract_fin_info(file_name):
  '''
  # Functionality
    Read one filing and pull out (a) the entries listed in its exhibit
    table and (b) the date sliced from its acceptance-datetime tag.
  # Arguments
    file_name: string. Path of the filing text file to parse.
  # Returns
    (descriptions, date)
      descriptions: list built from column 2 of the first table whose markup
        contains "Description" (nulls dropped, then the first remaining value
        skipped); or the string "Empty" when the filing has no bold
        "Financial Statements and Exhibits" heading or no such table.
      date: string. The 8 characters sliced from the acceptance-datetime tag.
  # Raises
    Whatever pandas raises if the matching table cannot be parsed or has no
    column 2 (e.g. ValueError, KeyError).
  '''
  from io import StringIO  # local import: read_html is fed a file-like object

  with open(file_name, "r") as f:
    soup = BeautifulSoup(f.read(), "html")

  # Assumes str(tag) starts with "<acceptance-datetime>" (21 characters), so
  # the slice keeps the 8 characters right after the opening tag.
  date = str(soup.find("acceptance-datetime"))[21:29]

  has_exhibit_heading = any(
    "Financial Statements and Exhibits" in str(bold)
    for bold in soup.findAll('b')
  )

  if has_exhibit_heading:
    for table in soup.findAll('table'):
      table_html = str(table)
      if "Description" in table_html: # Normally Item description
        # Wrapping in StringIO avoids passing literal HTML to read_html,
        # which newer pandas versions deprecate.
        sample = pd.read_html(StringIO(table_html))[0]
        return list(sample[2].dropna()[1:]), date

  # Reached when the heading is missing OR when no table mentions
  # "Description"; always return a 2-tuple so callers can unpack it.
  return "Empty", date

In [0]:
results = []
dates = []
failed_files = []  # (path, error) for every filing that could not be parsed

for file_name in get_all_files_from_dir(DATASET_DIR + "sec_edgar_filings/CRM/"):
  try:
    result, date = extract_fin_info(file_name)
  except Exception as error:
    # Skip this filing. Falling through to the appends below would record the
    # previous iteration's result/date again (or raise NameError if the very
    # first file fails), silently duplicating rows.
    failed_files.append((file_name, repr(error)))
    continue
  results.append(result)
  dates.append(date)

if failed_files:
  print("Skipped {} filing(s) that could not be parsed".format(len(failed_files)))

In [11]:
# Counts every file under DATASET_DIR (the whole download folder), whereas the
# extraction loop only walks DATASET_DIR + "sec_edgar_filings/CRM/", so this
# number is not expected to equal len(results).
len(get_all_files_from_dir(DATASET_DIR))

154

In [12]:
# Number of entries collected by the extraction loop over the CRM folder.
len(results)

117

In [13]:
# Show the collected values; "Empty" is what extract_fin_info returns when a
# filing has no bold "Financial Statements and Exhibits" heading.
results

['Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 'Empty',
 ['Agreement and Plan of Merger dated June 3, 2012, by and among salesforce.com, inc., Bullseye Merger Corporation, Buddy Media, Inc., and Shareholder Representative Services LLC'],
 ['Agreement and Plan of Merger dated June 3, 2012, by and among salesforce.com, inc., Bullseye Merger Corporation, Buddy Media, Inc., and Shareholder Representative Services LLC'],
 ['Agreement and Plan of Merger dated June 3, 2012, by a

In [14]:
# Show the 8-character strings sliced from each filing's acceptance-datetime tag.
dates

['20191105',
 '20191122',
 '20140821',
 '20140821',
 '20141112',
 '20141119',
 '20141119',
 '20141219',
 '20150225',
 '20150225',
 '20150225',
 '20150225',
 '20150225',
 '20151124',
 '20151223',
 '20160202',
 '20160224',
 '20160419',
 '20160518',
 '20160711',
 '20160801',
 '20160831',
 '20160831',
 '20161003',
 '20161003',
 '20161117',
 '20161128',
 '20170228',
 '20170518',
 '20170822',
 '20170908',
 '20171106',
 '20171106',
 '20171121',
 '20171121',
 '20171218',
 '20180223',
 '20180228',
 '20180326',
 '20180402',
 '20180529',
 '20180829',
 '20180907',
 '20180907',
 '20181127',
 '20190304',
 '20190415',
 '20190604',
 '20190607',
 '20190807',
 '20190822',
 '20190822',
 '20191203',
 '20120106',
 '20120106',
 '20120302',
 '20120330',
 '20120330',
 '20120604',
 '20120604',
 '20120813',
 '20120813',
 '20120827',
 '20120827',
 '20121130',
 '20130222',
 '20130222',
 '20130313',
 '20130318',
 '20130321',
 '20130329',
 '20130329',
 '20130604',
 '20130611',
 '20130715',
 '20130715',
 '20130912',

In [0]:
# Sanity check: the loop appends to both lists together, so their lengths must match.
assert len(dates) == len(results)