In [3]:
# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

In [5]:
# Read the complete EDGAR submission text for this filing into memory.
with open("/root/sample/sec-edgar-filings/0000320193/10-Q/0000320193-17-000009/full-submission.txt") as fp:
    raw_10q = fp.read()

# Preview the first 1300 characters as a quick sanity check of what was loaded.
print(raw_10q[:1300])

<SEC-DOCUMENT>0000320193-17-000009.txt : 20170802
<SEC-HEADER>0000320193-17-000009.hdr.sgml : 20170802
<ACCEPTANCE-DATETIME>20170802163128
ACCESSION NUMBER:		0000320193-17-000009
CONFORMED SUBMISSION TYPE:	10-Q
PUBLIC DOCUMENT COUNT:		72
CONFORMED PERIOD OF REPORT:	20170701
FILED AS OF DATE:		20170802
DATE AS OF CHANGE:		20170802

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			APPLE INC
		CENTRAL INDEX KEY:			0000320193
		STANDARD INDUSTRIAL CLASSIFICATION:	ELECTRONIC COMPUTERS [3571]
		IRS NUMBER:				942404110
		STATE OF INCORPORATION:			CA
		FISCAL YEAR END:			0930

	FILING VALUES:
		FORM TYPE:		10-Q
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-36743
		FILM NUMBER:		171000359

	BUSINESS ADDRESS:	
		STREET 1:		ONE INFINITE LOOP
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014
		BUSINESS PHONE:		(408) 996-1010

	MAIL ADDRESS:	
		STREET 1:		ONE INFINITE LOOP
		CITY:			CUPERTINO
		STATE:			CA
		ZIP:			95014

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	APPLE COMPUTER INC
		DATE OF NAME

In [6]:
# STEP 3: Locate the 10-Q document inside the full submission text

# A "<TYPE>" line names the kind of document that follows; the match runs from
# the tag up to (but not including) the end of that line.
type_pattern = re.compile(r'<TYPE>[^\n]+')

# Opening and closing tags that delimit each embedded document.
doc_end_pattern = re.compile(r'</DOCUMENT>')
doc_start_pattern = re.compile(r'<DOCUMENT>')

# Build three parallel lists from the raw submission text.

# The file holds many <DOCUMENT> blocks (the 10-Q itself plus its exhibits).
# Record where each opening tag ENDS and where each closing tag STARTS, so that
# raw_10q[start:end] later yields exactly the text between the two tags.
doc_start_is = [tag.end() for tag in doc_start_pattern.finditer(raw_10q)]
doc_end_is = [tag.start() for tag in doc_end_pattern.finditer(raw_10q)]

# Every <TYPE> match looks like '<TYPE>10-Q'; cutting off the tag prefix leaves
# only the document type name (e.g. '10-Q').
type_tag_len = len('<TYPE>')
doc_types = [hit.group()[type_tag_len:] for hit in type_pattern.finditer(raw_10q)]

In [7]:
# Walk the three parallel lists together and keep only the text of the
# document whose type is exactly '10-Q'. If several documents carry that type,
# the last one wins because they share the same dictionary key.
document = {
    kind: raw_10q[begin:finish]
    for kind, begin, finish in zip(doc_types, doc_start_is, doc_end_is)
    if kind == '10-Q'
}

In [8]:
# Show the first 500 characters of the extracted 10-Q document
document['10-Q'][:500]

'\n<TYPE>10-Q\n<SEQUENCE>1\n<FILENAME>a10-qq32017712017.htm\n<DESCRIPTION>10-Q\n<TEXT>\n<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">\n<html>\n\t<head>\n\t\t<!-- Document created using Wdesk 1 -->\n\t\t<!-- Copyright 2017 Workiva -->\n\t\t<title>Document</title>\n\t</head>\n\t<body style="font-family:Times New Roman;font-size:10pt;">\n<div><a name="s6F5A529DE2B953B591D9B392DAF0FBAE"></a></div><div style="line-height:120%;text-align:center;font-size:10pt;"><div st'

In [9]:
# STEP 4: Locate the Part and Item headings inside the 10-Q document

# Three alternatives, tried in this order at each position:
#   1. '>Item' + (whitespace | &#160; | &nbsp;) + (1A|1|2|3|4) + optional '.'
#   2. 'ITEM' + whitespace + (1A|1|2|3|4)
#   3. 'PART' + whitespace + (II|I)
item_heading_pattern = r'(>Item(\s|&#160;|&nbsp;)(1A|1|2|3|4)\.{0,1})|(ITEM\s(1A|1|2|3|4))|(PART\s(II|I))'
regex = re.compile(item_heading_pattern)

# Lazily scan the 10-Q text for every heading match
matches = regex.finditer(document['10-Q'])

# Print each match object (its span and the matched text), one per line
for heading in matches:
    print(heading)

<re.Match object; span=(25769, 25782), match='>Item&#160;1.'>
<re.Match object; span=(26683, 26696), match='>Item&#160;2.'>
<re.Match object; span=(27645, 27658), match='>Item&#160;3.'>
<re.Match object; span=(28574, 28587), match='>Item&#160;4.'>
<re.Match object; span=(30123, 30136), match='>Item&#160;1.'>
<re.Match object; span=(31011, 31025), match='>Item&#160;1A.'>
<re.Match object; span=(31895, 31908), match='>Item&#160;2.'>
<re.Match object; span=(32825, 32838), match='>Item&#160;3.'>
<re.Match object; span=(33727, 33740), match='>Item&#160;4.'>
<re.Match object; span=(36487, 36493), match='PART I'>
<re.Match object; span=(36877, 36885), match='>Item 1.'>
<re.Match object; span=(1271318, 1271326), match='>Item 2.'>
<re.Match object; span=(1708652, 1708660), match='>Item 3.'>
<re.Match object; span=(1710244, 1710252), match='>Item 4.'>
<re.Match object; span=(1713465, 1713472), match='PART II'>
<re.Match object; span=(1713903, 1713911), match='>Item 1.'>
<re.Match object; span=(1

In [10]:
# Run the search again to get a fresh iterator of heading matches.
matches = regex.finditer(document['10-Q'])

# One row per heading match: the lower-cased matched text plus its start/end
# offsets within document['10-Q'].
# The column names are passed to the constructor rather than assigned to
# `.columns` afterwards: if nothing matched, the frame built from an empty
# list has no columns, and assigning three names to it raises a
# length-mismatch ValueError. With `columns=` the result is simply an empty
# frame that still has the expected three columns.
test_df = pd.DataFrame(
    [(m.group().lower(), m.start(), m.end()) for m in matches],
    columns=['item', 'start', 'end'],
)

# Display the dataframe
test_df

Unnamed: 0,item,start,end
0,>item&#160;1.,25769,25782
1,>item&#160;2.,26683,26696
2,>item&#160;3.,27645,27658
3,>item&#160;4.,28574,28587
4,>item&#160;1.,30123,30136
5,>item&#160;1a.,31011,31025
6,>item&#160;2.,31895,31908
7,>item&#160;3.,32825,32838
8,>item&#160;4.,33727,33740
9,part i,36487,36493


In [11]:
# Normalise the heading labels so they can be compared with plain strings such
# as 'item1a' or 'partii'. One pass removes everything that is not part of
# the label itself:
#   &#160; / &nbsp;  HTML non-breaking-space entities
#   \s               any whitespace; the heading regex accepts `\s` between the
#                    keyword and the number, so tabs and newlines have to be
#                    stripped as well as plain spaces
#   \.               the optional trailing period
#   >                the '>' that the first regex alternative captures
# The pattern is a raw string because '\.' in a normal string literal is an
# invalid escape sequence that Python warns about.
# Only the 'item' column is rewritten; 'start' and 'end' hold integer offsets.
test_df['item'] = test_df['item'].str.replace(r'&#160;|&nbsp;|\s|\.|>', '', regex=True)

# display the dataframe
test_df

Unnamed: 0,item,start,end
0,item1,25769,25782
1,item2,26683,26696
2,item3,27645,27658
3,item4,28574,28587
4,item1,30123,30136
5,item1a,31011,31025
6,item2,31895,31908
7,item3,32825,32838
8,item4,33727,33740
9,parti,36487,36493


In [12]:
# Build `pos_dat`: one row per Item heading of the report body, labelled
# '<item>_<part>' (e.g. 'item1a_2' is Item 1A of Part II).
#
# `test_df` is only read, never modified, so this cell gives the same result
# every time it is run.

headings = test_df['item'].tolist()

# Both Part headings are required to label the items; fail with a clear
# message instead of silently mis-trimming the table when one is missing.
if 'parti' not in headings:
    raise ValueError("no 'PART I' heading found in the 10-Q document")
first_part_i = headings.index('parti')
if 'partii' not in headings[first_part_i:]:
    raise ValueError("no 'PART II' heading found after 'PART I' in the 10-Q document")

# Discard every match that precedes the first 'PART I' heading.
body_df = test_df.iloc[first_part_i:].reset_index(drop=True)

# Tag each heading with the Part that is current when it appears.
part = 0
labels = []
for heading in body_df['item']:
    if heading == 'parti':
        part = 1
    elif heading == 'partii':
        part = 2
    labels.append(f'{heading}_{part}')
body_df['item'] = labels

# The Part headings have served their purpose; keep only the Item rows.
is_part_heading = (body_df['item'] == 'parti_1') | (body_df['item'] == 'partii_2')
pos_dat = body_df[~is_part_heading].reset_index(drop=True)

pos_dat

Unnamed: 0,item,start,end
0,item1_1,36877,36885
1,item2_1,1271318,1271326
2,item3_1,1708652,1708660
3,item4_1,1710244,1710252
4,item1_2,1713903,1713911
5,item1a_2,1716620,1716629
6,item2_2,1788842,1788850
7,item3_2,1827721,1827729
8,item4_2,1828516,1828524


In [13]:
# Index the offsets by section label so they can be looked up by name
# (e.g. pos_dat['start'].loc['item1_1']).
# - The guard makes the cell safe to re-run: once 'item' is the index it is no
#   longer a column, and a second set_index('item') would raise KeyError.
# - verify_integrity=True rejects duplicate labels right here; with duplicates
#   a label lookup returns several rows instead of a single offset.
if 'item' in pos_dat.columns:
    pos_dat = pos_dat.set_index('item', verify_integrity=True)

# display the dataframe
pos_dat

Unnamed: 0_level_0,start,end
item,Unnamed: 1_level_1,Unnamed: 2_level_1
item1_1,36877,36885
item2_1,1271318,1271326
item3_1,1708652,1708660
item4_1,1710244,1710252
item1_2,1713903,1713911
item1a_2,1716620,1716629
item2_2,1788842,1788850
item3_2,1827721,1827729
item4_2,1828516,1828524


In [14]:
# Each raw section runs from the start of its own heading up to the start of
# the next heading. Variables are named item_<part>_<item>_raw, while the
# index labels in pos_dat are spelled item<item>_<part>.
filing_text = document['10-Q']
heading_start = pos_dat['start']

# Part I, Item 1
item_1_1_raw = filing_text[heading_start.loc['item1_1']:heading_start.loc['item2_1']]

# Part I, Item 2
item_1_2_raw = filing_text[heading_start.loc['item2_1']:heading_start.loc['item3_1']]

# Part I, Item 3
item_1_3_raw = filing_text[heading_start.loc['item3_1']:heading_start.loc['item4_1']]

# Part II, Item 1A
item_2_1A_raw = filing_text[heading_start.loc['item1a_2']:heading_start.loc['item2_2']]

In [30]:
### STEP 5: Refine the extracted content with BeautifulSoup
### Parse each raw HTML slice into a BeautifulSoup tree using the lxml parser.
### item_1_1_raw is not parsed in this cell.
item_1_2_content, item_1_3_content, item_2_1A_content = (
    BeautifulSoup(raw_html, 'lxml')
    for raw_html in (item_1_2_raw, item_1_3_raw, item_2_1A_raw)
)

In [31]:
# Remove every <table> element (together with its contents) from each parsed
# section, modifying the trees in place.
for section_soup in (item_1_2_content, item_1_3_content, item_2_1A_content):
    for table_tag in section_soup.find_all('table'):
        table_tag.decompose()

In [35]:
# Collect the text fragments of each section; `.stripped_strings` yields each
# string with its surrounding whitespace removed.
item_1_2_list = list(item_1_2_content.stripped_strings)
item_1_3_list = list(item_1_3_content.stripped_strings)
item_2_1A_list = list(item_2_1A_content.stripped_strings)

In [36]:
# Re-assemble each section into a single string and split it into rough
# sentences on '. '.
# The fragments were produced by `.stripped_strings`, so they carry no leading
# or trailing whitespace. Joining them with "" fuses the end of one fragment to
# the start of the next ("...period.Next sentence"), which glues words together
# and hides the '. ' boundary the split relies on. Joining with a single space
# keeps neighbouring fragments apart.
item_1_2_list = " ".join(item_1_2_list).split('. ')
item_1_3_list = " ".join(item_1_3_list).split('. ')
item_2_1A_list = " ".join(item_2_1A_list).split('. ')