In [20]:
#import packages
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import qgrid
from urllib.request import urlretrieve
from collections import Counter

In [21]:
# URL of the filing text file in the SEC EDGAR archive that the next cell downloads
url = "https://www.sec.gov/Archives/edgar/data/812011/0000927356-96-001229.txt"

In [22]:
# Download the file at `url` and save it in the working directory as "MTN_10K".
# urlretrieve returns a (local_filename, headers) tuple, which is what this cell displays.
urlretrieve(url, "MTN_10K")

('MTN_10K', <http.client.HTTPMessage at 0x1134d1208>)

In [23]:
# Read the downloaded filing into a list of lines (each keeps its trailing newline).
# The with-block guarantees the file handle is closed once the lines are read.
with open("MTN_10K", mode="r") as tenk_file:
    TenK = tenk_file.readlines()

In [24]:
# Find the line numbers in TenK that mark where each financial statement
# starts (lines containing marker `a`) and ends (lines containing marker `b`).
a = "(IN THOUSANDS"
b = "The accompanying notes to consolidated financial statements"

start_points = [idx for idx, text in enumerate(TenK) if a in text]
end_points = [idx for idx, text in enumerate(TenK) if b in text]

print(start_points)
print(end_points)

[1449, 1517, 1593, 1656, 2212]
[1508, 1584, 1647, 1740]


In [25]:
# Pair each start line with the end line at the same list position, keyed by start.
# zip stops at the shorter list, so any surplus start (or end) point is left out.
all_points = {begin: end for begin, end in zip(start_points, end_points)}
print(all_points)

{1449: 1508, 1517: 1584, 1593: 1647, 1656: 1740}


In [26]:
# Cut TenK into one sub-list of lines per (start, end) pair in all_points.
# Each slice includes the start line and stops just before the end line.
stmt_list = [TenK[begin:end] for begin, end in all_points.items()]

In [27]:
# Give the first three statements in stmt_list their own names
stmt1, stmt2, stmt3 = stmt_list[0], stmt_list[1], stmt_list[2]

In [28]:
# Turn the <C> column tags in stmt1 into pipe markers: the first tag on a line
# becomes "|" and every remaining tag on that line becomes "||".
column_tag = re.compile(r"(?:<C>)")
stmt1 = [
    column_tag.sub(r"||", column_tag.sub(r"|", line, count=1))
    for line in stmt1
]

In [29]:
# Work out the column offsets for stmt1: for every line, record how many
# characters precede the first "|" and how many precede the first "||".
locators = []

for line in stmt1:
    single_at = line.find("|")
    if single_at != -1:
        locators.append(single_at)
    double_at = line.find("||")
    if double_at != -1:
        locators.append(double_at)

# Keep the two most frequently seen offsets as (offset, count) pairs
locators = Counter(locators).most_common(2)

locator1 = locators[0][0]
locator2 = locators[1][0]
print(locator1)
print(locator2)

52
64


In [11]:
# Turn each locator into a regex that captures a run of exactly that many
# characters, so the remaining delimiters can be inserted into the statement.
def _width_capture(width):
    """Return the pattern "(.{width})" for the given character count."""
    return "(." + "{" + "{}".format(width) + "}" + ")"

regex_locator1 = _width_capture(locator1)
regex_locator2 = _width_capture(locator2)

print(regex_locator1)
print(regex_locator2)

(.{52})
(.{64})


In [12]:
# Join the statement lines into one string, then append a "|" after every match
# of each locator regex (first regex_locator1, then regex_locator2).
stmt1 = "".join(stmt1)

for width_pattern in (regex_locator1, regex_locator2):
    stmt1 = re.sub(width_pattern, r"\1|", stmt1)

# Collapse "| |" and "||" into a single "|"
stmt1 = stmt1.replace("| |", "|").replace("||", "|")

In [13]:
# Show the delimited statement text to check where the "|" markers were inserted
print(stmt1)

               (IN THOUSANDS, EXCEPT SHARE AND PER S|HARE AMOUNT|S)
 
<TABLE>
<CAPTION>
                                                    |SEPTEMBER 3|0, SEPTEMBER 30,
                                                    |    1995   |       1996
                                                    |-----------|-- -------------
<S>                                                 |          |
                      ASSETS
Current assets:
  Cash and cash equivalents........................ |  $ 47,534 |     $ 12,712
  Receivables...................................... |     5,135 |        5,741
  Inventories...................................... |     4,221 |        4,639
  Deferred income taxes (Note 8)................... |     9,500 |       17,200
  Other current assets............................. |     3,716 |        5,490
                                                    |  -------- |     --------
    Total current assets........................... |    70,106 |       45,782
Property

In [14]:
# Write the delimited statement text to its own file, "TenK_stmt1".
# The with-block closes the file on exit, so no explicit close() is needed;
# plain write mode is enough because the file is never read back here.
with open("TenK_stmt1", mode="w") as TenK_stmt1:
    TenK_stmt1.write(stmt1)

In [15]:
# Read the pipe-delimited statement file into a DataFrame, skipping (with a
# warning) any line that has too many fields.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; the
# replacement is `on_bad_lines`. Fall back to the old keyword on older pandas,
# where `on_bad_lines` is rejected with a TypeError.
try:
    df1 = pd.read_csv("TenK_stmt1", sep="|", header=None, on_bad_lines="warn")
except TypeError:
    df1 = pd.read_csv("TenK_stmt1", sep="|", header=None, error_bad_lines=False)

# Clean the cell text. The regex (a raw string, so the backslashes reach the
# regex engine without invalid-escape warnings) removes: periods, hyphens, "=",
# angle brackets, commas, an opening parenthesis followed by four non-digits
# and the rest of the cell, and a digit followed by a closing parenthesis.
# The plain replaces then blank out cells that are exactly one of the leftover
# tag names.
df1 = (
    df1.replace(r"\.|\-|=|<|>|,|\(\D\D\D\D.+|\d\)", "", regex=True)
    .replace("TABLE", "")
    .replace("CAPTION", "")
    .replace("/TABLE", "")
)