In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 17 14:35:27 2019

@author: Irene
"""
import csv
import xml
import xml.etree.ElementTree as ET
from datetime import datetime
import logging
logger = logging.getLogger(__name__)

from sec_edgar_downloader import Downloader
import pandas as pd
import os
import random 
import requests
import pandas as pd

In [2]:
# Initialize a downloader instance.
# NOTE(review): the argument is presumably the folder the downloader saves
# filings under - confirm against the installed sec_edgar_downloader version.
dl = Downloader("/Users/Irene/Desktop/M/Irene/")

# Three investors: short label --> CIK string (ten characters, zero-padded).
investors = {'BG':'0001061768','LPC':'0001061165','VG':'0001103804'}

# Request the 13F-HR filings of every investor.
# NOTE(review): no filing count is passed, so the library default applies -
# confirm that it matches the intended "latest 2" filings.
for CIK in investors.values():
    # The return value is not used anywhere, so it is discarded instead of
    # being bound over the loop variable as before.
    dl.get_13f_hr_filings(CIK)

In [4]:
# Extract data from EDGAR text files
# Source files from https://github.com/cpackard/fundholdings

class Holding:
    """
    Plain record for a single holding taken from a filing.

    Every constructor argument is stored unchanged on the attribute of the
    same name; nothing is converted or validated.
    """

    def __init__(self, cik, date, name, cusip, value, share):
        # Identifier of the filer and the date attached to the filing.
        self.cik, self.date = cik, date
        # Issuer name and its CUSIP.
        self.name, self.cusip = name, cusip
        # Reported value and share amount.
        self.value, self.share = value, share

def generate_report(folder, cik,accepted_date,holdings):
    """
    Write the holdings of one filing to a comma-delimited CSV report.

    The report is created at ``<folder>/<cik>_<accepted_date>.csv``. It starts
    with the header ``Name,CUSIP,Market value,Share`` and contains one row per
    holding, in iteration order.

    Args:
        folder: Directory (str) the report is written into; it is not created
            here, so it must already exist.
        cik: Filer identifier (str), first part of the file name.
        accepted_date: Date, second part of the file name (passed through
            ``str``).
        holdings: Iterable of objects exposing ``name``, ``cusip``, ``value``
            and ``share`` attributes.

    Returns:
        A one-element list holding the path of the written report.
    """
    reportname = (folder+'/'+cik + '_' + str(accepted_date) + '.csv')
    print('reportname: ', reportname)

    # newline='' is required by the csv module for files it writes to: the
    # writer emits '\r\n' itself, and without this flag platforms that
    # translate '\n' would produce '\r\r\n', i.e. a blank row after each record.
    with open(reportname, 'w', newline='') as csvfile:
        fields = ['Name', 'CUSIP', 'Market value','Share']
        writer = csv.DictWriter(csvfile, fieldnames=fields, delimiter=',')

        writer.writeheader()
        for holding in holdings:
            writer.writerow({'Name': holding.name,
                             'CUSIP': holding.cusip,
                             'Market value':  holding.value,
                             'Share':holding.share})

    return [reportname]

def _short_tag(tag):
    """Helper method to remove any namespaces from the XML tag"""
    return tag[tag.rfind('}')+1:len(tag)]

def _tags_and_vals(root):
    """
    Helper method to recursively search through an XML element,
    returning a dict of tag name --> tag text
    """
    result = {}

    for child in root:
        tag = _short_tag(child.tag)

        if list(child):
            result[tag] = _tags_and_vals(child)
        else:
            result[tag] = child.text

    return result

def get_infotables(root):
    """
    Collect every direct ``infoTable`` child of ``root``.

    ``root`` is expected to be the informationTable XML element. Each child
    whose namespace-stripped tag equals 'infoTable' is converted with
    ``_tags_and_vals``; the resulting dicts (tag name --> tag text) are
    returned as a list in document order. Other children are ignored.
    """
    return [
        _tags_and_vals(entry)
        for entry in root
        if _short_tag(entry.tag) == 'infoTable'
    ]
def get_13f_xml(holdings_statement):
    """
    Pull the information-table XML and two header fields out of the complete
    submission text of a 13F-HR filing.

    The text is scanned line by line (split on '\\n'):

    * before the table starts, the text after the first ':' on a
      'CONFORMED SUBMISSION TYPE' / 'CONFORMED PERIOD OF REPORT' line is
      recorded (stripped);
    * the first line containing 'informationTable' starts the table, and it
      plus every following line is collected until a line containing
      '</XML>' is met (that line is not collected).

    Returns:
        ``(accepted_date, submission_type, xml)`` where ``accepted_date`` is
        the value read from the 'CONFORMED PERIOD OF REPORT' line and ``xml``
        is the collected lines joined without separators. Fields that were
        not found are empty strings.
    """
    period_of_report = ''
    form_type = ''
    table_lines = []
    inside_table = False

    for raw in holdings_statement.split('\n'):
        if inside_table:
            # The closing </XML> wrapper marks the end of the table document.
            if '</XML>' in raw:
                break
            table_lines.append(raw)
            continue

        if 'informationTable' in raw:
            inside_table = True
            table_lines.append(raw)
        elif 'CONFORMED SUBMISSION TYPE' in raw:
            # split(':', 1)[-1] is the whole line when there is no ':'.
            form_type = raw.split(':', 1)[-1].strip()
        elif 'CONFORMED PERIOD OF REPORT' in raw:
            period_of_report = raw.split(':', 1)[-1].strip()

    return period_of_report, form_type, ''.join(table_lines)

def get_13f_holdings(cik, accepted_date, submission_type, holdings_xml):
    """
    Parse the information-table XML of a 13F-HR filing into Holding objects.

    Args:
        cik: Filer identifier, stored on every Holding and returned unchanged.
        accepted_date: Date stored on every Holding and returned unchanged.
        submission_type: Returned unchanged.
        holdings_xml: XML text whose root's ``infoTable`` children describe
            the holdings.

    Returns:
        ``(cik, accepted_date, submission_type, holdings)`` where ``holdings``
        is a list with one Holding per ``infoTable`` entry, built from its
        ``nameOfIssuer``, ``cusip``, ``value`` and ``shrsOrPrnAmt/sshPrnamt``
        values.

        If ``holdings_xml`` is not well-formed, a message is printed and the
        sentinel ``(0, 0, 0, 0)`` is returned instead; callers detect the
        failure by checking for ``cik == 0``.

    Raises:
        KeyError: if an ``infoTable`` entry lacks one of the tags listed above.
    """
    try:
        tree = ET.fromstring(holdings_xml)
    except ET.ParseError:
        # TODO Change this to actual logging
        print('get_13f_holdings expected a well-formed xml but ParseError occured')
        # Best-effort contract: signal the failure with the sentinel rather
        # than raising, so a batch run can skip the file and carry on.
        return 0,0,0,0

    holdings = [Holding(cik, accepted_date, h['nameOfIssuer'], h['cusip'],
                        h['value'], h['shrsOrPrnAmt']['sshPrnamt'])
                for h in get_infotables(tree)]

    return cik, accepted_date, submission_type, holdings

# Build one combined report whose columns are
# ['CIK', 'DATE', 'Name', 'CUSIP', 'Market value', 'Share'].

num =["1061165","1061768",'1103804']
file = ['/Users/Irene/Desktop/M/sec_edgar_filings/%s/13F-HR/'%i for i in num]
# NOTE(review): the per-investor result folders are computed but never written to here.
result=['/Users/Irene/Desktop/M/sec_edgar_filings/%s/result'%i for i in num]
reportname = '/Users/Irene/Desktop/M/result.csv'
holdings_lst=[]

# Walk the three parallel lists together instead of indexing each one by position.
for cik_number, folder, result_folder in zip(num, file, result):
    for filename in os.listdir(folder):
        # Reset on every file: the unpacking below rebinds `cik` (to 0 on a
        # parse failure), so it cannot be carried over between iterations.
        cik = cik_number
        path = os.path.join(folder, filename)
        with open(path) as f:
            holdings_content = f.read()
        accepted_date, submission_type, holdings_xml = get_13f_xml(holdings_content)
        cik, accepted_date, submission_type, holdings = get_13f_holdings(
            cik, accepted_date, submission_type, holdings_xml)
        # get_13f_holdings returns the sentinel (0, 0, 0, 0) when the XML
        # could not be parsed; such files are skipped.
        if cik == 0:
            continue
        print(path)
        holdings_lst.extend(holdings)

# newline='' is required by the csv module for files it writes to; without it,
# platforms that translate '\n' end up with a blank row after every record.
with open(reportname, 'w', newline='') as csvfile:
    fields = ['CIK','DATE', 'Name', 'CUSIP', 'Market value','Share']
    writer = csv.DictWriter(csvfile, fieldnames=fields, delimiter=',')

    writer.writeheader()
    for holding in holdings_lst:
        writer.writerow({'CIK':holding.cik,
                         'DATE':holding.date,
                         'Name': holding.name,
                         'CUSIP': holding.cusip,
                         'Market value':  holding.value,
                         'Share':holding.share})

/Users/Irene/Desktop/M/sec_edgar_filings/1061165/13F-HR/0001567619-19-011247.txt
/Users/Irene/Desktop/M/sec_edgar_filings/1061768/13F-HR/0001567619-19-011239.txt
/Users/Irene/Desktop/M/sec_edgar_filings/1103804/13F-HR/0001103804-19-000006.txt


In [5]:
# Count the distinct issuer names found when the reports of two investors
# are combined, for each of the three possible pairs.

reportname = '/Users/Irene/Desktop/M/result.csv'

file = pd.read_csv(reportname)
df = pd.DataFrame(file)
# Total number of rows in the combined report.
print(len(df['Name']))

# CIKs of the three investors.
a, b, c = 1061165, 1061768, 1103804
print(a,b,c)

# Rows belonging to either investor of each pair.
ab = df[df['CIK'].isin([a, b])]
bc = df[df['CIK'].isin([b, c])]
ac = df[df['CIK'].isin([a, c])]

# One line per pair, in the order ab, bc, ac.
for pair in (ab, bc, ac):
    print(len(set(pair['Name'])))

121
1061165 1061768 1103804
61
87
76
