In [2]:
import pandas as pd
import os
import openpyxl
import numpy as np
import re
from datetime import datetime
import requests

In [171]:
# Request headers sent to the SEC endpoints. Defined here so the notebook runs on a
# fresh kernel (the name was previously used without being defined in any visible cell).
# NOTE(review): SEC's fair-access guidance asks for a descriptive User-Agent that
# includes contact details -- consider replacing this placeholder.
headers = {"User-Agent": "Mozilla/5.0"}

# CIK of the company to inspect; it is zero-padded to 10 digits in the URL below.
cik = "320193"

# Fetch the company's submission index and fail loudly on an HTTP error instead of
# trying to JSON-decode an error page.
submissions_response = requests.get(
    f"https://data.sec.gov/submissions/CIK{cik:0>10}.json", headers=headers
)
submissions_response.raise_for_status()
edgar_filings = submissions_response.json()

# The "recent" block is a dict of equal-length columns, one entry per filing.
filings = pd.DataFrame(edgar_filings["filings"]["recent"])
# Keep only the quarterly (10-Q) and annual (10-K) reports.
filings = filings.loc[filings["form"].isin(["10-Q","10-K"])].reset_index(drop=True)
filings.head(5)

Unnamed: 0,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,form,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription
0,0000320193-22-000059,2022-04-29,2022-03-26,2022-04-28T18:03:58.000Z,34,10-Q,001-36743,22868650,,6140838,1,1,aapl-20220326.htm,10-Q
1,0000320193-22-000007,2022-01-28,2021-12-25,2022-01-27T18:00:58.000Z,34,10-Q,001-36743,22564628,,5669748,1,1,aapl-20211225.htm,10-Q
2,0000320193-21-000105,2021-10-29,2021-09-25,2021-10-28T18:04:28.000Z,34,10-K,001-36743,211359752,,10502096,1,1,aapl-20210925.htm,10-K
3,0000320193-21-000065,2021-07-28,2021-06-26,2021-07-27T18:03:42.000Z,34,10-Q,001-36743,211119137,,8446381,1,1,aapl-20210626.htm,10-Q
4,0000320193-21-000056,2021-04-29,2021-03-27,2021-04-28T18:02:54.000Z,34,10-Q,001-36743,21866148,,8468959,1,1,aapl-20210327.htm,10-Q


In [170]:


# Download the primary document of a single filing (hard-coded archive URL).
# `headers` is not defined in this cell; it must already exist in the kernel namespace.
url = "https://www.sec.gov/Archives/edgar/data/320193/000032019322000059/aapl-20220326.htm"
req = requests.get(url,headers=headers)

# req.text  # uncomment to inspect the raw response body




In [6]:
# from bs4 import BeautifulSoup

# soup = BeautifulSoup(req.content, 'html.parser')
# soup.prettify()

In [17]:
# Download the Financial_Report.xlsx workbook of one filing and save it locally.
# `headers` is not defined in this cell; it must already exist in the kernel namespace.
req = requests.get("https://www.sec.gov/Archives/edgar/data/320193/000032019322000059/Financial_Report.xlsx",headers=headers)
# Do not save an HTTP error page under an .xlsx name.
req.raise_for_status()

# The context manager guarantees the handle is flushed and closed before the file is read.
with open("file.xlsx","wb") as f:
    bytes_written = f.write(req.content)
# pd.read_excel("file.xlsx", sheet_name=None)  # would load every sheet into a dict

# Show the number of bytes written as the cell output.
bytes_written


63577

In [177]:
class scrapeSECData:
    """Download the ``Financial_Report.xlsx`` workbooks of a company's 10-K / 10-Q filings.

    Constructing an instance runs the whole pipeline:

    1. resolve ``ticker`` to its SEC CIK (``getCIK``),
    2. fetch the recent filings and label each 10-K / 10-Q with a period and a
       fiscal year (``getSECFilings``),
    3. download one workbook per labelled filing into
       ``input/Financial Statement <ticker>/`` (``writeFilings``).

    Parameters
    ----------
    ticker : str
        Ticker symbol exactly as it appears in SEC's ``company_tickers.json``.
    headers : dict, optional
        HTTP headers sent with every request. Defaults to ``HEADERS``.
        NOTE(review): SEC's fair-access guidance asks for a descriptive
        User-Agent with contact details -- consider passing one.

    Attributes
    ----------
    ticker, CIK, filings, fiscalYear
        The ticker, the resolved CIK, the labelled filings table and the
        four-row reference table (10-K plus the three filings listed after it).
    """

    # Default request headers (same User-Agent value the original code declared).
    HEADERS = {"User-Agent": "Mozilla/5.0"}
    # Seconds to wait for a response before giving up instead of hanging forever.
    TIMEOUT = 30
    # Labels for the most recent 10-K and the three filings listed after it.
    PERIOD_LABELS = ["Year Ended", "Q3", "Q2", "Q1"]

    def __init__(self, ticker, headers=None):
        self.ticker = ticker
        # Copy so later mutation of the caller's dict / class default has no effect.
        self.headers = dict(headers) if headers else dict(self.HEADERS)
        self.CIK = self.getCIK()
        self.filings, self.fiscalYear = self.getSECFilings()
        self.writeFilings()

    def _get(self, url):
        """Issue a GET request with this instance's headers and the class timeout."""
        return requests.get(url, headers=self.headers, timeout=self.TIMEOUT)

    def getCIK(self):
        """Return the SEC CIK for ``self.ticker``.

        Raises
        ------
        KeyError
            If the ticker is not present in ``company_tickers.json``.
        """
        response = self._get("https://www.sec.gov/files/company_tickers.json")
        response.raise_for_status()
        # The payload is a dict of records; each record carries "ticker" and "cik_str".
        symbol_to_cik = response.json()
        ciks = {info["ticker"]: info["cik_str"] for info in symbol_to_cik.values()}
        if self.ticker not in ciks:
            raise KeyError(f"Ticker {self.ticker!r} not found in SEC company_tickers.json")
        return ciks[self.ticker]

    def getSECFilings(self):
        """Fetch recent filings and label the 10-K / 10-Q rows.

        Returns
        -------
        (filings, fiscalYear) : tuple of pandas.DataFrame
            ``filings`` holds every 10-K / 10-Q whose (filing month, form) pair
            matches the reference year, with added ``Period``, ``Month``,
            ``Year`` and ``Fiscal Year`` columns, sorted by ``reportDate``
            descending. ``fiscalYear`` is the four-row reference table.

        Raises
        ------
        IndexError
            If the recent filings contain no 10-K.
        ValueError
            If fewer than three filings follow the most recent 10-K.

        Notes
        -----
        * Labels are matched by *filing month* with an inner merge, so a
          filing made in a month that does not occur in the reference year is
          dropped.
        * ``Fiscal Year`` starts as the calendar year of the filing date and is
          incremented for 10-Qs filed in a later month than the 10-K.
          NOTE(review): for a calendar-year company this labels the quarter
          ended 2022-06-30 as "Q2 2023" -- confirm this convention is intended.
        """
        response = self._get(f"https://data.sec.gov/submissions/CIK{self.CIK:0>10}.json")
        response.raise_for_status()
        edgar_filings = response.json()
        filings = pd.DataFrame(edgar_filings["filings"]["recent"])

        # Keep only quarterly and annual reports.
        filings = filings.loc[filings["form"].isin(["10-Q", "10-K"])].reset_index(drop=True)

        # Calendar month / year in which each report was filed.
        filing_dates = pd.to_datetime(filings["filingDate"])
        filings["Month"] = filing_dates.dt.month.astype("int")
        filings["Year"] = filing_dates.dt.year.astype("int")

        # Reference year: the most recent 10-K and the three filings listed after it
        # (the list is newest-first, so these are the preceding Q3, Q2 and Q1).
        tenk_rows = filings.index[filings["form"] == "10-K"]
        if len(tenk_rows) == 0:
            raise IndexError(f"No 10-K found in the recent filings of {self.ticker!r}")
        TenKIndex = tenk_rows[0]
        fiscalYear = filings.loc[TenKIndex:TenKIndex + 3, ["Month", "form"]].copy()
        if len(fiscalYear) != len(self.PERIOD_LABELS):
            raise ValueError(
                f"Expected a 10-K followed by three 10-Qs for {self.ticker!r}, "
                f"found only {len(fiscalYear)} filing(s)"
            )
        fiscalYear["Period"] = list(self.PERIOD_LABELS)

        # Propagate the period labels to every year via the (Month, form) pair.
        filings = pd.merge(fiscalYear, filings, on=["Month", "form"]).sort_values(
            by="reportDate", ascending=False
        )

        # Fiscal year defaults to the filing's calendar year ...
        filings["Fiscal Year"] = filings["Year"]
        # ... and 10-Qs filed in a later month than the 10-K are counted toward the next one.
        TenKMo = fiscalYear.loc[TenKIndex, "Month"]
        mos = fiscalYear.loc[fiscalYear["Month"] > TenKMo, "Month"]
        filings.loc[filings["Month"].isin(mos), "Fiscal Year"] += 1

        return filings, fiscalYear

    def writeFilings(self):
        """Download ``Financial_Report.xlsx`` for every row of ``self.filings``.

        Files are written to ``input/Financial Statement <ticker>/`` (created if
        missing) as ``"<Period> <Fiscal Year>.xlsx"``, with ``" (<Year>)"``
        appended when the filing year differs from the fiscal year. A filing
        whose download returns an HTTP error is reported and skipped rather
        than saved as a corrupt workbook.
        """
        folder = os.path.join("input", f"Financial Statement {self.ticker}")
        # Creates "input" as well when it does not exist yet; no-op if already present.
        os.makedirs(folder, exist_ok=True)

        for _, row in self.filings.iterrows():
            period = row["Period"]
            fiscalYear = row["Fiscal Year"]
            Year = row["Year"]
            # Archive paths use the accession number without dashes.
            accessionNum = row["accessionNumber"].replace("-", "")

            url = f"https://www.sec.gov/Archives/edgar/data/{self.CIK}/{accessionNum}/Financial_Report.xlsx"
            response = self._get(url)

            if fiscalYear != Year:
                name = os.path.join(folder, f"{period} {fiscalYear} ({Year}).xlsx")
            else:
                name = os.path.join(folder, f"{period} {fiscalYear}.xlsx")
            print(name, url)

            if not response.ok:
                print(f"  skipped: HTTP {response.status_code}")
                continue

            # Context manager closes the handle even if the write fails.
            with open(name, "wb") as file:
                file.write(response.content)
    
        

In [178]:
# Instantiating runs the full pipeline from __init__: look up the CIK for the ticker,
# fetch and label the 10-K / 10-Q filings, and download one workbook per filing
# (each target path and source URL is printed as it is written).
d = scrapeSECData("GOOGL")

input/Financial Statement GOOGL/Q2 2023 (2022).xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204422000071/Financial_Report.xlsx
input/Financial Statement GOOGL/Q1 2023 (2022).xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204422000029/Financial_Report.xlsx
input/Financial Statement GOOGL/Year Ended 2022.xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204422000019/Financial_Report.xlsx
input/Financial Statement GOOGL/Q3 2022 (2021).xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204421000057/Financial_Report.xlsx
input/Financial Statement GOOGL/Q2 2022 (2021).xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204421000047/Financial_Report.xlsx
input/Financial Statement GOOGL/Q1 2022 (2021).xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204421000020/Financial_Report.xlsx
input/Financial Statement GOOGL/Year Ended 2021.xlsx https://www.sec.gov/Archives/edgar/data/1652044/000165204421000010/Financial_Report.xlsx
input/Finan

In [167]:

# Display the labelled filings table stored on the instance (Period, Year and
# Fiscal Year are the columns added by getSECFilings).
d.filings

Unnamed: 0,Month,form,Period,accessionNumber,filingDate,reportDate,acceptanceDateTime,act,fileNumber,filmNumber,items,size,isXBRL,isInlineXBRL,primaryDocument,primaryDocDescription,Year,Fiscal Year
10,7,10-Q,Q2,0001652044-22-000071,2022-07-27,2022-06-30,2022-07-26T19:29:36.000Z,34,001-37580,221108491,,11801686,1,1,goog-20220630.htm,10-Q,2022,2023
16,4,10-Q,Q1,0001652044-22-000029,2022-04-27,2022-03-31,2022-04-26T20:51:20.000Z,34,001-37580,22855985,,10159750,1,1,goog-20220331.htm,10-Q,2022,2023
0,2,10-K,Year Ended,0001652044-22-000019,2022-02-02,2021-12-31,2022-02-01T21:08:02.000Z,34,001-37580,22581247,,15044932,1,1,goog-20211231.htm,10-K,2022,2022
5,10,10-Q,Q3,0001652044-21-000057,2021-10-27,2021-09-30,2021-10-26T19:47:06.000Z,34,001-37580,211350220,,11760233,1,1,goog-20210930.htm,10-Q,2021,2022
11,7,10-Q,Q2,0001652044-21-000047,2021-07-28,2021-06-30,2021-07-27T20:16:30.000Z,34,001-37580,211119442,,11969070,1,1,goog-20210630.htm,10-Q,2021,2022
17,4,10-Q,Q1,0001652044-21-000020,2021-04-28,2021-03-31,2021-04-27T19:17:57.000Z,34,001-37580,21860864,,10018519,1,1,goog-20210331.htm,10-Q,2021,2022
1,2,10-K,Year Ended,0001652044-21-000010,2021-02-03,2020-12-31,2021-02-02T20:12:25.000Z,34,001-37580,21583716,,14948616,1,1,goog-20201231.htm,10-K,2021,2021
6,10,10-Q,Q3,0001652044-20-000050,2020-10-30,2020-09-30,2020-10-29T19:30:04.000Z,34,001-37580,201274199,,12756554,1,1,goog-20200930.htm,10-Q,2020,2021
12,7,10-Q,Q2,0001652044-20-000032,2020-07-31,2020-06-30,2020-07-30T19:57:55.000Z,34,001-37580,201062550,,15757485,1,1,goog-20200630.htm,10-Q,2020,2021
18,4,10-Q,Q1,0001652044-20-000021,2020-04-29,2020-03-31,2020-04-28T18:52:33.000Z,34,001-37580,20826733,,12641678,1,1,goog-20200331.htm,10-Q,2020,2021
