In [None]:
import requests

# Run configuration for the index download below.
# Years are inclusive on both ends; the full range of interest is 2020 - 2023.
start_date, end_date = 2020, 2020
# Number of quarters fetched per year, counted up from QTR1 (a full year is 4).
quarters = 1
# HTTP headers sent with every index request.
user_agent = {"User-agent": "Mozilla/5.0"}

# Create a function to format and space the data
def format_data(cik, company_name, form_type, date_filed, file_name):
    """Return one index record as a pipe-separated, fixed-width line.

    The CIK (width 7), company name (width 30, truncated to 30 characters),
    form type (width 10, truncated to 10 characters) and filing date
    (width 10) are centred in their columns. The file name is left-aligned
    and padded to 40 characters; it is never truncated.
    """
    columns = (
        format(cik, "^7"),
        format(company_name[:30], "^30"),
        format(form_type[:10], "^10"),
        format(date_filed, "^10"),
        format(file_name, "<40"),
    )
    return "|".join(columns)

# Download the master.idx index for every configured year and quarter and
# write a reformatted copy of all of them into formatted-master.txt.
with open("formatted-master.txt", "wb") as f:
    for year in range(start_date, end_date + 1):
        for quarter in range(1, quarters + 1):
            print(year, quarter)
            url = f"https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/master.idx"

            # Only the network call is guarded: a failed download skips this
            # quarter, while local errors (e.g. a failed write) still surface.
            try:
                response = requests.get(url, headers=user_agent, timeout=60)
                # Treat HTTP error responses as failures instead of silently
                # parsing an error page that contains no index rows.
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"An error occurred: {str(e)}")
                continue

            # errors="replace" keeps a single undecodable byte from discarding
            # the whole quarter.
            lines = response.content.decode("utf-8", errors="replace").splitlines()

            formatted_lines = []
            for line in lines:
                if line.startswith("----"):
                    # Keep the dashed separator row exactly as downloaded.
                    formatted_lines.append(line)
                    continue
                data = line.split("|")
                # Only rows with exactly five pipe-separated fields are kept;
                # every other line of the download is dropped.
                if len(data) == 5:
                    formatted_lines.append(format_data(*data))

            if formatted_lines:
                # End every chunk with a newline so the last row of one
                # quarter is not fused with the first row of the next.
                formatted_content = "\n".join(formatted_lines) + "\n"
                f.write(formatted_content.encode("utf-8"))

In [None]:
import json
import sqlite3
from Definitions import doGet
from Dummy_Data import ciks

# Create database
# Opens the SQLite file Database.db (relative to the current working
# directory) and keeps one module-level connection and cursor that the rest
# of the script uses for all statements.
conn = sqlite3.connect("Database.db")
cursor = conn.cursor()

def simplify_json(data, remove_filings=True):
    """Reduce a decoded submissions document to one flat dictionary.

    With ``remove_filings`` true, return a new dict holding every top-level
    entry of ``data`` except ``"filings"``; values that are not already
    strings are serialised with ``json.dumps``. Otherwise return a shallow
    copy of ``data["filings"]["recent"]``. ``data`` itself is not modified.
    """
    if not remove_filings:
        return data["filings"]["recent"].copy()

    return {
        key: value if isinstance(value, str) else json.dumps(value)
        for key, value in data.items()
        if key != "filings"
    }

def create_company_info_table_query(data):
    """Build the CREATE TABLE statement for the CompanyInformation table.

    ``cik`` is always the TEXT primary key. Every other key of ``data``
    becomes a TEXT column, in iteration order. Column names are emitted as
    double-quoted SQL identifiers (embedded quotes doubled) so that keys
    which are SQL keywords or contain unusual characters still produce a
    valid statement.

    Args:
        data: Mapping whose keys name the columns; the values are not used.

    Returns:
        The SQL statement as a string.
    """
    # Collect all definitions in one list so that a mapping containing only
    # "cik" does not leave a dangling comma in the statement.
    column_defs = ["cik TEXT PRIMARY KEY"]
    column_defs.extend(
        '"' + key.replace('"', '""') + '" TEXT' for key in data if key != "cik"
    )
    query = f"CREATE TABLE IF NOT EXISTS CompanyInformation ({', '.join(column_defs)})"
    return query

def insert_into_company_info_table_query(data):
    """Build the parameterised upsert statement for CompanyInformation.

    The statement names every key of ``data`` as a column, in iteration
    order, and carries one ``?`` placeholder per key. Column names are
    emitted as double-quoted SQL identifiers (embedded quotes doubled) so
    that keys which are SQL keywords or contain unusual characters still
    produce a valid statement.

    Args:
        data: Mapping whose keys name the columns; the values are not used
            here and are expected to be bound by the caller at execute time.

    Returns:
        The SQL statement as a string.
    """
    column_names = ", ".join('"' + key.replace('"', '""') + '"' for key in data)
    placeholders = ", ".join("?" * len(data))
    query = f"INSERT OR REPLACE INTO CompanyInformation ({column_names}) VALUES ({placeholders})"
    return query

def create_company_filings_table_query(data):
    """Build the CREATE TABLE statement for the CompanyFilings table.

    ``accessionNumber`` is always the TEXT primary key. Every key of
    ``data`` other than ``cik`` and ``accessionNumber`` becomes a TEXT
    column, followed by the fixed ``filepath`` and ``cik`` columns and a
    foreign key from ``cik`` to ``CompanyInformation(cik)``. Data-derived
    column names are emitted as double-quoted SQL identifiers (embedded
    quotes doubled) so that keys which are SQL keywords or contain unusual
    characters still produce a valid statement.

    Args:
        data: Mapping whose keys name the columns; the values are not used.

    Returns:
        The SQL statement as a string.
    """
    # Collect all definitions in one list so that a mapping without extra
    # keys does not produce an empty ", ," segment in the statement.
    column_defs = ["accessionNumber TEXT PRIMARY KEY"]
    column_defs.extend(
        '"' + key.replace('"', '""') + '" TEXT'
        for key in data
        if key not in ("cik", "accessionNumber")
    )
    column_defs.extend(
        [
            "filepath TEXT",
            "cik TEXT",
            "FOREIGN KEY (cik) REFERENCES CompanyInformation(cik)",
        ]
    )
    query = f"CREATE TABLE IF NOT EXISTS CompanyFilings ({', '.join(column_defs)})"
    return query

def insert_into_company_filings_table_query(data):
    """Build the parameterised upsert statement for CompanyFilings.

    The statement names every key of ``data`` as a column, in iteration
    order, followed by the fixed ``filepath`` and ``cik`` columns, and
    carries one ``?`` placeholder per column. Data-derived column names are
    emitted as double-quoted SQL identifiers (embedded quotes doubled) so
    that keys which are SQL keywords or contain unusual characters still
    produce a valid statement.

    Args:
        data: Mapping whose keys name the columns; the values are not used
            here and are expected to be bound by the caller at execute time.

    Returns:
        The SQL statement as a string.
    """
    column_names = ['"' + key.replace('"', '""') + '"' for key in data]
    column_names += ["filepath", "cik"]
    placeholders = ", ".join("?" * len(column_names))
    query = f"INSERT OR REPLACE INTO CompanyFilings ({', '.join(column_names)}) VALUES ({placeholders})"
    return query

# Alternative source for `ciks` (currently disabled in favour of Dummy_Data):
#cik_array = doGet('https://www.sec.gov/Archives/edgar/cik-lookup-data.txt').content
#ciks = set(cik_array.decode('utf-8', 'ignore').split(":")[1::2])

# For every CIK: download its submissions JSON, upsert one row into
# CompanyInformation and one row per recent filing into CompanyFilings.
# Each table is created once, with columns taken from the keys of the first
# valid response.
company_info_table_created = False
company_filings_table_created = False
for cik in ciks:
    print(f"Currently working on CIK {cik}")
    company_data = doGet(f"https://data.sec.gov/submissions/CIK{cik}.json").content.decode("utf-8")

    # Anything that does not look like a JSON object is reported and skipped.
    # startswith() also copes with an empty body, where indexing [0] would
    # raise IndexError.
    if not company_data.startswith("{"):
        print("Invalid CIK")
        continue

    company_data = json.loads(company_data)

    # Company Table
    company_information_no_filings = simplify_json(company_data, remove_filings=True)

    if not company_info_table_created:
        create_company_info_table_sql = create_company_info_table_query(company_information_no_filings)
        print(create_company_info_table_sql)
        cursor.execute(create_company_info_table_sql)

        company_info_table_created = True

    insert_into_company_info_table_sql = insert_into_company_info_table_query(company_information_no_filings)
    company_information_values = list(company_information_no_filings.values())
    print(insert_into_company_info_table_sql, company_information_values)
    cursor.execute(insert_into_company_info_table_sql, company_information_values)

    # Filings Table
    company_information_filings = simplify_json(company_data, remove_filings=False)

    if not company_filings_table_created:
        create_company_filings_table_sql = create_company_filings_table_query(company_information_filings)
        print(create_company_filings_table_sql)
        cursor.execute(create_company_filings_table_sql)

        # Set the filings flag (not the company-info flag) so the CREATE
        # statement is not rebuilt and re-run for every CIK.
        company_filings_table_created = True

    insert_into_company_filings_table_sql = insert_into_company_filings_table_query(company_information_filings)

    # Distinct form types of this company, in first-seen order.
    all_filing_forms = list(dict.fromkeys(company_information_filings.get("form", [])))

    # The filings data is column-oriented (one list per key); build one row
    # per accession number by taking the same index from every column.
    filing_keys = list(company_information_filings)
    for index, accession_number in enumerate(company_information_filings["accessionNumber"]):
        company_filings_values = [
            accession_number
            if key == "accessionNumber"
            else str(company_information_filings[key][index])
            for key in filing_keys
        ]

        # The archive folder name is the accession number without dashes.
        accession_number_no_dash = accession_number.replace("-", "")
        filepath = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_number_no_dash}/{accession_number}.txt"
        company_filings_values.append(filepath)
        company_filings_values.append(cik)

        print(insert_into_company_filings_table_sql, company_filings_values)
        cursor.execute(insert_into_company_filings_table_sql, company_filings_values)
    print()
    # Commit once per CIK so rows stored for earlier companies survive a
    # failure on a later one.
    conn.commit()

# Create one view per distinct form type stored in CompanyFilings, named
# "view" + the form type with "-", " " and "/" replaced by "_".
try:
    # Let SQLite de-duplicate the form types by column name instead of
    # fetching every row and picking the column by position.
    cursor.execute("SELECT DISTINCT form FROM CompanyFilings")
    all_form_forms = [row[0] for row in cursor.fetchall() if row[0] is not None]

    for form in all_form_forms:
        view_name = "view" + form.replace("-", "_").replace(" ", "_").replace("/", "_")
        # Quote the view name as an identifier so any remaining special
        # characters cannot break the statement.
        quoted_view_name = '"' + view_name.replace('"', '""') + '"'
        # CREATE VIEW cannot take bound parameters, so the form type is
        # embedded as a single-quoted string literal with quotes doubled.
        # Double quotes would make SQL try to read it as a column name first.
        form_literal = "'" + form.replace("'", "''") + "'"
        sql_call = f"""CREATE VIEW IF NOT EXISTS {quoted_view_name} AS
            SELECT * FROM CompanyFilings
            WHERE form = {form_literal}"""
        cursor.execute(sql_call)

    conn.commit()
finally:
    # Release the database file even if one of the statements above failed.
    conn.close()