In [1]:
import os
import pandas as pd
from datetime import datetime
from tqdm import tqdm

In [12]:
# Root directory of your data
root_dir = '/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/Files'

# Store extracted records
records = []

# Traverse all year folders with tqdm
for year in tqdm(os.listdir(root_dir), desc="Years"):
    year_path = os.path.join(root_dir, year)
    if not os.path.isdir(year_path):
        continue

    # Traverse quarter folders
    for quarter in os.listdir(year_path):
        quarter_path = os.path.join(year_path, quarter)
        if not os.path.isdir(quarter_path):
            continue

        # Traverse all files in quarter folder
        for filename in os.listdir(quarter_path):
            # Match only pure 10-K files (not 10-K/A or 10-K-A)
            if (
                '10-K' in filename.upper() and 
                '10-K/A' not in filename.upper() and 
                '10-K-A' not in filename.upper() and 
                filename.lower().endswith('.txt')
            ):
                try:
                    file_path = os.path.join(quarter_path, filename)

                    # Read file content
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                        text = file.read()

                    # Split filename into parts
                    parts = filename.split('_')
                    filing_date = parts[0]
                    filing_type = parts[1]
                    cik = parts[3]
                    accession = parts[4].replace('.txt', '')

                    # Parse date
                    date_obj = datetime.strptime(filing_date, "%Y%m%d")
                    year_parsed = date_obj.year
                    month = date_obj.month
                    day = date_obj.day

                    # Append record
                    records.append({
                        'year_folder': year,
                        'quarter_folder': quarter,
                        'filing_date': filing_date,
                        'year': year_parsed,
                        'month': month,
                        'day': day,
                        'filing_type': filing_type,
                        'cik': cik,
                        'accession': accession,
                        'filename': filename,
                        'text': text
                    })

                except Exception as e:
                    print(f" Error reading {file_path}: {e}")

Years: 100%|██████████████████████████████████████| 5/5 [00:42<00:00,  8.43s/it]


In [13]:
df_10k_final = pd.DataFrame(records)

In [16]:
df_10k_final.columns

Index(['year_folder', 'quarter_folder', 'filing_date', 'year', 'month', 'day',
       'filing_type', 'cik', 'accession', 'filename', 'text'],
      dtype='object')

In [15]:
df_10k_final.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...


In [17]:
df_10k_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_folder     29302 non-null  object
 1   quarter_folder  29302 non-null  object
 2   filing_date     29302 non-null  object
 3   year            29302 non-null  int64 
 4   month           29302 non-null  int64 
 5   day             29302 non-null  int64 
 6   filing_type     29302 non-null  object
 7   cik             29302 non-null  object
 8   accession       29302 non-null  object
 9   filename        29302 non-null  object
 10  text            29302 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.5+ MB


In [20]:
df_10k_final.to_csv("10K_filings_all_years.csv", index=False)

In [19]:
pwd

'/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project'

In [2]:
import pandas as pd
df = pd.read_csv("10K_filings_all_years.csv")

In [3]:
import re

def clean_10k_text(text):
    if pd.isna(text):
        return ""

    # Remove HTML/XML tags (if any)
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove SEC header section if present
    text = re.sub(r'(?s)<SEC-Header>.*?</SEC-Header>', ' ', text)
    text = re.sub(r'(?s)<Header>.*?</Header>', ' ', text)

    # Remove file stats or metadata-like sections
    text = re.sub(r'(?i)file name.*?\.txt', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # Remove duplicate underscores, dashes, and asterisks
    text = re.sub(r'[_*=-]{2,}', ' ', text)

    # Collapse multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    # Strip leading/trailing spaces
    return text.strip()

In [4]:
df["cleaned_text"] = df["text"].apply(clean_10k_text)

In [5]:
df.head(1000)

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2022,QTR4,20221031,2022,10,31,10-K,data,923120,20221031_10-K_edgar_data_923120_0000950170-22-...,<Header>\n<FileStats>\n <FileName>20221031_...,20221031_10-K_edgar_data_923120_0000950170-22-...
996,2022,QTR4,20221114,2022,11,14,10-K,data,1493594,20221114_10-K_edgar_data_1493594_0001493594-22...,<Header>\n<FileStats>\n <FileName>20221114_...,20221114_10-K_edgar_data_1493594_0001493594-22...
997,2022,QTR4,20221004,2022,10,4,10-K,data,790652,20221004_10-K_edgar_data_790652_0001213900-22-...,<Header>\n<FileStats>\n <FileName>20221004_...,20221004_10-K_edgar_data_790652_0001213900-22-...
998,2022,QTR4,20221222,2022,12,22,10-K,data,1802974,20221222_10-K_edgar_data_1802974_0001802974-22...,<Header>\n<FileStats>\n <FileName>20221222_...,20221222_10-K_edgar_data_1802974_0001802974-22...


In [7]:
df.to_csv("10K_filings_cleaned.csv", index=False)

In [8]:
df_1000 = df.head(1000)

In [9]:
display(df_1000)

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2022,QTR4,20221031,2022,10,31,10-K,data,923120,20221031_10-K_edgar_data_923120_0000950170-22-...,<Header>\n<FileStats>\n <FileName>20221031_...,20221031_10-K_edgar_data_923120_0000950170-22-...
996,2022,QTR4,20221114,2022,11,14,10-K,data,1493594,20221114_10-K_edgar_data_1493594_0001493594-22...,<Header>\n<FileStats>\n <FileName>20221114_...,20221114_10-K_edgar_data_1493594_0001493594-22...
997,2022,QTR4,20221004,2022,10,4,10-K,data,790652,20221004_10-K_edgar_data_790652_0001213900-22-...,<Header>\n<FileStats>\n <FileName>20221004_...,20221004_10-K_edgar_data_790652_0001213900-22-...
998,2022,QTR4,20221222,2022,12,22,10-K,data,1802974,20221222_10-K_edgar_data_1802974_0001802974-22...,<Header>\n<FileStats>\n <FileName>20221222_...,20221222_10-K_edgar_data_1802974_0001802974-22...


In [10]:
import re

def extract_item(text, item_number):
    """
    Extracts section like ITEM 1A, ITEM 7 from 10-K text
    """
    pattern = rf'(ITEM\s+{item_number}[^\n]*)(.*?)(ITEM\s+\d+[A-Z]?)'
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(2).strip()
    return None

In [11]:
df_1000["item_1a_risks"] = df_1000["cleaned_text"].apply(lambda x: extract_item(x, "1A"))
df_1000["item_7_mdna"]   = df_1000["cleaned_text"].apply(lambda x: extract_item(x, "7"))

KeyboardInterrupt: 

In [None]:
display(df_1000)

In [None]:
import pandas as pd
df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/10K_filings_cleaned.csv")