In [6]:
import os
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import re

In [12]:
# Root directory of your data
root_dir = '/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/Files'

# Store extracted records
records = []

# Traverse all year folders with tqdm
for year in tqdm(os.listdir(root_dir), desc="Years"):
    year_path = os.path.join(root_dir, year)
    if not os.path.isdir(year_path):
        continue

    # Traverse quarter folders
    for quarter in os.listdir(year_path):
        quarter_path = os.path.join(year_path, quarter)
        if not os.path.isdir(quarter_path):
            continue

        # Traverse all files in quarter folder
        for filename in os.listdir(quarter_path):
            # Match only pure 10-K files (not 10-K/A or 10-K-A)
            if (
                '10-K' in filename.upper() and 
                '10-K/A' not in filename.upper() and 
                '10-K-A' not in filename.upper() and 
                filename.lower().endswith('.txt')
            ):
                try:
                    file_path = os.path.join(quarter_path, filename)

                    # Read file content
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                        text = file.read()

                    # Split filename into parts
                    parts = filename.split('_')
                    filing_date = parts[0]
                    filing_type = parts[1]
                    cik = parts[3]
                    accession = parts[4].replace('.txt', '')

                    # Parse date
                    date_obj = datetime.strptime(filing_date, "%Y%m%d")
                    year_parsed = date_obj.year
                    month = date_obj.month
                    day = date_obj.day

                    # Append record
                    records.append({
                        'year_folder': year,
                        'quarter_folder': quarter,
                        'filing_date': filing_date,
                        'year': year_parsed,
                        'month': month,
                        'day': day,
                        'filing_type': filing_type,
                        'cik': cik,
                        'accession': accession,
                        'filename': filename,
                        'text': text
                    })

                except Exception as e:
                    print(f" Error reading {file_path}: {e}")

Years: 100%|██████████████████████████████████████| 5/5 [00:42<00:00,  8.43s/it]


In [13]:
df_10k_final = pd.DataFrame(records)

In [16]:
df_10k_final.columns

Index(['year_folder', 'quarter_folder', 'filing_date', 'year', 'month', 'day',
       'filing_type', 'cik', 'accession', 'filename', 'text'],
      dtype='object')

In [15]:
df_10k_final.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...


In [17]:
df_10k_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_folder     29302 non-null  object
 1   quarter_folder  29302 non-null  object
 2   filing_date     29302 non-null  object
 3   year            29302 non-null  int64 
 4   month           29302 non-null  int64 
 5   day             29302 non-null  int64 
 6   filing_type     29302 non-null  object
 7   cik             29302 non-null  object
 8   accession       29302 non-null  object
 9   filename        29302 non-null  object
 10  text            29302 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.5+ MB


In [20]:
df_10k_final.to_csv("10K_filings_all_years.csv", index=False)

In [19]:
pwd

'/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project'

In [3]:
import re

def clean_10k_text(text):
    if pd.isna(text):
        return ""

    # Remove HTML/XML tags (if any)
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove SEC header section if present
    text = re.sub(r'(?s)<SEC-Header>.*?</SEC-Header>', ' ', text)
    text = re.sub(r'(?s)<Header>.*?</Header>', ' ', text)

    # Remove file stats or metadata-like sections
    text = re.sub(r'(?i)file name.*?\.txt', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # Remove duplicate underscores, dashes, and asterisks
    text = re.sub(r'[_*=-]{2,}', ' ', text)

    # Collapse multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    # Strip leading/trailing spaces
    return text.strip()

In [7]:
df["cleaned_text"] = df["text"].apply(clean_10k_text)  #11mins

In [None]:
#df.to_csv("10K_filings_cleaned.csv", index=False)

In [None]:
df_1000 = df.head(1000)

In [None]:
display(df_1000)

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,2022,QTR4,20221031,2022,10,31,10-K,data,923120,20221031_10-K_edgar_data_923120_0000950170-22-...,<Header>\n<FileStats>\n <FileName>20221031_...,20221031_10-K_edgar_data_923120_0000950170-22-...
996,2022,QTR4,20221114,2022,11,14,10-K,data,1493594,20221114_10-K_edgar_data_1493594_0001493594-22...,<Header>\n<FileStats>\n <FileName>20221114_...,20221114_10-K_edgar_data_1493594_0001493594-22...
997,2022,QTR4,20221004,2022,10,4,10-K,data,790652,20221004_10-K_edgar_data_790652_0001213900-22-...,<Header>\n<FileStats>\n <FileName>20221004_...,20221004_10-K_edgar_data_790652_0001213900-22-...
998,2022,QTR4,20221222,2022,12,22,10-K,data,1802974,20221222_10-K_edgar_data_1802974_0001802974-22...,<Header>\n<FileStats>\n <FileName>20221222_...,20221222_10-K_edgar_data_1802974_0001802974-22...


In [7]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
import re

def extract_item(text, item_number):
    """
    Extracts section like ITEM 1A, ITEM 7 from 10-K text
    """
    pattern = rf'(ITEM\s+{item_number}[^\n]*)(.*?)(ITEM\s+\d+[A-Z]?)'
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(2).strip()
    return None

In [8]:
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

In [23]:
pip install torch torchvision torchaudio

18283.26s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install torch torchvision torchaudio

In [9]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [13]:
from transformers import pipeline

finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Device set to use mps:0


In [10]:
from collections import Counter

def analyze_sentiment(text):
    chunks = chunk_text(text)
    labels = []

    for chunk in chunks:
        result = finbert(chunk[:512])[0]  # Truncate to 512 tokens
        labels.append(result["label"])

    # Aggregate: count the most common sentiment
    count = Counter(labels)
    dominant_sentiment = count.most_common(1)[0][0]
    
    return dominant_sentiment, dict(count)

In [11]:
df_1000 = df.head(1000)

In [14]:
# Assuming df_1000 contains a 'cleaned_text' column
df_1000["sentiment_result"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[0])
df_1000["sentiment_breakdown"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1000["sentiment_result"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1000["sentiment_breakdown"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[1])


In [16]:
df_1000.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,sentiment_result,sentiment_breakdown
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,neutral,"{'neutral': 72, 'positive': 11, 'negative': 16}"
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,neutral,"{'neutral': 27, 'positive': 2, 'negative': 3}"
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,neutral,"{'neutral': 101, 'positive': 4, 'negative': 9}"
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,neutral,"{'neutral': 144, 'positive': 13, 'negative': 14}"
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,neutral,"{'neutral': 102, 'positive': 6, 'negative': 9}"


In [17]:
df_1000.to_csv("10K_with_sentiment.csv", index=False)
#df_1000[["file_name", "sentiment_result", "sentiment_breakdown"]].head()

## Using LM lexicon

In [1]:
import pandas as pd
df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/10K_filings_all_years.csv")  #5m 33sec

In [4]:
df.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...


In [3]:
def basic_cleaner(text):
    if pd.isna(text):
        return ""

    # Remove HTML/XML tags
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove SEC headers
    text = re.sub(r'(?s)<SEC-Header>.*?</SEC-Header>', ' ', text)
    text = re.sub(r'(?s)<Header>.*?</Header>', ' ', text)

    # Remove file meta like "file name xyz.txt"
    text = re.sub(r'(?i)file name.*?\.txt', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # Remove excessive punctuation and underscores
    text = re.sub(r'[_*=-]{2,}', ' ', text)

    # Collapse multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

In [None]:

# Apply initial cleaning
df["cleaned_text"] = df["text"].apply(basic_cleaner) #10min 32.4

In [10]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Download stopwords
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nareshchethala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Final preprocessing for LM-based sentiment
def clean_for_lm_modeling(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = text.encode("ascii", "ignore").decode()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\d+", " ", text)
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

# Apply to cleaned_text
df["normalized_text"] = df["cleaned_text"].apply(clean_for_lm_modeling)

In [13]:
df.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,normalized_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...


In [14]:
df_1000 = df.head(1000).copy()

In [15]:
df_1000.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,normalized_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...


In [16]:
lm_df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/Loughran-McDonald_MasterDictionary_1993-2024.csv")

In [18]:
# Load dictionary and lowercase words
lm_df["Word"] = lm_df["Word"].str.lower()

In [19]:
display(lm_df.head(10))

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,aardvark,1,755,2.95507e-08,1.945421e-08,4.078069e-06,140,0,0,0,0,0,0,0,0,2,12of12inf
1,aardvarks,2,3,1.1742e-10,8.060019e-12,8.919011e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,abaci,3,9,3.5226e-10,1.089343e-10,5.105359e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,aback,4,29,1.13506e-09,6.197922e-10,1.539279e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,abacus,5,9620,3.765268e-07,3.825261e-07,3.421836e-05,1295,0,0,0,0,0,0,0,0,3,12of12inf
5,abacuses,6,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,4,12of12inf
6,abaft,7,4,1.5656e-10,2.144787e-11,2.373367e-08,1,0,0,0,0,0,0,0,0,2,12of12inf
7,abalone,8,149,5.83186e-09,4.729504e-09,1.031859e-06,52,0,0,0,0,0,0,0,0,4,12of12inf
8,abalones,9,1,3.914e-11,7.715206e-11,8.537449e-08,1,0,0,0,0,0,0,0,0,4,12of12inf
9,abandon,10,154158,6.033745e-06,4.824004e-06,3.261271e-05,76324,2009,0,0,0,0,0,0,0,3,12of12inf


In [None]:
# Mapping column names to simplified category names
category_columns = {
    "positive": "Positive",
    "negative": "Negative",
    "uncertainty": "Uncertainty",
    "litigious": "Litigious",
    "strong_modal": "Strong_Modal",
    "weak_modal": "Weak_Modal",
    "constraining": "Constraining"
}

# Create a dictionary of category sets
lm_lexicons = {
    category: set(lm_df[lm_df[col] > 0]["Word"].str.lower())
    for category, col in category_columns.items()
}

In [21]:
import re
from collections import defaultdict

def lm_sentiment_all_classes(text):
    words = re.findall(r'\b\w+\b', text.lower())
    counts = defaultdict(int)

    for word in words:
        for category, word_set in lm_lexicons.items():
            if word in word_set:
                counts[category] += 1

    return dict(counts)

In [22]:
df_1000["lm_sentiment_breakdown"] = df_1000["normalized_text"].apply(lm_sentiment_all_classes)

In [23]:
df_1000.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,normalized_text,lm_sentiment_breakdown
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'litigious': 567, 'constraining': 288, 'uncer..."
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'litigious': 126, 'constraining': 55, 'negati..."
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'litigious': 578, 'constraining': 443, 'uncer..."
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'litigious': 1775, 'constraining': 584, 'unce..."
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'litigious': 937, 'constraining': 344, 'negat..."


In [26]:
df_1000[["filename", "lm_sentiment_breakdown"]].head(10)

Unnamed: 0,filename,lm_sentiment_breakdown
0,20220520_10-K_edgar_data_849399_0000849399-22-...,"{'litigious': 567, 'constraining': 288, 'uncer..."
1,20220613_10-K_edgar_data_1857910_0001599916-22...,"{'litigious': 126, 'constraining': 55, 'negati..."
2,20220414_10-K_edgar_data_1855751_0001493152-22...,"{'litigious': 578, 'constraining': 443, 'uncer..."
3,20220404_10-K_edgar_data_748790_0001575872-22-...,"{'litigious': 1775, 'constraining': 584, 'unce..."
4,20220408_10-K_edgar_data_1512927_0001410578-22...,"{'litigious': 937, 'constraining': 344, 'negat..."
5,20220401_10-K_edgar_data_1511820_0001493152-22...,"{'litigious': 148, 'constraining': 80, 'negati..."
6,20220429_10-K_edgar_data_1676047_0001213900-22...,"{'litigious': 483, 'constraining': 348, 'negat..."
7,20220414_10-K_edgar_data_1637866_0001493152-22...,"{'litigious': 501, 'negative': 633, 'constrain..."
8,20220413_10-K_edgar_data_1868269_0001104659-22...,"{'litigious': 927, 'constraining': 649, 'uncer..."
9,20220412_10-K_edgar_data_1652958_0001683168-22...,"{'litigious': 337, 'constraining': 167, 'uncer..."


In [27]:
df_1000.to_csv("10K_with_lm_sentiment_1000.csv", index=False)

In [28]:
def dominant_lm_category(counts_dict):
    if not counts_dict:
        return "NEUTRAL"
    return max(counts_dict.items(), key=lambda x: x[1])[0].upper()

df_1000["lm_dominant_sentiment"] = df_1000["lm_sentiment_breakdown"].apply(dominant_lm_category)

In [29]:
df_1000[["filename", "lm_dominant_sentiment"]].head(10)

Unnamed: 0,filename,lm_dominant_sentiment
0,20220520_10-K_edgar_data_849399_0000849399-22-...,NEGATIVE
1,20220613_10-K_edgar_data_1857910_0001599916-22...,NEGATIVE
2,20220414_10-K_edgar_data_1855751_0001493152-22...,UNCERTAINTY
3,20220404_10-K_edgar_data_748790_0001575872-22-...,LITIGIOUS
4,20220408_10-K_edgar_data_1512927_0001410578-22...,LITIGIOUS
5,20220401_10-K_edgar_data_1511820_0001493152-22...,NEGATIVE
6,20220429_10-K_edgar_data_1676047_0001213900-22...,NEGATIVE
7,20220414_10-K_edgar_data_1637866_0001493152-22...,NEGATIVE
8,20220413_10-K_edgar_data_1868269_0001104659-22...,UNCERTAINTY
9,20220412_10-K_edgar_data_1652958_0001683168-22...,NEGATIVE
