In [6]:
import os
import pandas as pd
from datetime import datetime
from tqdm import tqdm
import re

## Download .idx master files in between required Time Period

In [2]:
import requests
import os
import time

# Base URL for the SEC EDGAR full index
base_url = 'https://www.sec.gov/Archives/edgar/full-index/'

# Function to download the file, now includes headers parameter
def download_file(url, path, headers):
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

# Prompt for start and end year
start_year = int(input("Enter the start year (YYYY): "))
end_year = int(input("Enter the end year (YYYY): "))
save_dir = input('Please Input Path to Your Directory to Download Files:')

# Add your user-agent string here
headers = {'User-Agent': 'useremail@email.com'}

# Iterate over each year and quarter within the specified range
for year in range(start_year, end_year + 1):
    for quarter in ['QTR1', 'QTR2', 'QTR3', 'QTR4']:
        file_url = f"{base_url}{year}/{quarter}/company.idx"
        save_path = os.path.join(save_dir, f"{year}_{quarter}_company.idx")

        print(f"Attempting to download {file_url}...")

        # Make the download attempt
        try:
            download_file(file_url, save_path, headers)
            print(f"Successfully downloaded {file_url}")
        except Exception as e:
            print(f"Failed to download {file_url}. Error: {e}")

        # Respect the SEC's rate limiting
        time.sleep(1)  # Sleep for 1 second to avoid hitting rate limit

print("All requested files have been attempted to download.")

Attempting to download https://www.sec.gov/Archives/edgar/full-index/2013/QTR1/company.idx...
Successfully downloaded https://www.sec.gov/Archives/edgar/full-index/2013/QTR1/company.idx
Attempting to download https://www.sec.gov/Archives/edgar/full-index/2013/QTR2/company.idx...
Successfully downloaded https://www.sec.gov/Archives/edgar/full-index/2013/QTR2/company.idx
Attempting to download https://www.sec.gov/Archives/edgar/full-index/2013/QTR3/company.idx...
Successfully downloaded https://www.sec.gov/Archives/edgar/full-index/2013/QTR3/company.idx
Attempting to download https://www.sec.gov/Archives/edgar/full-index/2013/QTR4/company.idx...
Successfully downloaded https://www.sec.gov/Archives/edgar/full-index/2013/QTR4/company.idx
Attempting to download https://www.sec.gov/Archives/edgar/full-index/2014/QTR1/company.idx...
Successfully downloaded https://www.sec.gov/Archives/edgar/full-index/2014/QTR1/company.idx
Attempting to download https://www.sec.gov/Archives/edgar/full-index/2

## Combine all the .idx files into a single dataframe/.csv file

In [3]:
import pandas as pd
import os

# Adjusting pandas display options for more optimized data viewing
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.width', None)  # Automatically adjust display width to terminal size
pd.set_option('display.max_colwidth', None)  # Display full content of each cell

#Load data from all EDGAR index files in the specified director
def load_data_from_directory(source_dir):
    colspecs = [(0, 62), (62, 74), (74, 86), (86, 98), (98, None)]
    column_names = ['Company Name', 'Form Type', 'CIK', 'Date Filed', 'Filename']
    dataframe_collection = []

    # Iterate over each file in the directory
    for file_name in os.listdir(source_dir):
        if file_name.endswith('.idx'):  # Check for .idx files
            file_path = os.path.join(source_dir, file_name)
            try:
                # Read fixed-width file with specified columns and skip header rows
                temp_df = pd.read_fwf(file_path, colspecs=colspecs, skiprows=9, names=column_names)
                dataframe_collection.append(temp_df)
            except UnicodeDecodeError as e:
                print(f'Error reading {file_name}: {e}')
                continue
            except Exception as e:
                print(f'An unexpected error occurred while reading {file_name}: {e}')
                continue

    if not dataframe_collection:
        print("No data was loaded. Please check your file paths and names.")
        return pd.DataFrame()

    # Concatenate all DataFrames into one DataFrame
    combined_df = pd.concat(dataframe_collection, ignore_index=True)
    combined_df.columns = combined_df.columns.str.strip()  # Strip any leading/trailing whitespace from column names
    return combined_df

def save_to_csv(df, output_path):
    """Save DataFrame to a CSV file."""
    try:
        df.to_csv(output_path, index=False)
        print(f"Data saved successfully to {output_path}")
    except Exception as e:
        print(f"Failed to save the DataFrame: {e}")

# Main execution logic
if __name__ == "__main__":
    source_directory = input('Enter/path/to/data/directory: ')  # Get directory containing the data files from user
    csv_name = input('Enter the filename for the CSV (e.g., combined_data.csv): ')
    output_path = os.path.join(source_directory, csv_name)  # Construct the full path to save the CSV file

    # Load data from the specified directory
    all_data_df = load_data_from_directory(source_directory)

    # Save the data to a CSV file
    if not all_data_df.empty:
        save_to_csv(all_data_df, output_path)
    else:
        print("No data to save.")

all_data_df.head()

Data saved successfully to /Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/combined_data.csv


Unnamed: 0,Company Name,Form Type,CIK,Date Filed,Filename
0,--------------------------------------------------------------,------------,------------,------------,-------------------------------------------
1,1 800 FLOWERS COM INC,10-K,1084869,2020-09,-11 edgar/data/1084869/0001437749-20-019622.txt
2,1 800 FLOWERS COM INC,3,1084869,2020-07,-02 edgar/data/1084869/0001437749-20-014500.txt
3,1 800 FLOWERS COM INC,4,1084869,2020-09,-02 edgar/data/1084869/0001437749-20-019189.txt
4,1 800 FLOWERS COM INC,4,1084869,2020-09,-02 edgar/data/1084869/0001437749-20-019193.txt


## Load the combined data csv file

In [15]:
output_path = "/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/combined_data.csv" # No need to run this if you are running the code in one go
all_data_df = pd.read_csv(output_path) # If running seperately wihtout the download or comined data then replace output_path with the path to the csv file

  all_data_df = pd.read_csv(output_path) # If running seperately wihtout the download or comined data then replace output_path with the path to the csv file


## To extract the html content from the company filings

In [16]:
#This one works
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_filing_html_directly(row, user_agent_email):
    """
    Extracts the actual 10-K filing HTML content from a row in .idx using the real HTML URL.
    """
    try:
        filename = row['Filename'].strip().replace(" ", "")
        path_parts = filename.split("/")

        if len(path_parts) < 4:
            print(f"Invalid path in Filename: {filename}")
            return None, None

        cik = path_parts[2]
        accession_with_dashes = path_parts[3]
        accession_nodashes = accession_with_dashes.replace("-", "")
        index_filename = accession_with_dashes + "-index.htm"

        index_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{accession_nodashes}/{index_filename}"
        headers = {"User-Agent": user_agent_email}

        response = requests.get(index_url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"Failed to load index page: {index_url}")
            return None, None

        soup = BeautifulSoup(response.text, "html.parser")
        doc_table = soup.find("table", class_="tableFile")
        if doc_table is None:
            print(f"Could not find document table at: {index_url}")
            return None, None

        doc_link_tag = doc_table.find("a", href=lambda href: href and href.endswith(".htm") and not href.endswith("-index.htm"))
        if doc_link_tag is None:
            print(f"No .htm filing document found in index page: {index_url}")
            return None, None

        primary_doc = doc_link_tag['href'].lstrip("/")  # remove leading slash
        filing_url = f"https://www.sec.gov/{primary_doc}"  # FIXED — no double Archives

        filing_response = requests.get(filing_url, headers=headers, timeout=15)
        if filing_response.status_code == 200:
            print(f"Downloaded: {filing_url}")
            return filing_url, filing_response.text
        else:
            print(f"Failed to download filing from: {filing_url}")
            return filing_url, None

    except Exception as e:
        print(f"Exception occurred: {e}")
        return None, None

In [None]:
# def download_multiple_10k_filings(df, user_agent_email):
#     """
#     Show how many 10-Ks are available, let the user choose how many to download,
#     and return a DataFrame with filing metadata and text.
#     """
#     # Step 1: Filter for 10-Ks
#     tenk_df = df[df['Form Type'].str.upper() == '10-K'].reset_index(drop=True)
#     total = len(tenk_df)

#     if total == 0:
#         print("No 10-K filings found in the dataset.")
#         return pd.DataFrame()

#     print(f"Found {total} 10-K filings in the dataset.")
    
#     # Step 2: Ask user for how many to download
#     while True:
#         try:
#             limit = int(input(f"Enter the number of 10-K filings to download (1 to {total}): "))
#             if 1 <= limit <= total:
#                 break
#             else:
#                 print(f"Please enter a number between 1 and {total}.")
#         except ValueError:
#             print("Please enter a valid integer.")

#     # Step 3: Download filings
#     results = []
#     for idx, row in tenk_df.head(limit).iterrows():
#         url, html_text = extract_filing_html_directly(row, user_agent_email)
#         if html_text:
#             results.append({
#                 "Company Name": row['Company Name'],
#                 "CIK": row['CIK'],
#                 "Date Filed": row['Date Filed'],
#                 "Filing URL": url,
#                 "Filing Text": html_text
#             })

#     return pd.DataFrame(results)

In [None]:
# df = input("Please input the dataframe name if loaded already:")

# # Downloading mulitple 10K filings
# filings_df = download_multiple_10k_filings(df, "nareshchandra.chethala@gmail.com")

# # Preview
# print(filings_df[['Company Name', 'Filing URL']].head())

In [None]:
# sample_row = df[df['Form Type'] == '10-K'].iloc[0]
# url, html_text = extract_filing_html_directly(sample_row, "nareshchandra.chethala@gmail.com")

# if html_text:
#     print(" Filing text preview:\n")
#     print(html_text[:1000])
# else:
#     print("No filing text returned.")

In [None]:
# print(html_text)

In [17]:
from bs4 import BeautifulSoup

def clean_filing_html(filing_html):
    """
    Cleans the full HTML of a 10-K filing to extract readable plain text.
    Removes scripts, styles, and unnecessary whitespace.
    """
    try:
        soup = BeautifulSoup(filing_html, "html.parser")

        # Remove unwanted tags
        for tag in soup(["script", "style", "header", "footer", "nav", "noscript"]):
            tag.decompose()

        # Extract text from the body if present
        body = soup.find("body")
        raw_text = body.get_text(separator="\n") if body else soup.get_text(separator="\n")

        # Normalize whitespace
        lines = [line.strip() for line in raw_text.splitlines()]
        clean_text = "\n".join(line for line in lines if line)

        return clean_text

    except Exception as e:
        print(f" Error cleaning HTML: {e}")
        return ""

In [None]:
# clean_text = clean_filing_html(html_text)
# print(" Cleaned text preview:\n")
# print(clean_text)

In [None]:
# print(len(clean_text))

In [18]:
def download_multiple_10k_filings(df, user_agent_email):
    """
    Show how many 10-Ks are available, let the user choose how many to download,
    and return a DataFrame with filing metadata and text.
    """
    tenk_df = df[df['Form Type'].str.upper() == '10-K'].reset_index(drop=True)
    total = len(tenk_df)

    if total == 0:
        print("No 10-K filings found in the dataset.")
        return pd.DataFrame()

    print(f"🔍 Found {total} 10-K filings in the dataset.")
    
    while True:
        try:
            limit = int(input(f"Enter the number of 10-K filings to download (1 to {total}): "))
            if 1 <= limit <= total:
                break
            else:
                print(f"⚠️ Please enter a number between 1 and {total}.")
        except ValueError:
            print("⚠️ Please enter a valid integer.")

    results = []
    for idx, row in tenk_df.head(limit).iterrows():
        url, html_text = extract_filing_html_directly(row, user_agent_email)
        if html_text:
            cleaned_text = clean_filing_html(html_text)
            results.append({
                "Company Name": row['Company Name'],
                "CIK": row['CIK'],
                "Date Filed": row['Date Filed'],
                "Filing URL": url,
                "Filing Text": html_text,
                "Cleaned Text": cleaned_text
            })

    return pd.DataFrame(results)

In [20]:
df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/combined_data.csv")

# Run the full process
filings_df = download_multiple_10k_filings(df, "nareshchandra.chethala@gmail.com")

# Preview results
print(filings_df[["Company Name", "Filing URL"]].head())
print("\n Sample Cleaned Filing Text:\n")
print(filings_df["Cleaned Text"][0][:2000])

  df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/combined_data.csv")


🔍 Found 82134 10-K filings in the dataset.
Downloaded: https://www.sec.gov/Archives/edgar/data/1084869/000143774920019622/flws20200628b_10k.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/1667615/000107878220000695/f10k053120_10k.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/1753648/000149315220017860/form10-k.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/1591588/000156459020043316/amrk-10k_20200630.htm
Downloaded: https://www.sec.gov/ix?doc=/Archives/edgar/data/1750/000110465920085310/air-20200531x10k.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/1300938/000118518520001098/abcoenergy20191231_10k.htm
Downloaded: https://www.sec.gov/ix?doc=/Archives/edgar/data/1138723/000156459020041201/aray-10k_20200630.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/849401/000143774920015110/admt20200331_10k.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/828530/000155335020000885/adfk_10k.htm
Downloaded: https://www.sec.gov/Archives/edgar/data/352

In [21]:
filings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  50 non-null     object
 1   CIK           50 non-null     object
 2   Date Filed    50 non-null     object
 3   Filing URL    50 non-null     object
 4   Filing Text   50 non-null     object
 5   Cleaned Text  50 non-null     object
dtypes: object(6)
memory usage: 2.5+ KB


In [22]:
filings_df = filings_df.dropna()

In [23]:
filings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  50 non-null     object
 1   CIK           50 non-null     object
 2   Date Filed    50 non-null     object
 3   Filing URL    50 non-null     object
 4   Filing Text   50 non-null     object
 5   Cleaned Text  50 non-null     object
dtypes: object(6)
memory usage: 2.5+ KB


In [26]:
filings_df["Cleaned Text"].head(1)

0    Table of Contents\nUNITED STATES SECURITIES AND EXCHANGE COMMISSION\nWASHINGTON, D.C. 20549\nFORM 10-K\n☒     ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the fiscal year ended\nJune 28, 2020\nor\n☐     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nCommission File No.\n0-26841\n1-800-FLOWERS.COM, Inc.\n(Exact name of registrant as specified in its charter)\nDELAWARE\n(State or other jurisdiction of incorporation or organization)\n11-3117311\n(I.R.S. Employer Identification No.)\nOne Old Country Road, Carle Place, New York, 11514\n(Address of principal executive offices) (Zip code)\n(516) 237-6000\n(Registrant’s telephone number, including area code)\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading symbol(s)\nName of each exchange on which registered\nClass A common stock\nFLWS\nThe Nasdaq Stock Market\nSecurities registered pursuant to Section 12(g) of

In [24]:
#filings_df["Cleaned Text"].head(1)
print(filings_df["Cleaned Text"][0][:2000])

Table of Contents
UNITED STATES SECURITIES AND EXCHANGE COMMISSION
WASHINGTON, D.C. 20549
FORM 10-K
☒     ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the fiscal year ended
June 28, 2020
or
☐     TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
Commission File No.
0-26841
1-800-FLOWERS.COM, Inc.
(Exact name of registrant as specified in its charter)
DELAWARE
(State or other jurisdiction of incorporation or organization)
11-3117311
(I.R.S. Employer Identification No.)
One Old Country Road, Carle Place, New York, 11514
(Address of principal executive offices) (Zip code)
(516) 237-6000
(Registrant’s telephone number, including area code)
Securities registered pursuant to Section 12(b) of the Act:
Title of each class
Trading symbol(s)
Name of each exchange on which registered
Class A common stock
FLWS
The Nasdaq Stock Market
Securities registered pursuant to Section 12(g) of the Act: None
Indicate by check 

In [27]:
import pandas as pd
import re

def extract_10k_sections(text):
    """
    Extracts Item 1, Item 7, and Item 7A sections from the plain 10-K text.
    Returns a dictionary with each section's content.
    """
    if not isinstance(text, str) or len(text) < 100:
        return {"Item 1": "", "Item 7": "", "Item 7A": ""}
    
    patterns = {
        "Item 1": r"(item\s*1[\.: \-–—]*business)",
        "Item 7": r"(item\s*7[\.: \-–—]*management[’'`s]{0,2} discussion.*?)",
        "Item 7A": r"(item\s*7a[\.: \-–—]*quantitative.*?)"
    }

    matches = []
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            matches.append((key, match.start()))

    matches.sort(key=lambda x: x[1])
    
    sections = {}
    for i in range(len(matches)):
        section_name, start = matches[i]
        end = matches[i+1][1] if i + 1 < len(matches) else len(text)
        sections[section_name] = text[start:end].strip()

    return {
        "Item 1": sections.get("Item 1", ""),
        "Item 7": sections.get("Item 7", ""),
        "Item 7A": sections.get("Item 7A", "")
    }

In [28]:
# Assuming your DataFrame is named df and "Cleaned Text" column holds the plain text
sections_df = filings_df["Cleaned Text"].apply(extract_10k_sections).apply(pd.Series)

# Add the extracted sections as new columns
filings_df["Item 1 Text"] = sections_df["Item 1"]
filings_df["Item 7 Text"] = sections_df["Item 7"]
filings_df["Item 7A Text"] = sections_df["Item 7A"]

In [29]:
print(filings_df[["Item 1 Text", "Item 7 Text", "Item 7A Text"]].isnull().sum())
print(filings_df[["Item 1 Text", "Item 7 Text", "Item 7A Text"]].eq("").sum())

Item 1 Text     0
Item 7 Text     0
Item 7A Text    0
dtype: int64
Item 1 Text     30
Item 7 Text     25
Item 7A Text    30
dtype: int64


In [30]:
filings_df.describe()
filings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Company Name  50 non-null     object
 1   CIK           50 non-null     object
 2   Date Filed    50 non-null     object
 3   Filing URL    50 non-null     object
 4   Filing Text   50 non-null     object
 5   Cleaned Text  50 non-null     object
 6   Item 1 Text   50 non-null     object
 7   Item 7 Text   50 non-null     object
 8   Item 7A Text  50 non-null     object
dtypes: object(9)
memory usage: 3.6+ KB


In [31]:
df_fin = filings_df[["Company Name", "CIK", "Date Filed", "Filing URL", "Item 1 Text", "Item 7 Text", "Item 7A Text"]]

In [21]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [32]:
lm_df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/Loughran-McDonald_MasterDictionary_1993-2024.csv")

In [38]:
lm_df.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,AARDVARK,1,755,2.95507e-08,1.945421e-08,4.078069e-06,140,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.1742e-10,8.060019e-12,8.919011e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,3.5226e-10,1.089343e-10,5.105359e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,29,1.13506e-09,6.197922e-10,1.539279e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,9620,3.765268e-07,3.825261e-07,3.421836e-05,1295,0,0,0,0,0,0,0,0,3,12of12inf


In [37]:
# Filter only positive and negative words
positive_words = set(lm_df[lm_df["Positive"] > 0]["Word"].str.lower())
negative_words = set(lm_df[lm_df["Negative"] > 0]["Word"].str.lower())

In [26]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def compute_lm_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return 0.0

    tokens = word_tokenize(text.lower())
    tokens = [w for w in tokens if w.isalpha() and w not in stop_words]

    pos = sum(1 for word in tokens if word in positive_words)
    neg = sum(1 for word in tokens if word in negative_words)
    total = len(tokens)

    return (pos - neg) / total if total > 0 else 0.0

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nareshchethala/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nareshchethala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
filings_df["Item1_LM_Sentiment"] = filings_df["Item 1 Text"].apply(compute_lm_sentiment)
filings_df["Item7_LM_Sentiment"] = filings_df["Item 7 Text"].apply(compute_lm_sentiment)
filings_df["Item7A_LM_Sentiment"] = filings_df["Item 7A Text"].apply(compute_lm_sentiment)

In [31]:
filings_df.head()

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Filing Text,Cleaned Text,Item 1 Text,Item 7 Text,Item 7A Text,Item1_LM_Sentiment,Item7_LM_Sentiment,Item7A_LM_Sentiment
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES SECURITIES AN...,,,,0.0,0.0,0.0
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,Item 1. Business.\nGeneral\nAs used in this An...,Item 7. Management’s Discussion and Analysis o...,,-0.005587,-0.013685,0.0
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,ITEM\n1. BUSINESS\nIntroduction\n8i\nEnterpris...,ITEM\n7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...,ITEM\n7A. QUANTITATIVE AND QUALITATIVE DISCLOS...,-0.003749,-0.006018,-0.010383
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,,,ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSU...,0.0,0.0,-0.009829
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,,,,0.0,0.0,0.0


In [48]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load FinBERT model
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Device set to use mps:0


In [49]:
from nltk.tokenize import sent_tokenize
import numpy as np
import nltk
nltk.download('punkt')

def get_finbert_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) < 10:
        return 0.0

    sentences = sent_tokenize(text)
    chunks = [' '.join(sentences[i:i+5]) for i in range(0, len(sentences), 5)]
    sentiments = []

    for chunk in chunks:
        try:
            result = finbert(chunk[:512])[0]
            score = {"positive": 1, "neutral": 0, "negative": -1}[result['label'].lower()]
            sentiments.append(score)
        except:
            continue

    return np.mean(sentiments) if sentiments else 0.0

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nareshchethala/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
df_fin["Item7_FinBERT_Sentiment"] = df_fin["Item 7 Text"].apply(get_finbert_sentiment)
df_fin["Item1_FinBERT_Sentiment"] = df_fin["Item 1 Text"].apply(get_finbert_sentiment)
df_fin["Item7A_FinBERT_Sentiment"] = df_fin["Item 7A Text"].apply(get_finbert_sentiment)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fin["Item7_FinBERT_Sentiment"] = df_fin["Item 7 Text"].apply(get_finbert_sentiment)


In [51]:
df_fin.head()

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Item 1 Text,Item 7 Text,Item 7A Text,Item7_FinBERT_Sentiment,Item1_FinBERT_Sentiment,Item7A_FinBERT_Sentiment
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,,,,0.0,0.0,0.0
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,Item 1. Business.\nGeneral\nAs used in this An...,Item 7. Management’s Discussion and Analysis o...,,-0.070175,0.083333,0.0
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,ITEM\n1. BUSINESS\nIntroduction\n8i\nEnterpris...,ITEM\n7. MANAGEMENT’S DISCUSSION AND ANALYSIS ...,ITEM\n7A. QUANTITATIVE AND QUALITATIVE DISCLOS...,-0.083333,0.113636,-0.027273
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,,,ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSU...,0.0,0.0,0.0
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,,,,0.0,0.0,0.0


In [None]:
# # Root directory of your data
# root_dir = '/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/Files'

# # Store extracted records
# records = []

# # Traverse all year folders with tqdm
# for year in tqdm(os.listdir(root_dir), desc="Years"):
#     year_path = os.path.join(root_dir, year)
#     if not os.path.isdir(year_path):
#         continue

#     # Traverse quarter folders
#     for quarter in os.listdir(year_path):
#         quarter_path = os.path.join(year_path, quarter)
#         if not os.path.isdir(quarter_path):
#             continue

#         # Traverse all files in quarter folder
#         for filename in os.listdir(quarter_path):
#             # Match only pure 10-K files 
#             if (
#                 '10-K' in filename.upper() and 
#                 '10-K/A' not in filename.upper() and 
#                 '10-K-A' not in filename.upper() and 
#                 filename.lower().endswith('.txt')
#             ):
#                 try:
#                     file_path = os.path.join(quarter_path, filename)

#                     # Read file content
#                     with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
#                         text = file.read()

#                     # Split filename into parts
#                     parts = filename.split('_')
#                     filing_date = parts[0]
#                     filing_type = parts[1]
#                     cik = parts[3]
#                     accession = parts[4].replace('.txt', '')

#                     # Parse date
#                     date_obj = datetime.strptime(filing_date, "%Y%m%d")
#                     year_parsed = date_obj.year
#                     month = date_obj.month
#                     day = date_obj.day

#                     # Append record
#                     records.append({
#                         'year_folder': year,
#                         'quarter_folder': quarter,
#                         'filing_date': filing_date,
#                         'year': year_parsed,
#                         'month': month,
#                         'day': day,
#                         'filing_type': filing_type,
#                         'cik': cik,
#                         'accession': accession,
#                         'filename': filename,
#                         'text': text
#                     })

#                 except Exception as e:
#                     print(f" Error reading {file_path}: {e}")

Years: 100%|██████████████████████████████████████| 5/5 [00:42<00:00,  8.43s/it]


In [None]:
#df_10k_final = pd.DataFrame(records)

In [None]:
#df_10k_final.columns

Index(['year_folder', 'quarter_folder', 'filing_date', 'year', 'month', 'day',
       'filing_type', 'cik', 'accession', 'filename', 'text'],
      dtype='object')

In [None]:
#df_10k_final.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...


In [None]:
#df_10k_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29302 entries, 0 to 29301
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   year_folder     29302 non-null  object
 1   quarter_folder  29302 non-null  object
 2   filing_date     29302 non-null  object
 3   year            29302 non-null  int64 
 4   month           29302 non-null  int64 
 5   day             29302 non-null  int64 
 6   filing_type     29302 non-null  object
 7   cik             29302 non-null  object
 8   accession       29302 non-null  object
 9   filename        29302 non-null  object
 10  text            29302 non-null  object
dtypes: int64(3), object(8)
memory usage: 2.5+ MB


In [None]:
#df_10k_final.to_csv("10K_filings_all_years.csv", index=False)

In [None]:
#pwd

'/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project'

In [13]:
import re

def clean_10k_text(text):
    if pd.isna(text):
        return ""

    # Remove HTML/XML tags (if any)
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove SEC header section if present
    text = re.sub(r'(?s)<SEC-Header>.*?</SEC-Header>', ' ', text)
    text = re.sub(r'(?s)<Header>.*?</Header>', ' ', text)

    # Remove file stats or metadata-like sections
    text = re.sub(r'(?i)file name.*?\.txt', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # Remove duplicate underscores, dashes, and asterisks
    text = re.sub(r'[_*=-]{2,}', ' ', text)

    # Collapse multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    # Strip leading/trailing spaces
    return text.strip()

In [None]:
#df_100=df.head(100)

In [14]:
filings_df.head()

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Filing Text,Cleaned Text
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES SECURITIES AN...
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",


In [15]:
df_100 = filings_df.head(100)

In [17]:
df_100["cleaned_text"] = df_100["Cleaned Text"].apply(clean_10k_text)  #11mins

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_100["cleaned_text"] = df_100["Cleaned Text"].apply(clean_10k_text)  #11mins


In [18]:
display(df_100)

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Filing Text,Cleaned Text,cleaned_text
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES SECURITIES AN...,Table of Contents\nUNITED STATES SECURITIES AN...
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,
...,...,...,...,...,...,...,...
95,CONAGRA BRANDS INC.,23217,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,
96,CONCIERGE TECHNOLOGIES INC,1005101,2020-09,https://www.sec.gov/Archives/edgar/data/100510...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES\nSECURITIES A...,Table of Contents\nUNITED STATES\nSECURITIES A...
97,CONSUMERS BANCORP INC /OH/,1006830,2020-09,https://www.sec.gov/Archives/edgar/data/100683...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...
98,COPART INC,900075,2020-09,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,


In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")
model.to(device)
model.eval()

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [25]:
def chunk_text_by_tokens(text, tokenizer, max_tokens=512):
    tokens = tokenizer.tokenize(text)
    chunks = []

    for i in range(0, len(tokens), max_tokens - 50):  # 50 buffer for [CLS], [SEP], etc.
        chunk_tokens = tokens[i:i + max_tokens - 50]
        chunk = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk)

    return chunks

In [24]:
def get_sentiment_scores(chunks):
    sentiments = []
    for chunk in chunks:
        # Tokenize with truncation and ensure it's not oversized
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        # Safety check: discard if too long after tokenization (rare but cautious)
        if inputs['input_ids'].shape[1] > 512:
            print("Skipping oversized chunk")
            continue

        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        label = torch.argmax(probs, dim=1).item()
        sentiments.append(label)

    return sentiments

In [28]:
from collections import Counter

def aggregate_sentiment(sentiments):
    if not sentiments:  # catch empty list
        return {"negative": 0, "neutral": 0, "positive": 0}

    count = Counter(sentiments)
    total = len(sentiments)
    return {
        "negative": count[0] / total,
        "neutral": count[1] / total,
        "positive": count[2] / total
    }

In [29]:
from tqdm import tqdm

sentiment_results = []

for i, row in tqdm(df_100.iterrows(), total=len(df_100)):
    try:
        print(f"Processing {i+1}/{len(df_100)}: {row['Company Name']}")

        # Chunk using token-aware chunking
        text_chunks = chunk_text_by_tokens(row['cleaned_text'], tokenizer)

        # Get chunk-level sentiments
        chunk_sentiments = get_sentiment_scores(text_chunks)

        # Aggregate to document-level sentiment
        result = aggregate_sentiment(chunk_sentiments)

    except Exception as e:
        print(f"Error processing row {i}: {e}")
        result = {"negative": None, "neutral": None, "positive": None}

    sentiment_results.append(result)

  0%|          | 0/100 [00:00<?, ?it/s]

Processing 1/100: 1 800 FLOWERS COM INC


  1%|          | 1/100 [01:01<1:41:17, 61.39s/it]

Processing 2/100: 3AM TECHNOLOGIES INC


  2%|▏         | 2/100 [01:11<51:18, 31.41s/it]  

Processing 3/100: 8i Enterprises Acquisition Corp.


  3%|▎         | 3/100 [01:51<57:10, 35.36s/it]

Processing 4/100: A-Mark Precious Metals, Inc.


  4%|▍         | 4/100 [03:16<1:27:51, 54.91s/it]

Processing 5/100: AAR CORP
Processing 6/100: ABCO Energy, Inc.


  6%|▌         | 6/100 [03:47<53:17, 34.01s/it]  

Processing 7/100: ACCURAY INC
Processing 8/100: ADM TRONICS UNLIMITED, INC.


  8%|▊         | 8/100 [04:09<36:52, 24.04s/it]

Processing 9/100: ADMIRAL FINANCIAL CORP


  9%|▉         | 9/100 [04:16<30:19, 19.99s/it]

Processing 10/100: ADVANCED OXYGEN TECHNOLOGIES INC


 10%|█         | 10/100 [04:39<31:18, 20.87s/it]

Processing 11/100: AEHR TEST SYSTEMS


 11%|█         | 11/100 [05:24<40:18, 27.18s/it]

Processing 12/100: ALKALINE WATER Co INC


 12%|█▏        | 12/100 [06:09<47:02, 32.07s/it]

Processing 13/100: ALLIED HEALTHCARE PRODUCTS INC


 13%|█▎        | 13/100 [06:37<44:44, 30.86s/it]

Processing 14/100: ALPHA & OMEGA SEMICONDUCTOR Ltd
Processing 15/100: ALPHA ENERGY INC


 15%|█▌        | 15/100 [07:02<32:05, 22.65s/it]

Processing 16/100: AMERICAN SOFTWARE INC


 16%|█▌        | 16/100 [07:54<41:39, 29.75s/it]

Processing 17/100: AMERITYRE CORP
Processing 18/100: AMMO, INC.


 18%|█▊        | 18/100 [08:49<39:33, 28.95s/it]

Processing 19/100: AMREP CORP.


 19%|█▉        | 19/100 [09:20<39:39, 29.38s/it]

Processing 20/100: ANDES 7 INC.


 20%|██        | 20/100 [09:38<35:16, 26.46s/it]

Processing 21/100: ANGIODYNAMICS INC
Processing 22/100: ANVI GLOBAL HOLDINGS, INC.


 22%|██▏       | 22/100 [09:52<23:59, 18.46s/it]

Processing 23/100: APPIPHANY TECHNOLOGIES HOLDINGS CORP


 23%|██▎       | 23/100 [10:16<25:18, 19.72s/it]

Processing 24/100: APPLIED GENETIC TECHNOLOGIES CORP


 24%|██▍       | 24/100 [11:50<48:15, 38.10s/it]

Processing 25/100: APPLIED INDUSTRIAL TECHNOLOGIES INC
Processing 26/100: APPlife Digital Solutions Inc


 26%|██▌       | 26/100 [12:05<31:18, 25.39s/it]

Processing 27/100: ASPEN GROUP, INC.
Processing 28/100: ASTROTECH Corp


 28%|██▊       | 28/100 [12:53<29:52, 24.90s/it]

Processing 29/100: AURA SYSTEMS INC


 29%|██▉       | 29/100 [13:31<32:41, 27.62s/it]

Processing 30/100: AUTOMATIC DATA PROCESSING INC
Processing 31/100: AVIAT NETWORKS, INC.
Processing 32/100: AVNET INC
Processing 33/100: AXELEREX CORP.


 33%|███▎      | 33/100 [13:44<16:03, 14.38s/it]

Processing 34/100: Achison Inc


 34%|███▍      | 34/100 [13:55<15:10, 13.80s/it]

Processing 35/100: Addus HomeCare Corp
Processing 36/100: Adtalem Global Education Inc.
Processing 37/100: Akerna Corp.


 37%|███▋      | 37/100 [14:39<14:52, 14.16s/it]

Processing 38/100: Akoustis Technologies, Inc.


 38%|███▊      | 38/100 [15:21<19:25, 18.80s/it]

Processing 39/100: All State Properties Holdings, Inc.


 39%|███▉      | 39/100 [15:32<17:31, 17.24s/it]

Processing 40/100: All State Properties Holdings, Inc.


 40%|████      | 40/100 [15:43<15:50, 15.84s/it]

Processing 41/100: Amcor plc
Processing 42/100: Amesite Inc.


 42%|████▏     | 42/100 [16:09<14:21, 14.86s/it]

Processing 43/100: Anvia Holdings Corp


 43%|████▎     | 43/100 [16:40<17:17, 18.21s/it]

Processing 44/100: Ariel Clean Energy, Inc.
Processing 45/100: Arma Services Inc


 45%|████▌     | 45/100 [16:47<11:43, 12.78s/it]

Processing 46/100: Arrestage International, Inc.


 46%|████▌     | 46/100 [17:06<12:40, 14.08s/it]

Processing 47/100: Artificial Intelligence Technology Solutions Inc.


 47%|████▋     | 47/100 [17:43<17:07, 19.38s/it]

Processing 48/100: Avita Therapeutics, Inc.


 48%|████▊     | 48/100 [18:32<23:15, 26.83s/it]

Processing 49/100: Axos Financial, Inc.
Processing 50/100: B2Digital, Inc.


 50%|█████     | 50/100 [19:03<18:20, 22.00s/it]

Processing 51/100: BIO-TECHNE Corp
Processing 52/100: BIOMERICA INC


 52%|█████▏    | 52/100 [19:36<15:57, 19.94s/it]

Processing 53/100: BION ENVIRONMENTAL TECHNOLOGIES INC


 53%|█████▎    | 53/100 [20:38<22:32, 28.79s/it]

Processing 54/100: BIOSYNERGY INC


 54%|█████▍    | 54/100 [20:57<20:23, 26.60s/it]

Processing 55/100: BIOTRICITY INC.


 55%|█████▌    | 55/100 [21:38<22:35, 30.11s/it]

Processing 56/100: BIOVIE INC.


 56%|█████▌    | 56/100 [22:18<24:05, 32.84s/it]

Processing 57/100: BLACK CACTUS GLOBAL, INC.


 57%|█████▋    | 57/100 [22:38<20:58, 29.27s/it]

Processing 58/100: BOTTOMLINE TECHNOLOGIES INC /DE/
Processing 59/100: BOWL AMERICA INC


 59%|█████▉    | 59/100 [22:51<13:09, 19.27s/it]

Processing 60/100: BRADY CORP
Processing 61/100: BRIDGEWAY NATIONAL CORP.


 61%|██████    | 61/100 [23:15<10:46, 16.57s/it]

Processing 62/100: BRINKER INTERNATIONAL, INC
Processing 63/100: BROADRIDGE FINANCIAL SOLUTIONS, INC.
Processing 64/100: BUTLER NATIONAL CORP


 64%|██████▍   | 64/100 [23:39<07:33, 12.61s/it]

Processing 65/100: Barnes & Noble Education, Inc.


 65%|██████▌   | 65/100 [24:41<12:29, 21.40s/it]

Processing 66/100: Benitec Biopharma Inc.


 66%|██████▌   | 66/100 [25:48<17:19, 30.58s/it]

Processing 67/100: Bill.com Holdings, Inc.
Processing 68/100: Blox, Inc.


 68%|██████▊   | 68/100 [26:12<12:41, 23.81s/it]

Processing 69/100: Blubuzzard, Inc.


 69%|██████▉   | 69/100 [26:26<11:16, 21.81s/it]

Processing 70/100: Blue Line Protection Group, Inc.


 70%|███████   | 70/100 [26:53<11:27, 22.91s/it]

Processing 71/100: Bunker Hill Mining Corp.


 71%|███████   | 71/100 [27:25<12:11, 25.24s/it]

Processing 72/100: C & C TOURS INC


 72%|███████▏  | 72/100 [27:35<09:50, 21.10s/it]

Processing 73/100: CACI INTERNATIONAL INC /DE/
Processing 74/100: CAL-MAINE FOODS INC
Processing 75/100: CAMPBELL SOUP CO
Processing 76/100: CANNABIS SUISSE CORP.


 76%|███████▌  | 76/100 [27:35<03:24,  8.52s/it]

Processing 77/100: CANTEL MEDICAL CORP
Processing 78/100: CARDINAL HEALTH INC
Processing 79/100: CARPENTER TECHNOLOGY CORP
Processing 80/100: CARVER BANCORP INC
Processing 81/100: CCUR Holdings, Inc.


 81%|████████  | 81/100 [28:24<02:55,  9.24s/it]

Processing 82/100: CDK Global, Inc.
Processing 83/100: CHAMPIONS ONCOLOGY, INC.


 83%|████████▎ | 83/100 [29:00<03:10, 11.20s/it]

Processing 84/100: CHARLES & COLVARD LTD


 84%|████████▍ | 84/100 [30:08<05:05, 19.08s/it]

Processing 85/100: CHASE GENERAL CORP


 85%|████████▌ | 85/100 [30:27<04:46, 19.13s/it]

Processing 86/100: CHINA JO-JO DRUGSTORES, INC.


 86%|████████▌ | 86/100 [31:37<06:45, 28.98s/it]

Processing 87/100: CHINA MEDIA INC.


 87%|████████▋ | 87/100 [31:53<05:39, 26.15s/it]

Processing 88/100: CIMPRESS plc
Processing 89/100: CINTAS CORP
Processing 90/100: CISCO SYSTEMS, INC.
Processing 91/100: CLOROX CO /DE/
Processing 92/100: CLS Holdings USA, Inc.


 92%|█████████▏| 92/100 [33:08<02:34, 19.36s/it]

Processing 93/100: COCOLUV INC.


 93%|█████████▎| 93/100 [33:18<02:06, 18.04s/it]

Processing 94/100: COLLECTORS UNIVERSE INC


 94%|█████████▍| 94/100 [34:08<02:20, 23.48s/it]

Processing 95/100: COMTECH TELECOMMUNICATIONS CORP /DE/
Processing 96/100: CONAGRA BRANDS INC.
Processing 97/100: CONCIERGE TECHNOLOGIES INC


 97%|█████████▋| 97/100 [34:44<00:56, 18.68s/it]

Processing 98/100: CONSUMERS BANCORP INC /OH/


100%|██████████| 100/100 [35:17<00:00, 21.17s/it]

Processing 99/100: COPART INC
Processing 100/100: COTY INC.





In [30]:
# Convert to DataFrame
sentiment_df = pd.DataFrame(sentiment_results)

# Combine with your original df
final_df = pd.concat([df.reset_index(drop=True), sentiment_df], axis=1)

In [31]:
final_df.head()

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Filing Text,Cleaned Text,cleaned_text,negative,neutral,positive
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES SECURITIES AN...,Table of Contents\nUNITED STATES SECURITIES AN...,0.666667,0.142857,0.190476
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,0.928571,0.0,0.071429
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,0.978947,0.021053,0.0
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,0.877301,0.02454,0.09816
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,,0.0,0.0,0.0


In [32]:
final_df[['positive', 'neutral', 'negative']].describe()

Unnamed: 0,positive,neutral,negative
count,100.0,100.0,100.0
mean,0.063315,0.012926,0.553759
std,0.07575,0.024526,0.43233
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.02628,0.0,0.799866
75%,0.126111,0.016841,0.925893
max,0.245902,0.142857,1.0


In [33]:
avg_sentiment = final_df[['positive', 'neutral', 'negative']].mean()
print("Average Sentiment Scores:\n", avg_sentiment)

Average Sentiment Scores:
 positive    0.063315
neutral     0.012926
negative    0.553759
dtype: float64


In [34]:
def dominant_sentiment(row):
    return max(row, key=row.get)

final_df['dominant_sentiment'] = final_df[['positive', 'neutral', 'negative']].apply(dominant_sentiment, axis=1)

# Count occurrences
final_df['dominant_sentiment'].value_counts()

TypeError: '>' not supported between instances of 'NoneType' and 'NoneType'

In [42]:
import re

# def extract_section(text, item_title):
#     """
#     Extracts the section of a 10-K filing based on the provided item title.
#     """
#     text = text.upper()
#     item_title = item_title.upper().strip()

#     # Handle different spacing and numbering formats (e.g., ITEM 1A, ITEM1A.)
#     pattern = rf"{item_title}.*?(?=ITEM\s+\d+[A-Z]?[.\s])"
#     match = re.search(pattern, text, re.DOTALL)
#     if match:
#         return match.group().strip()
#     return ""

def extract_section(text, item_title):
    text = text.upper()
    item_title = item_title.upper().strip()
    pattern = rf"{item_title}.*?(?=ITEM\s+\d+[A-Z]?[.\s])"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group().strip()
    return ""

In [41]:
SECTION_MAP = {
    "risk": "ITEM 1A. RISK FACTORS",
    "mdna": "ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS",
    "business": "ITEM 1. BUSINESS"
}

In [43]:
from tqdm import tqdm

all_section_sentiments = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    row_result = {}

    for section_key, section_title in SECTION_MAP.items():
        try:
            section_text = extract_section(row['cleaned_text'], section_title)
            if not section_text or len(section_text.split()) < 20:
                row_result[f"{section_key}_sentiment_positive"] = 0
                row_result[f"{section_key}_sentiment_neutral"] = 0
                row_result[f"{section_key}_sentiment_negative"] = 0
                continue

            # Chunk, Analyze, Aggregate
            chunks = chunk_text_by_tokens(section_text, tokenizer)
            sentiments = get_sentiment_scores(chunks)
            aggregated = aggregate_sentiment(sentiments)

            # Store results
            for sentiment_type in ['positive', 'neutral', 'negative']:
                row_result[f"{section_key}_sentiment_{sentiment_type}"] = aggregated[sentiment_type]

        except Exception as e:
            print(f"Error in row {i}, section {section_key}: {e}")
            for sentiment_type in ['positive', 'neutral', 'negative']:
                row_result[f"{section_key}_sentiment_{sentiment_type}"] = None

    all_section_sentiments.append(row_result)

100%|██████████| 100/100 [02:43<00:00,  1.64s/it]


In [44]:
section_df = pd.DataFrame(all_section_sentiments)
final_df = pd.concat([df.reset_index(drop=True), section_df], axis=1)
final_df.to_csv("section_wise_10k_sentiment.csv", index=False)

In [49]:
final_df.head()

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Filing Text,Cleaned Text,cleaned_text,risk_sentiment_positive,risk_sentiment_neutral,risk_sentiment_negative,mdna_sentiment_positive,mdna_sentiment_neutral,mdna_sentiment_negative,business_sentiment_positive,business_sentiment_neutral,business_sentiment_negative
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES SECURITIES AN...,Table of Contents\nUNITED STATES SECURITIES AN...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,0.0,0.0,0.0,0.166667,0.0,0.833333,0.0,0.0,1.0
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
def get_sentiment_scores(chunks):
    sentiments = []
    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
        
        if inputs['input_ids'].shape[1] > 512:
            print("Skipping oversized chunk")
            continue

        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        label = torch.argmax(probs, dim=1).item()
        sentiments.append(label)

    return sentiments

In [37]:
from tqdm import tqdm

def extract_relevant_sections(full_text):
    """
    Extracts important 10-K sections for sentiment analysis.
    """
    sections = [
        "ITEM 1A. RISK FACTORS",
        "ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS",
        "ITEM 1. BUSINESS"
    ]
    extracted = ""
    for section in sections:
        extracted += "\n" + extract_section(full_text, section)
    return extracted.strip()

In [38]:
sentiment_results = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        print(f"Processing {i+1}/{len(df)}: {row['Company Name']}")

        # Step 1: Extract key sections
        focus_text = extract_relevant_sections(row['cleaned_text'])

        # Step 2: Chunk by tokens
        text_chunks = chunk_text_by_tokens(focus_text, tokenizer)

        # Step 3: Get chunk-level sentiments
        chunk_sentiments = get_sentiment_scores(text_chunks)

        # Step 4: Aggregate to document-level
        result = aggregate_sentiment(chunk_sentiments)

    except Exception as e:
        print(f"Error on row {i}: {e}")
        result = {"negative": None, "neutral": None, "positive": None}

    sentiment_results.append(result)

  0%|          | 0/100 [00:00<?, ?it/s]

Processing 1/100: 1 800 FLOWERS COM INC
Processing 2/100: 3AM TECHNOLOGIES INC


  2%|▏         | 2/100 [00:03<02:48,  1.72s/it]

Processing 3/100: 8i Enterprises Acquisition Corp.
Processing 4/100: A-Mark Precious Metals, Inc.
Processing 5/100: AAR CORP
Processing 6/100: ABCO Energy, Inc.


  6%|▌         | 6/100 [00:07<01:50,  1.17s/it]

Processing 7/100: ACCURAY INC
Processing 8/100: ADM TRONICS UNLIMITED, INC.


  8%|▊         | 8/100 [00:11<02:20,  1.52s/it]

Processing 9/100: ADMIRAL FINANCIAL CORP
Processing 10/100: ADVANCED OXYGEN TECHNOLOGIES INC


 10%|█         | 10/100 [00:13<01:50,  1.23s/it]

Processing 11/100: AEHR TEST SYSTEMS


 11%|█         | 11/100 [00:13<01:32,  1.04s/it]

Processing 12/100: ALKALINE WATER Co INC


 12%|█▏        | 12/100 [00:24<04:47,  3.26s/it]

Processing 13/100: ALLIED HEALTHCARE PRODUCTS INC


 13%|█▎        | 13/100 [00:27<04:24,  3.05s/it]

Processing 14/100: ALPHA & OMEGA SEMICONDUCTOR Ltd
Processing 15/100: ALPHA ENERGY INC


 15%|█▌        | 15/100 [00:35<05:01,  3.54s/it]

Processing 16/100: AMERICAN SOFTWARE INC


 16%|█▌        | 16/100 [00:36<03:59,  2.85s/it]

Processing 17/100: AMERITYRE CORP
Processing 18/100: AMMO, INC.
Processing 19/100: AMREP CORP.
Processing 20/100: ANDES 7 INC.


 20%|██        | 20/100 [00:38<02:11,  1.64s/it]

Processing 21/100: ANGIODYNAMICS INC
Processing 22/100: ANVI GLOBAL HOLDINGS, INC.


 22%|██▏       | 22/100 [00:40<01:55,  1.48s/it]

Processing 23/100: APPIPHANY TECHNOLOGIES HOLDINGS CORP


 26%|██▌       | 26/100 [00:41<00:58,  1.27it/s]

Processing 24/100: APPLIED GENETIC TECHNOLOGIES CORP
Processing 25/100: APPLIED INDUSTRIAL TECHNOLOGIES INC
Processing 26/100: APPlife Digital Solutions Inc
Processing 27/100: ASPEN GROUP, INC.
Processing 28/100: ASTROTECH Corp


 28%|██▊       | 28/100 [00:59<03:42,  3.09s/it]

Processing 29/100: AURA SYSTEMS INC


 29%|██▉       | 29/100 [01:01<03:28,  2.94s/it]

Processing 30/100: AUTOMATIC DATA PROCESSING INC
Processing 31/100: AVIAT NETWORKS, INC.
Processing 32/100: AVNET INC
Processing 33/100: AXELEREX CORP.


 33%|███▎      | 33/100 [01:02<01:47,  1.61s/it]

Processing 34/100: Achison Inc


 34%|███▍      | 34/100 [01:03<01:36,  1.46s/it]

Processing 35/100: Addus HomeCare Corp
Processing 36/100: Adtalem Global Education Inc.
Processing 37/100: Akerna Corp.
Processing 38/100: Akoustis Technologies, Inc.


 38%|███▊      | 38/100 [01:06<01:14,  1.20s/it]

Processing 39/100: All State Properties Holdings, Inc.


 39%|███▉      | 39/100 [01:08<01:21,  1.33s/it]

Processing 40/100: All State Properties Holdings, Inc.


 40%|████      | 40/100 [01:10<01:27,  1.46s/it]

Processing 41/100: Amcor plc
Processing 42/100: Amesite Inc.


 45%|████▌     | 45/100 [01:16<01:00,  1.10s/it]

Processing 43/100: Anvia Holdings Corp
Processing 44/100: Ariel Clean Energy, Inc.
Processing 45/100: Arma Services Inc
Processing 46/100: Arrestage International, Inc.
Processing 47/100: Artificial Intelligence Technology Solutions Inc.


 47%|████▋     | 47/100 [01:24<01:43,  1.96s/it]

Processing 48/100: Avita Therapeutics, Inc.


 48%|████▊     | 48/100 [01:26<01:40,  1.94s/it]

Processing 49/100: Axos Financial, Inc.
Processing 50/100: B2Digital, Inc.
Processing 51/100: BIO-TECHNE Corp
Processing 52/100: BIOMERICA INC


 52%|█████▏    | 52/100 [01:39<02:01,  2.53s/it]

Processing 53/100: BION ENVIRONMENTAL TECHNOLOGIES INC


 55%|█████▌    | 55/100 [02:02<03:00,  4.01s/it]

Processing 54/100: BIOSYNERGY INC
Processing 55/100: BIOTRICITY INC.
Processing 56/100: BIOVIE INC.
Processing 57/100: BLACK CACTUS GLOBAL, INC.


 57%|█████▋    | 57/100 [02:06<02:23,  3.34s/it]

Processing 58/100: BOTTOMLINE TECHNOLOGIES INC /DE/
Processing 59/100: BOWL AMERICA INC


 59%|█████▉    | 59/100 [02:07<01:40,  2.46s/it]

Processing 60/100: BRADY CORP
Processing 61/100: BRIDGEWAY NATIONAL CORP.


 64%|██████▍   | 64/100 [02:07<00:39,  1.09s/it]

Processing 62/100: BRINKER INTERNATIONAL, INC
Processing 63/100: BROADRIDGE FINANCIAL SOLUTIONS, INC.
Processing 64/100: BUTLER NATIONAL CORP
Processing 65/100: Barnes & Noble Education, Inc.


 66%|██████▌   | 66/100 [02:08<00:31,  1.08it/s]

Processing 66/100: Benitec Biopharma Inc.
Processing 67/100: Bill.com Holdings, Inc.
Processing 68/100: Blox, Inc.
Processing 69/100: Blubuzzard, Inc.


 69%|██████▉   | 69/100 [02:10<00:25,  1.22it/s]

Processing 70/100: Blue Line Protection Group, Inc.
Processing 71/100: Bunker Hill Mining Corp.
Processing 72/100: C & C TOURS INC


 72%|███████▏  | 72/100 [02:12<00:21,  1.30it/s]

Processing 73/100: CACI INTERNATIONAL INC /DE/
Processing 74/100: CAL-MAINE FOODS INC
Processing 75/100: CAMPBELL SOUP CO
Processing 76/100: CANNABIS SUISSE CORP.
Processing 77/100: CANTEL MEDICAL CORP
Processing 78/100: CARDINAL HEALTH INC
Processing 79/100: CARPENTER TECHNOLOGY CORP
Processing 80/100: CARVER BANCORP INC
Processing 81/100: CCUR Holdings, Inc.


 81%|████████  | 81/100 [02:15<00:10,  1.86it/s]

Processing 82/100: CDK Global, Inc.
Processing 83/100: CHAMPIONS ONCOLOGY, INC.
Processing 84/100: CHARLES & COLVARD LTD
Processing 85/100: CHASE GENERAL CORP
Processing 86/100: CHINA JO-JO DRUGSTORES, INC.
Processing 87/100: CHINA MEDIA INC.
Processing 88/100: CIMPRESS plc
Processing 89/100: CINTAS CORP
Processing 90/100: CISCO SYSTEMS, INC.
Processing 91/100: CLOROX CO /DE/
Processing 92/100: CLS Holdings USA, Inc.


100%|██████████| 100/100 [02:44<00:00,  1.65s/it]

Processing 93/100: COCOLUV INC.
Processing 94/100: COLLECTORS UNIVERSE INC
Processing 95/100: COMTECH TELECOMMUNICATIONS CORP /DE/
Processing 96/100: CONAGRA BRANDS INC.
Processing 97/100: CONCIERGE TECHNOLOGIES INC
Processing 98/100: CONSUMERS BANCORP INC /OH/
Processing 99/100: COPART INC
Processing 100/100: COTY INC.





In [39]:
# Join results and export
sentiment_df = pd.DataFrame(sentiment_results)
final_df = pd.concat([df.reset_index(drop=True), sentiment_df], axis=1)
final_df.to_csv("10k_sentiment_focused_sections.csv", index=False)

In [40]:
final_df.head()

Unnamed: 0,Company Name,CIK,Date Filed,Filing URL,Filing Text,Cleaned Text,cleaned_text,negative,neutral,positive
0,1 800 FLOWERS COM INC,1084869,2020-09,https://www.sec.gov/Archives/edgar/data/108486...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,Table of Contents\nUNITED STATES SECURITIES AN...,Table of Contents\nUNITED STATES SECURITIES AN...,0.0,0.0,0.0
1,3AM TECHNOLOGIES INC,1667615,2020-09,https://www.sec.gov/Archives/edgar/data/166761...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,1.0,0.0,0.0
2,8i Enterprises Acquisition Corp.,1753648,2020-09,https://www.sec.gov/Archives/edgar/data/175364...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,UNITED\nSTATES\nSECURITIES\nAND EXCHANGE COMMI...,0.0,0.0,0.0
3,"A-Mark Precious Metals, Inc.",1591588,2020-09,https://www.sec.gov/Archives/edgar/data/159158...,<DOCUMENT>\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,UNITED STATES\nSECURITIES AND EXCHANGE COMMISS...,0.0,0.0,0.0
4,AAR CORP,1750,2020-07,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT...",,,0.0,0.0,0.0


In [None]:
# import pandas as pd
# # # Load your dataframe
# # df = pd.read_csv("your_cleaned_10k_dataframe.csv")

# # Optional: keep only non-null cleaned_text
# df = df_100[df_100['cleaned_text'].notnull()].reset_index(drop=True)

# # Storage
# sentiment_results = []

# # Iterate through each filing
# for i, row in df.iterrows():
#     print(f"Processing {i+1}/{len(df)}: {row['Company Name']}")
#     text_chunks = chunk_text(row['cleaned_text'])
#     chunk_sentiments = get_sentiment_scores(text_chunks)
#     result = aggregate_sentiment(chunk_sentiments)
#     sentiment_results.append(result)

# # Convert results to DataFrame
# sentiment_df = pd.DataFrame(sentiment_results)

# # Combine with original data
# final_df = pd.concat([df, sentiment_df], axis=1)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processing 1/100: 1 800 FLOWERS COM INC


RuntimeError: The size of tensor a (597) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
#df.to_csv("10K_filings_cleaned.csv", index=False)

In [7]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
import re

def extract_item(text, item_number):
    """
    Extracts section like ITEM 1A, ITEM 7 from 10-K text
    """
    pattern = rf'(ITEM\s+{item_number}[^\n]*)(.*?)(ITEM\s+\d+[A-Z]?)'
    match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(2).strip()
    return None

In [8]:
def chunk_text(text, chunk_size=500, overlap=50):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

In [23]:
pip install torch torchvision torchaudio

18283.26s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install torch torchvision torchaudio

In [9]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [13]:
from transformers import pipeline

finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Device set to use mps:0


In [10]:
from collections import Counter

def analyze_sentiment(text):
    chunks = chunk_text(text)
    labels = []

    for chunk in chunks:
        result = finbert(chunk[:512])[0]  # Truncate to 512 tokens
        labels.append(result["label"])

    # Aggregate: count the most common sentiment
    count = Counter(labels)
    dominant_sentiment = count.most_common(1)[0][0]
    
    return dominant_sentiment, dict(count)

In [11]:
df_1000 = df.head(1000)

In [14]:
# Assuming df_1000 contains a 'cleaned_text' column
df_1000["sentiment_result"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[0])
df_1000["sentiment_breakdown"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1000["sentiment_result"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1000["sentiment_breakdown"] = df_1000["cleaned_text"].apply(lambda x: analyze_sentiment(x)[1])


In [16]:
df_1000.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,sentiment_result,sentiment_breakdown
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,neutral,"{'neutral': 72, 'positive': 11, 'negative': 16}"
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,neutral,"{'neutral': 27, 'positive': 2, 'negative': 3}"
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,neutral,"{'neutral': 101, 'positive': 4, 'negative': 9}"
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,neutral,"{'neutral': 144, 'positive': 13, 'negative': 14}"
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,neutral,"{'neutral': 102, 'positive': 6, 'negative': 9}"


In [17]:
df_1000.to_csv("10K_with_sentiment.csv", index=False)
#df_1000[["file_name", "sentiment_result", "sentiment_breakdown"]].head()

## Using LM lexicon

In [1]:
import pandas as pd
df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/10K_filings_all_years.csv")  #5m 33sec

In [2]:
df.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...


In [3]:
def basic_cleaner(text):
    if pd.isna(text):
        return ""

    # Remove HTML/XML tags
    text = re.sub(r'<[^>]+>', ' ', text)

    # Remove SEC headers
    text = re.sub(r'(?s)<SEC-Header>.*?</SEC-Header>', ' ', text)
    text = re.sub(r'(?s)<Header>.*?</Header>', ' ', text)

    # Remove file meta like "file name xyz.txt"
    text = re.sub(r'(?i)file name.*?\.txt', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+', ' ', text)

    # Remove excessive punctuation and underscores
    text = re.sub(r'[_*=-]{2,}', ' ', text)

    # Collapse multiple newlines and spaces
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text.strip()

In [9]:
df_100=df.head(100)

In [10]:

# Apply initial cleaning
df_100["cleaned_text"] = df_100["text"].apply(basic_cleaner) #10min 32.4

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_100["cleaned_text"] = df_100["text"].apply(basic_cleaner) #10min 32.4


In [10]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [11]:
# Download stopwords
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nareshchethala/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Final preprocessing for LM-based sentiment
def clean_for_lm_modeling(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = text.encode("ascii", "ignore").decode()
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\d+", " ", text)
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)

# Apply to cleaned_text
df_100["normalized_text"] = df_100["cleaned_text"].apply(clean_for_lm_modeling)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_100["normalized_text"] = df_100["cleaned_text"].apply(clean_for_lm_modeling)


In [13]:
df_100.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,normalized_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...


In [None]:
#df_1000 = df.head(1000).copy()

In [None]:
#df_1000.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,normalized_text
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...


In [14]:
lm_df = pd.read_csv("/Users/nareshchethala/Desktop/University/Spring_25/BAIM_660/Project/Loughran-McDonald_MasterDictionary_1993-2024.csv")

In [15]:
# Load dictionary and lowercase words
lm_df["Word"] = lm_df["Word"].str.lower()

In [16]:
display(lm_df.head(10))

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,aardvark,1,755,2.95507e-08,1.945421e-08,4.078069e-06,140,0,0,0,0,0,0,0,0,2,12of12inf
1,aardvarks,2,3,1.1742e-10,8.060019e-12,8.919011e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,abaci,3,9,3.5226e-10,1.089343e-10,5.105359e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,aback,4,29,1.13506e-09,6.197922e-10,1.539279e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,abacus,5,9620,3.765268e-07,3.825261e-07,3.421836e-05,1295,0,0,0,0,0,0,0,0,3,12of12inf
5,abacuses,6,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,4,12of12inf
6,abaft,7,4,1.5656e-10,2.144787e-11,2.373367e-08,1,0,0,0,0,0,0,0,0,2,12of12inf
7,abalone,8,149,5.83186e-09,4.729504e-09,1.031859e-06,52,0,0,0,0,0,0,0,0,4,12of12inf
8,abalones,9,1,3.914e-11,7.715206e-11,8.537449e-08,1,0,0,0,0,0,0,0,0,4,12of12inf
9,abandon,10,154158,6.033745e-06,4.824004e-06,3.261271e-05,76324,2009,0,0,0,0,0,0,0,3,12of12inf


In [None]:
# Mapping column names to simplified category names
category_columns = {
    "positive": "Positive",
    "negative": "Negative",
    "uncertainty": "Uncertainty",
    "litigious": "Litigious",
    "strong_modal": "Strong_Modal",
    "weak_modal": "Weak_Modal",
    "constraining": "Constraining"
}

# Create a dictionary of category sets
lm_lexicons = {
    category: set(lm_df[lm_df[col] > 0]["Word"].str.lower())
    for category, col in category_columns.items()
}

In [19]:
# Mapping column names to simplified category names
category_columns = {
    "positive": "Positive",
    "negative": "Negative",
}

# Create a dictionary of category sets
lm_lexicons = {
    category: set(lm_df[lm_df[col] > 0]["Word"].str.lower())
    for category, col in category_columns.items()
}

In [20]:
import re
from collections import defaultdict

def lm_sentiment_all_classes(text):
    words = re.findall(r'\b\w+\b', text.lower())
    counts = defaultdict(int)

    for word in words:
        for category, word_set in lm_lexicons.items():
            if word in word_set:
                counts[category] += 1

    return dict(counts)

In [21]:
df_100["lm_sentiment_breakdown"] = df_100["normalized_text"].apply(lm_sentiment_all_classes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_100["lm_sentiment_breakdown"] = df_100["normalized_text"].apply(lm_sentiment_all_classes)


In [22]:
df_100.head()

Unnamed: 0,year_folder,quarter_folder,filing_date,year,month,day,filing_type,cik,accession,filename,text,cleaned_text,normalized_text,lm_sentiment_breakdown
0,2022,QTR2,20220520,2022,5,20,10-K,data,849399,20220520_10-K_edgar_data_849399_0000849399-22-...,<Header>\n<FileStats>\n <FileName>20220520_...,20220520_10-K_edgar_data_849399_0000849399-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'positive': 279, 'negative': 1082}"
1,2022,QTR2,20220613,2022,6,13,10-K,data,1857910,20220613_10-K_edgar_data_1857910_0001599916-22...,<Header>\n<FileStats>\n <FileName>20220613_...,20220613_10-K_edgar_data_1857910_0001599916-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'negative': 163, 'positive': 59}"
2,2022,QTR2,20220414,2022,4,14,10-K,data,1855751,20220414_10-K_edgar_data_1855751_0001493152-22...,<Header>\n<FileStats>\n <FileName>20220414_...,20220414_10-K_edgar_data_1855751_0001493152-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'negative': 741, 'positive': 275}"
3,2022,QTR2,20220404,2022,4,4,10-K,data,748790,20220404_10-K_edgar_data_748790_0001575872-22-...,<Header>\n<FileStats>\n <FileName>20220404_...,20220404_10-K_edgar_data_748790_0001575872-22-...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'negative': 1192, 'positive': 349}"
4,2022,QTR2,20220408,2022,4,8,10-K,data,1512927,20220408_10-K_edgar_data_1512927_0001410578-22...,<Header>\n<FileStats>\n <FileName>20220408_...,20220408_10-K_edgar_data_1512927_0001410578-22...,_ k_edgar_data_ _ txt hdr sgml accession numbe...,"{'negative': 507, 'positive': 289}"


In [23]:
df_100[["filename", "lm_sentiment_breakdown"]].head(10)

Unnamed: 0,filename,lm_sentiment_breakdown
0,20220520_10-K_edgar_data_849399_0000849399-22-...,"{'positive': 279, 'negative': 1082}"
1,20220613_10-K_edgar_data_1857910_0001599916-22...,"{'negative': 163, 'positive': 59}"
2,20220414_10-K_edgar_data_1855751_0001493152-22...,"{'negative': 741, 'positive': 275}"
3,20220404_10-K_edgar_data_748790_0001575872-22-...,"{'negative': 1192, 'positive': 349}"
4,20220408_10-K_edgar_data_1512927_0001410578-22...,"{'negative': 507, 'positive': 289}"
5,20220401_10-K_edgar_data_1511820_0001493152-22...,"{'negative': 203, 'positive': 39}"
6,20220429_10-K_edgar_data_1676047_0001213900-22...,"{'negative': 897, 'positive': 285}"
7,20220414_10-K_edgar_data_1637866_0001493152-22...,"{'negative': 633, 'positive': 263}"
8,20220413_10-K_edgar_data_1868269_0001104659-22...,"{'negative': 1061, 'positive': 465}"
9,20220412_10-K_edgar_data_1652958_0001683168-22...,"{'negative': 822, 'positive': 147}"


In [27]:
df_1000.to_csv("10K_with_lm_sentiment_1000.csv", index=False)

In [24]:
def dominant_lm_category(counts_dict):
    if not counts_dict:
        return "NEUTRAL"
    return max(counts_dict.items(), key=lambda x: x[1])[0].upper()

df_100["lm_dominant_sentiment"] = df_100["lm_sentiment_breakdown"].apply(dominant_lm_category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_100["lm_dominant_sentiment"] = df_100["lm_sentiment_breakdown"].apply(dominant_lm_category)


In [25]:
df_100[["filename", "lm_dominant_sentiment"]].head(10)

Unnamed: 0,filename,lm_dominant_sentiment
0,20220520_10-K_edgar_data_849399_0000849399-22-...,NEGATIVE
1,20220613_10-K_edgar_data_1857910_0001599916-22...,NEGATIVE
2,20220414_10-K_edgar_data_1855751_0001493152-22...,NEGATIVE
3,20220404_10-K_edgar_data_748790_0001575872-22-...,NEGATIVE
4,20220408_10-K_edgar_data_1512927_0001410578-22...,NEGATIVE
5,20220401_10-K_edgar_data_1511820_0001493152-22...,NEGATIVE
6,20220429_10-K_edgar_data_1676047_0001213900-22...,NEGATIVE
7,20220414_10-K_edgar_data_1637866_0001493152-22...,NEGATIVE
8,20220413_10-K_edgar_data_1868269_0001104659-22...,NEGATIVE
9,20220412_10-K_edgar_data_1652958_0001683168-22...,NEGATIVE


In [28]:
df_100.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   year_folder             100 non-null    int64 
 1   quarter_folder          100 non-null    object
 2   filing_date             100 non-null    int64 
 3   year                    100 non-null    int64 
 4   month                   100 non-null    int64 
 5   day                     100 non-null    int64 
 6   filing_type             100 non-null    object
 7   cik                     100 non-null    object
 8   accession               100 non-null    int64 
 9   filename                100 non-null    object
 10  text                    100 non-null    object
 11  cleaned_text            100 non-null    object
 12  normalized_text         100 non-null    object
 13  lm_sentiment_breakdown  100 non-null    object
 14  lm_dominant_sentiment   100 non-null    object
dtypes: int6

In [30]:
df_100["lm_dominant_sentiment"].describe()

count          100
unique           1
top       NEGATIVE
freq           100
Name: lm_dominant_sentiment, dtype: object