In [115]:
import os
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [116]:
def list_filenames(directory):
    """
    Lists all HTML filenames in the specified directory and stores them in a DataFrame.
    
    Args:
        directory (str): The path to the directory containing the files.
    
    Returns:
        pd.DataFrame: A DataFrame with two columns, 'filename' and 'EPS', where 'EPS' is initially empty.
    """
    dt = pd.DataFrame(columns=['filename', 'EPS'])
    index = 0
    for filename in os.listdir(directory):
        if filename.endswith('.html'):
            dt.loc[index,'filename'] = filename
            index += 1
    return dt


In [117]:
def add_training_folder(file_name):
    """
    Adds the 'Training_Filings/' folder prefix to the given file name.

    Args:
        file_name (str): The name of the file (e.g., '0000004977-20-000054.html').
    
    Returns:
        str: The new file path with 'Training_Filings/' prefixed.
    """

    new_file_path = f"Training_Filings/{file_name}"
    return new_file_path

In [118]:
def convert_to_float(number):
    """
    Converts a string representation of a number into a float.
    Handles numbers with commas and negative numbers enclosed in parentheses.

    Args:
        number (str): The string representation of the number.
    
    Returns:
        float: The converted float value, negative if enclosed in parentheses.
    """
    number = number.replace(',', '')
    if number.startswith('(') or number.endswith(')'):
        return -float(number.strip('()'))
    else:
        return float(number)

In [119]:
def read_html_file(file_path):
    """
    Reads the contents of an HTML file from the specified file path.
    
    Args:
        file_path (str): The path to the HTML file to be read.
        
    Returns:
        str: The content of the HTML file as a string if the file exists.
             Returns "File Not Found" if the file doesn't exist.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            return content
    else:
        return "File Not Found"

In [120]:
dt = list_filenames('Training_Filings')

In [121]:
dt.head()

Unnamed: 0,filename,EPS
0,0000004977-20-000054.html,
1,0000008947-20-000044.html,
2,0000046080-20-000050.html,
3,0000066570-20-000013.html,
4,0000314808-20-000062.html,


In [122]:
file_path = ['Training_Filings/0001564590-20-019726.html','Training_Filings/0000066570-20-000013.html','Training_Filings/0000008947-20-000044.html','Training_Filings/0001564590-20-019431.html','Training_Filings/0001564590-20-019396.html']

In [123]:
list_keywords = [r'Basic earnings \(loss\) per share.*?\$\s*(\d+\.\d+)',r'Basic earnings per share.*?\$\s*(\d+\.\d+)',r'\(Loss\) earnings per share.*?\$\s*(\d+\.\d+)',r'Earnings \(loss\) per common share.*?\$\s*(\d+\.\d+)',r'Earnings per share attributable to.*?\$\s*(\d+\.\d+)',r'Diluted earnings per share.*?\$\s*(\d+\.\d+)',r'Diluted earnings \(loss\) per share.*?\$\s*(\d+\.\d+)']

In [124]:
for k in range (5):

    file = file_path[k]

    html_content = read_html_file(file)

    soup = BeautifulSoup(html_content)

    rows = soup.find_all('table')    
    for i in range (len(list_keywords)):
        
        pattern = list_keywords[i]

        for j in range (len(rows)):

            text = rows[j]
            text = text.get_text(separator='')
            text = text.strip()
            
            match = re.search(pattern, text, re.DOTALL)
            if match:
                start_index = match.start()
                
                end_index = start_index + 250
                snippet = text[start_index:end_index]

                number_pattern = r'\(?-?\d{1,3}(?:,\d{3})*(\.\d+)?\)?'

                match = re.search(number_pattern, snippet)
                
                if match:
                    first_number = match.group(0)
                    number = convert_to_float(first_number)
                    print(f"The first number after '{pattern}' and html '{file}' is: {number}")
                    
                break
            
        if match:
            break

The first number after 'Basic earnings \(loss\) per share.*?\$\s*(\d+\.\d+)' and html 'Training_Filings/0001564590-20-019726.html' is: 0.08
The first number after 'Earnings per share attributable to.*?\$\s*(\d+\.\d+)' and html 'Training_Filings/0000066570-20-000013.html' is: 1.12
The first number after 'Earnings \(loss\) per common share.*?\$\s*(\d+\.\d+)' and html 'Training_Filings/0000008947-20-000044.html' is: -0.41
The first number after 'Basic earnings per share.*?\$\s*(\d+\.\d+)' and html 'Training_Filings/0001564590-20-019431.html' is: 1.08
The first number after '\(Loss\) earnings per share.*?\$\s*(\d+\.\d+)' and html 'Training_Filings/0001564590-20-019396.html' is: -3.15


My code works for the HTML files that are in the output_examples. Now, I am going to try applying my code to all 50 press articles. I want to make the code as scalable as possible. 2.1 seconds for 5 articles is fast enough for me to apply it to 50 articles

### Step II: Scaling the code for Processing 50 Press Articles Efficiently

In [125]:
list_keywords_v2 = [r'Net income per share.*?\$\s*(\d+\.\d+)',r'Basic and diluted net \(loss\) income per share.*?\$\s*(\d+\.\d+)',r'Net income per common share.*?\$\s*(\d+\.\d+)',r'Basic [Ee]arnings \([Ll]oss\) [Pp]er [Ss]hare.*?\$\s*(\d+\.\d+)',r'(?i)LOSS PER SHARE - BASIC AND DILUTED.*?\$\s*\(?(-?\d+\.\d+)\)?',r'(?i)NET EARNINGS PER COMMON SHARE - BASIC.*?\$\s*\(?(-?\d+\.\d+)\)?',r'Basic net income per share.*?\$\s*(\d+\.\d+)'
                    ,r'Basic [Ee]arnings [Pp]er [Ss]hare.*?\$\s*(\d+\.\d+)',r'Net income \(loss\) per common share.*?\$\s*(\d+\.\d+)',r'Basic [Ee]arnings [Pp]er [Cc]ommon [Ss]hare.*?\$\s*(\d+\.\d+)',r'\(Loss\) [Ee]arnings [Pp]er [Ss]hare.*?\$\s*(\d+\.\d+)',r'Earnings \([Ll]oss\) [Pp]er [Cc]ommon [Ss]hare.*?\$\s*(\d+\.\d+)',r'Earnings per share attributable.*?\$\s*(\d+\.\d+)'
                    ,r'[Gg][Aa][Aa][Pp] [Nn]et income per share\s*-\s*[Dd]iluted.*?\$\s*(\d+\.\d+)',r'Diluted [Ee]arnings [Pp]er [Ss]hare.*?\$\s*(\d+\.\d+)',r'Diluted [Ee][Pp][Ss].*?\$\s*(\d+\.\d+)',r'Earnings per ordinary share.*?\$\s*(\d+\.\d+)'
                    ,r'Diluted [Ee]arnings \([Ll]oss\) [Pp]er [Ss]hare.*?\$\s*(\d+\.\d+)',r'Diluted [Ee]arnings [Pp]er [Cc]ommon [Ss]hare.*?\$\s*(\d+\.\d+)',r'Net [Ee]arnings [Pp]er [Ss]hare\s*–\s*[Dd]iluted.*?\$\s*(\d+\.\d+)',r'Net income allocated to shareholders per share.*?\$\s*\(?(-?\d+\.\d+)\)?'
                    ,r'Basic and diluted loss per share.*?\$\s*\(?(-?\d+\.\d+)\)?',r'Net loss attribuable to.*?\$\s*(\d+\.\d+)',r'Loss [Pp]er [Cc]ommon [Ss]hare.*?\(\s*(-?\d+\.\d+)\s*\)',r'Net loss per common share.*?\(\s*(-?\d+\.\d+)\s*\)',r'Net \(loss\) income per diluted common share.*?\$\s*(\d+\.\d+)'
                    ,r'Basic and diluted net loss per share.*?\$\s*\(?(-?\d+\.\d+)\)?',r'Net \(loss\) income.*?\$\s*(\d+\.\d+)',r'Earnings per share.*?\$\s*(\d+\.\d+)',r'Net loss per share.*?\$\s*\(?(-?\d+\.\d+)\)?',r'Net \(Loss\) Income.*?\$\s*(\d+\.\d+)']

In [126]:
print(len(list_keywords_v2))

31


In [127]:
dt = list_filenames('Training_Filings')

for k in range (len(dt)):

    file = add_training_folder(dt.loc[k,'filename'])

    html_content = read_html_file(file)

    soup = BeautifulSoup(html_content)

    rows = soup.find_all('table')    
    for i in range (len(list_keywords_v2)):
        
        pattern = list_keywords_v2[i]

        for j in range (len(rows)):

            text = rows[j]
            text = text.get_text(separator='')
            text = text.strip()
            
            match = re.search(pattern, text, re.DOTALL)
            if match:
                start_index = match.start()
                
                end_index = start_index + 300
                snippet = text[start_index:end_index]

                number_pattern = r'\(?-?\d{1,3}(?:,\d{3})*(\.\d+)?\)?'

                match = re.search(number_pattern, snippet)
                
                if match:
                    first_number = match.group(0)
                    number = convert_to_float(first_number)
                    dt.loc[k,'EPS'] = number
                    print(f"The first number for html '{file}' is: {number}")
                    
                break
            
        if match:
            break



The first number for html 'Training_Filings/0000004977-20-000054.html' is: 0.78
The first number for html 'Training_Filings/0000008947-20-000044.html' is: -0.41
The first number for html 'Training_Filings/0000066570-20-000013.html' is: 1.12
The first number for html 'Training_Filings/0000314808-20-000062.html' is: -15.19
The first number for html 'Training_Filings/0000706129-20-000012.html' is: 0.26
The first number for html 'Training_Filings/0000846617-20-000024.html' is: 0.47
The first number for html 'Training_Filings/0000874766-20-000033.html' is: 1.0
The first number for html 'Training_Filings/0000875320-20-000014.html' is: 2.29
The first number for html 'Training_Filings/0000892537-20-000010.html' is: 0.71
The first number for html 'Training_Filings/0000895419-20-000042.html' is: -0.57
The first number for html 'Training_Filings/0000939057-20-000186.html' is: 0.61
The first number for html 'Training_Filings/0000950103-20-008424.html' is: 0.24
The first number for html 'Training_F

In [128]:
dt.isna().sum()

filename    0
EPS         2
dtype: int64

In [129]:
dt.to_csv("output_eps_data.csv")

### Step 3: Simplifying and Generalizing Regular Expressions for Financial Data Extraction

In [135]:
list_keywords_v3 = [

    r'(?i)(?:basic|diluted)?\s*(?:net\s*)?(?:income|loss|earnings)\s*(?:\(loss\))?\s*per\s*(?:common\s*)?share.*?\$\s*\(?(-?\d+\.\d+)\)?',
    
    r'(?i)basic and diluted\s*(?:net\s*)?(?:income|loss) per share.*?\$\s*\(?(-?\d+\.\d+)\)?',
    
    r'(?i)(?:diluted)?\s*[Ee][Pp][Ss].*?\$\s*\(?(-?\d+\.\d+)\)?',
    
    r'(?i)(?:earnings|loss)\s*per\s*(?:ordinary|common)?\s*share.*?\$\s*\(?(-?\d+\.\d+)\)?',
    
    r'(?i)net (?:income|loss) (?:attributable to shareholders|allocated to).*?\$\s*\(?(-?\d+\.\d+)\)?'
]


In [131]:
len(list_keywords_v3)

13

In [136]:
dt = list_filenames('Training_Filings')

for k in range (len(dt)):

    file = add_training_folder(dt.loc[k,'filename'])

    html_content = read_html_file(file)

    soup = BeautifulSoup(html_content)

    rows = soup.find_all('table')    
    for i in range (len(list_keywords_v3)):
        
        pattern = list_keywords_v3[i]

        for j in range (len(rows)):

            text = rows[j]
            text = text.get_text(separator='')
            text = text.strip()
            
            match = re.search(pattern, text, re.DOTALL)
            if match:
                start_index = match.start()
                
                end_index = start_index + 300
                snippet = text[start_index:end_index]

                number_pattern = r'\(?-?\d{1,3}(?:,\d{3})*(\.\d+)?\)?'

                match = re.search(number_pattern, snippet)
                
                if match:
                    first_number = match.group(0)
                    number = convert_to_float(first_number)
                    dt.loc[k,'EPS'] = number
                    print(f"The first number for html '{file}' is: {number}")
                    
                break
            
        if match:
            break



The first number for html 'Training_Filings/0000004977-20-000054.html' is: 0.78
The first number for html 'Training_Filings/0000008947-20-000044.html' is: -0.41
The first number for html 'Training_Filings/0000046080-20-000050.html' is: 29.0
The first number for html 'Training_Filings/0000066570-20-000013.html' is: 1.12
The first number for html 'Training_Filings/0000314808-20-000062.html' is: -15.19
The first number for html 'Training_Filings/0000706129-20-000012.html' is: 31.0
The first number for html 'Training_Filings/0000846617-20-000024.html' is: 31.0
The first number for html 'Training_Filings/0000874766-20-000033.html' is: 1.0
The first number for html 'Training_Filings/0000875320-20-000014.html' is: 2.29
The first number for html 'Training_Filings/0000892537-20-000010.html' is: 0.71
The first number for html 'Training_Filings/0000895419-20-000042.html' is: -0.57
The first number for html 'Training_Filings/0000939057-20-000186.html' is: 0.61
The first number for html 'Training_F

In [137]:
print(dt.isna().sum())

filename    0
EPS         0
dtype: int64


In [138]:
dt.to_csv("output_eps_data_v3.csv")