Collecting data from the tables

In [None]:
#Importing libraries
import pandas as pd
import os
import re
from bs4 import BeautifulSoup

In [None]:
# Functions for extracting words from HTML files

def extract_words_from_html(file_path):
    """Extract alphabetic words from a single HTML file, excluding numbers.

    Args:
        file_path: Path to the HTML file. The file is read as UTF-8.

    Returns:
        list[str]: Every standalone run of ASCII letters found in the
        document's text, in document order. Tokens that mix letters with
        digits, underscores or non-ASCII letters (e.g. "Q1") are skipped
        entirely, because the pattern requires a word boundary on both sides.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content and extract all text.
    soup = BeautifulSoup(html_content, 'html.parser')
    # get_text() joins text nodes with '' by default, so adjacent table cells
    # written without whitespace between the tags ("<td>Total</td><td>Assets</td>")
    # would fuse into a single token. A space separator keeps them apart.
    text = soup.get_text(separator=' ')

    # Keep only purely alphabetic tokens.
    words = re.findall(r'\b[a-zA-Z]+\b', text)

    return words

def extract_words_from_directory(directory_path):
    """Extract words from all HTML files in the specified directory.

    Only direct entries of ``directory_path`` whose name ends in ".html" are
    processed; sub-directories are not searched.

    Args:
        directory_path: Directory containing the HTML files.

    Returns:
        tuple[list[str], list[str]]: ``(all_documents, categories)``, aligned
        by index. Each document is the file's extracted words joined by single
        spaces; each category is the name of the directory containing the file.
    """
    all_documents = []
    categories = []

    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the resulting dataset reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".html"):
            continue

        file_path = os.path.join(directory_path, filename)
        words = extract_words_from_html(file_path)

        # One string per document: the words separated by single spaces.
        all_documents.append(' '.join(words))
        # The parent directory name serves as the category label.
        categories.append(os.path.basename(os.path.dirname(file_path)))

    return all_documents, categories

In [None]:
# Root folder of the dataset; change this one line when the data moves.
DATA_DIR = '/content/drive/MyDrive/FinacPlus_Assignment/data/data'

# One sub-folder per category; the folder name becomes the 'Category' label.
CATEGORY_FOLDERS = [
    'Balance Sheets',
    'Cash Flow',
    'Income Statement',
    'Notes',
    'Others',
]

directory_paths = [DATA_DIR + '/' + folder for folder in CATEGORY_FOLDERS]

# One dataframe per category directory.
dfs = []

# Extract the words of every HTML file in each directory into a dataframe.
for directory_path in directory_paths:
    documents, categories = extract_words_from_directory(directory_path)
    df = pd.DataFrame({
        'Words': documents,
        'Category': categories
    })
    dfs.append(df)

# Concatenate all per-category dataframes into a single dataframe with a fresh index.
final_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the combined frame (columns 'Words' and 'Category'); the rendered output is truncated for long frames.
final_df

Unnamed: 0,Words,Category
0,As at As at Particulars Audited Audited A ASSE...,Balance Sheets
1,As at As at Particulars Audited Audited A ASSE...,Balance Sheets
2,As at As at Particulars Audited Audited A ASSE...,Balance Sheets
3,As at As at Particulars Audited Audited A ASSE...,Balance Sheets
4,As at As at Particulars Audited Audited A ASSE...,Balance Sheets
...,...,...
2520,to to Molecule For Country Femarelle Food Supp...,Others
2521,to to Molecule For Country Femarelle Food Supp...,Others
2522,to to Molecule For Country Femarelle Food Supp...,Others
2523,to to Molecule For Country Femarelle Food Supp...,Others


In [None]:
# Write the combined words/category table to a CSV file, omitting the row index.
csv_path = 'Extracted_words.csv'
final_df.to_csv(csv_path, index=False)