<a href="https://colab.research.google.com/github/KarthikAlagarsamy/FinacPlus/blob/main/Karthik_FinacPlus_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Table Classification from Financial Statements**

Upload the [Finacdata.zip](https://drive.google.com/file/d/1bR10UNwYTvVcsgJr-ZLgfSjwboJBNoaz/view?usp=sharing) file to the Colab session before running this notebook.

In [None]:
# Import necessary libraries

import os                                                               # For interacting with operating system
import re                                                               # For regular expressions
from io import StringIO                                                 # For passing HTML markup to pd.read_html as a file-like object

import nltk                                                             # For natural language processing
import pandas as pd                                                     # For data manipulation and analysis
from bs4 import BeautifulSoup                                           # For parsing HTML documents
from nltk.corpus import stopwords                                       # For stop words
from nltk.stem import WordNetLemmatizer                                 # For word lemmatization
from sklearn.ensemble import RandomForestClassifier                     # For Random Forest model
from sklearn.feature_extraction.text import TfidfVectorizer             # For text vectorization
from sklearn.metrics import accuracy_score, classification_report       # For model evaluation metrics
from sklearn.model_selection import train_test_split, cross_val_score   # For splitting data and cross-validation

# Fetch the NLTK corpora used during text cleaning
for nltk_package in ('stopwords', 'wordnet'):
    nltk.download(nltk_package)

# English stop words kept in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Lemmatizer shared by every cleaning step in this notebook
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Unzip dataset (Finacdata.zip)
!unzip Finacdata.zip -d /content/Finac

Archive:  Finacdata.zip
   creating: /content/Finac/data/Balance Sheets/
  inflating: /content/Finac/data/Balance Sheets/18320959_3.html  
  inflating: /content/Finac/data/Balance Sheets/18391125_2.html  
  inflating: /content/Finac/data/Balance Sheets/18442877_5.html  
  inflating: /content/Finac/data/Balance Sheets/18445487_2.html  
  inflating: /content/Finac/data/Balance Sheets/18445494_3.html  
  inflating: /content/Finac/data/Balance Sheets/18448274_5.html  
  inflating: /content/Finac/data/Balance Sheets/18448275_3.html  
  inflating: /content/Finac/data/Balance Sheets/18448275_9.html  
  inflating: /content/Finac/data/Balance Sheets/18456477_2.html  
  inflating: /content/Finac/data/Balance Sheets/18456478_2.html  
  inflating: /content/Finac/data/Balance Sheets/18456478_4.html  
  inflating: /content/Finac/data/Balance Sheets/18460640_10.html  
  inflating: /content/Finac/data/Balance Sheets/18460640_5.html  
  inflating: /content/Finac/data/Balance Sheets/18460658_19.html  
 

In [None]:
# Folder that holds the extracted dataset (local Colab filesystem)
dataset_path = '/content/Finac/data'

# One sub-folder per category; the folder name is also used as the class label
categories = [
    'Income Statement',
    'Balance Sheets',
    'Cash Flow',
    'Notes',
    'Others',
]

In [None]:
# Build the corpus: one cleaned text document and one label per <table> element.
# Both lists are rebuilt from scratch, so re-running this cell never appends duplicates.
data = []
labels = []


# Load and preprocess data from each category
for category in categories:
    folder_path = os.path.join(dataset_path, category)            # Path to category folder

    # os.listdir() returns entries in arbitrary order; sorting keeps the corpus order
    # (and therefore the seeded train/test split further down) identical on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.html'):                        # Process only HTML files
            continue

        file_path = os.path.join(folder_path, filename)           # Full path of the HTML file

        # BeautifulSoup reads the whole file while parsing, so the handle can be
        # released before the tables are processed.
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')

        tables = soup.find_all('table')                           # Every <table> in the document

        for table in tables:
            # Wrap the markup in StringIO: passing literal HTML to read_html is
            # deprecated since pandas 2.1.
            table_df = pd.read_html(StringIO(str(table)))[0]
            # Flatten all cells into one string; missing cells show up as the token "nan"
            text = ' '.join(table_df.astype(str).values.flatten())

            # Clean the text: drop digits and punctuation, then lowercase
            text = re.sub(r'\d+|[^\w\s]', '', text)
            text = text.lower()

            # Remove stop words and lemmatize what is left
            words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
            cleaned_text = ' '.join(words)

            data.append(cleaned_text)                             # Document text
            labels.append(category)                               # Matching category label

In [None]:
# Put the cleaned documents and their categories side by side for inspection
df = pd.DataFrame(dict(Text=data, Label=labels))
df

Unnamed: 0,Text,Label
0,nan nan quarter ended quarter ended quarter en...,Income Statement
1,si nan year ended year ended particular nan au...,Income Statement
2,india limited regd office electronics city hos...,Income Statement
3,nan nan consolidatec consolidatec consolidatec...,Income Statement
4,nan nan nan nan standalone nan nan particular ...,Income Statement
...,...,...
2520,consolidated financial result edelweiss financ...,Others
2521,standalone company result standalone company r...,Others
2522,manager listing department national stock exch...,Others
2523,nan quarter bided quarter bided quarter bided ...,Others


In [None]:
# Vectorize the cleaned text data using TF-IDF.
# The vectorizer is fitted on the complete `data` list, i.e. before the train/test
# split that follows, so its vocabulary and IDF weights also reflect documents that
# later land in the test set.
# NOTE(review): fitting on the training portion only (e.g. inside a sklearn Pipeline)
# would avoid this leakage — confirm whether the reported scores need to be leakage-free.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)

In [None]:
# Split data into training (80%) and test (20%) sets.
# The categories are heavily imbalanced, so stratify on the labels to give every
# category the same share in both parts; a plain random split can leave the
# smallest category with only a handful of test samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

In [None]:
# Random Forest with 100 trees; fixed seed for repeatable results, all CPU cores used
classification = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)
classification

In [None]:
# Fit on the training split, then predict the held-out test split
# (fit() returns the fitted estimator, so the two calls can be chained)
y_pred = classification.fit(X_train, y_train).predict(X_test)

In [None]:
# Hold-out evaluation: overall accuracy followed by the per-category report
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_true=y_test, y_pred=y_pred))

# 5-fold cross-validation over the full vectorized corpus
cv_scores = cross_val_score(estimator=classification, X=X, y=labels, cv=5)
print(f'Cross-validation accuracy: {cv_scores.mean()} ± {cv_scores.std()}')

Accuracy: 0.9445544554455445
                  precision    recall  f1-score   support

  Balance Sheets       0.98      1.00      0.99        58
       Cash Flow       1.00      0.88      0.93         8
Income Statement       0.98      0.91      0.95        57
           Notes       0.92      0.91      0.91       123
          Others       0.94      0.96      0.95       259

        accuracy                           0.94       505
       macro avg       0.96      0.93      0.95       505
    weighted avg       0.94      0.94      0.94       505

Cross-validation accuracy: 0.9358415841584158 ± 0.002963689019226845


# *Classify new financial statements*

In [None]:
# List of financial statement HTML files for prediction
# NOTE(review): both paths lie inside category folders that were loaded for training,
# so this is a smoke test of the pipeline rather than an evaluation on unseen data.
check_files = [
    '/content/Finac/data/Income Statement/18448274_3.html',
    '/content/Finac/data/Notes/18599651_table_124.html',
]

# One entry per extracted table. `check_sources` records which file each table came
# from, so predictions stay matched to the right file even when a file contains
# several tables or none at all.
check_data = []
check_sources = []

for file_path in check_files:                                 # Paths in check_files are used as given
    if not file_path.endswith('.html'):                       # Process only HTML files
        continue

    # BeautifulSoup reads the whole file while parsing, so the handle can be
    # released before the tables are processed.
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    tables = soup.find_all('table')                           # Every <table> in the document

    for table in tables:
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated since pandas 2.1. A dedicated name is used for the per-table
        # frame so the corpus DataFrame `df` is not overwritten.
        table_df = pd.read_html(StringIO(str(table)))[0]
        text = ' '.join(table_df.astype(str).values.flatten())  # Flatten all cells into one string

        # Clean the text with the same steps used for the training corpus
        text = re.sub(r'\d+', '', text)                       # Remove digits
        text = re.sub(r'[^\w\s]', '', text)                   # Remove punctuation
        text = text.lower()                                   # Convert text to lowercase

        # Remove stop words and lemmatize what is left
        words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
        cleaned_text = ' '.join(words)

        check_data.append(cleaned_text)
        check_sources.append(file_path)

# Vectorize the new cleaned text data using the existing (already fitted) vectorizer
encoded_check_data = vectorizer.transform(check_data)

# Predict the category of every extracted table using the trained Random Forest classifier
predictions = classification.predict(encoded_check_data)

# Print one line per table, labelled with the file the table was read from
for source_file, prediction in zip(check_sources, predictions):
    print(f'The check file named " {source_file} " is classified as: {prediction}')

The check file named " /content/Finac/data/Income Statement/18448274_3.html " is classified as: Income Statement
The check file named " /content/Finac/data/Notes/18599651_table_124.html " is classified as: Notes
