### Importing all the required libraries

In [1]:
import os
from bs4 import BeautifulSoup
import pandas as pd
import string
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, accuracy_score

### Function to extract text contents from the given HTML files

In [2]:
# Function to extract text from HTML file
def extract_text_from_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()
        soup = BeautifulSoup(html_content, 'html.parser')
        tables = soup.find_all('table')
        table_data = []
        for table in tables:
            rows = table.find_all('tr')
            for row in rows:
                cols = row.find_all(['td', 'th'])
                cols = [ele.text.strip() for ele in cols]
                table_data.append(cols)
        flattened_list = [item for sublist in table_data for item in sublist]
        return ' '.join(flattened_list)

### Function to code the categories with numbers

In [3]:
#function for coding the categories
def category_coding(category):
    return 0 if category == "Balance Sheets" else (1 if category == "Cash Flow" else (2 if category=="Income Statement" else(3 if category=="Notes" else 4)))

### Extracting the texts from the files

In [4]:
# Root directory path where all folders are located
root_directory = '/Users/milan/Desktop/data'

# Initialize an empty list to store dataframes
dfs = []

# Iterate over each folder
for folder_name in os.listdir(root_directory):
    folder_path = os.path.join(root_directory, folder_name)
    
    # Check if the item is a directory
    if os.path.isdir(folder_path):
        # Iterate over each HTML file in the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.html'):
                file_path = os.path.join(folder_path, file_name)
                # Extract text from HTML file
                extracted_text = extract_text_from_html(file_path)
                # Create a dataframe for the current HTML file
                df = pd.DataFrame([[extracted_text, folder_name]], columns=["extracted_text", "category"])
                dfs.append(df)

# Concatenate all dataframes into a single dataframe
final_df = pd.concat(dfs, ignore_index=True)
final_df['category'] = final_df['category'].apply(category_coding)

### Preprocessing the data

In [5]:
def remove_noise(text):
    # Defining the special characters to remove
    special_chars = '!@#$%^&*()_+<>?/\|.,:;"{}[]`~'

    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation + special_chars))

    # Removing newline characters
    text = text.replace('\n', '')

    return text
final_df['extracted_text'] = final_df['extracted_text'].apply(remove_noise)

### Lowering all the letters in the extracted text

In [6]:
# Applying the wordtokenize instance to convert the sentence into tokens
final_df['tokens'] = final_df['extracted_text'].apply(lambda x: word_tokenize(x))

In [7]:
# Initialising the WordNetLemmatizer instance to convert all the tokens into it's original form
lemmatizer = WordNetLemmatizer()

#lemmatizing the tokens and joining the tokens into sentences
final_df['lemmatized_text'] = final_df['tokens'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x]))

In [8]:
# Final dataframe before splitting into training and testing data.
# Here we are not lowering and removing the stopwords as TfidfVectorizer will handle all these things
final_df

Unnamed: 0,extracted_text,category,tokens,lemmatized_text
0,Cash flows from financing activities Decrea...,1,"[Cash, flows, from, financing, activities, Dec...",Cash flow from financing activity Decrease Inc...
1,Year ended31st March 2018 Year ended31st Marc...,1,"[Year, ended31st, March, 2018, Year, ended31st...",Year ended31st March 2018 Year ended31st March...
2,PARTICULARS As on 31032017 As on 31032016 Net ...,1,"[PARTICULARS, As, on, 31032017, As, on, 310320...",PARTICULARS As on 31032017 As on 31032016 Net ...
3,DESCRIPTION Year Ended March 31 2017 Year Ende...,1,"[DESCRIPTION, Year, Ended, March, 31, 2017, Ye...",DESCRIPTION Year Ended March 31 2017 Year Ende...
4,INR in Crores Particulars For the year ended ...,1,"[INR, in, Crores, Particulars, For, the, year,...",INR in Crores Particulars For the year ended F...
...,...,...,...,...
2520,Outstanding Balances Million Name of the Re...,3,"[Outstanding, Balances, Million, Name, of, the...",Outstanding Balances Million Name of the Relat...
2521,Particulars December 31 2017 December 31 2016 ...,3,"[Particulars, December, 31, 2017, December, 31...",Particulars December 31 2017 December 31 2016 ...
2522,13 Cash and cash equivalents Rupees in Millio...,3,"[13, Cash, and, cash, equivalents, Rupees, in,...",13 Cash and cash equivalent Rupees in Million ...
2523,31 December 2017INR in Lacs 31 December 2016I...,3,"[31, December, 2017INR, in, Lacs, 31, December...",31 December 2017INR in Lacs 31 December 2016IN...


### Model is not balanced well and it's a text data so will do oversampling after making them vectors

In [9]:
# Splitting the 70% data for training and 30% data for testing
X_train, X_test, y_train, y_test = train_test_split(final_df['lemmatized_text'], final_df['category'], test_size=0.3, random_state=42)

In [10]:
# Initialising all the classification models
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Naive Bayes', MultinomialNB()),
    ('SVM', SVC(probability=True)),
    ("Decision Tree", DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier())
]

for model_name, model in models:
    pipeline = make_pipeline(TfidfVectorizer(), model)
    # Perform cross-validation
    scores = cross_val_score(pipeline, X_train, y_train, cv=5)
    print(f'{model_name} Cross-validation accuracy: {scores.mean():.4f}')

Logistic Regression Cross-validation accuracy: 0.9276
Naive Bayes Cross-validation accuracy: 0.8172
SVM Cross-validation accuracy: 0.9349
Decision Tree Cross-validation accuracy: 0.8432
Random Forest Cross-validation accuracy: 0.9236
Gradient Boosting Cross-validation accuracy: 0.9134


In [11]:
best_model_pipeline = make_pipeline(TfidfVectorizer(), SVC(probability=True))
best_model_pipeline.fit(X_train, y_train)
test_accuracy = best_model_pipeline.score(X_test, y_test)
print(f'SVM Test accuracy: {test_accuracy:.4f}')

SVM Test accuracy: 0.9406


In [12]:
# Defining a custom scorer for roc_auc_ovr
scorer_roc_auc_ovr = make_scorer(roc_auc_score, needs_proba=True, multi_class='ovr')

# Perform cross-validation with roc_auc_ovr scoring
for model_name, model in models:
    pipeline = make_pipeline(TfidfVectorizer(), model)
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=scorer_roc_auc_ovr)
    print(f'{model_name} Cross-validation AUROC score: {scores.mean():.4f}')

Logistic Regression Cross-validation AUROC score: 0.9932
Naive Bayes Cross-validation AUROC score: 0.9450
SVM Cross-validation AUROC score: 0.9944
Decision Tree Cross-validation AUROC score: 0.8679
Random Forest Cross-validation AUROC score: 0.9930
Gradient Boosting Cross-validation AUROC score: 0.9904


In [13]:
# Defining the number of folds
n_splits = 10

# Initializing StratifiedKFold
k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Defining the models
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Naive Bayes', MultinomialNB()),
    ('SVM', SVC(probability=True)),
    ("Decision Tree", DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier())
]

# Initializing TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Looping through each model
for model_name, model in models:
    print(f'Evaluating {model_name}')
    
    # Initializing a list to store accuracy scores for each fold
    fold_accuracies = []
    
    # Looping through each fold
    for fold, (train_index, test_index) in enumerate(k_fold.split(X_train, y_train)):
        # Split the data into train and test sets
        X_fold_train, X_fold_test = X_train.iloc[train_index], X_train.iloc[test_index]
        y_fold_train, y_fold_test = y_train.iloc[train_index], y_train.iloc[test_index]
        
        # Create a pipeline with TF-IDF Vectorizer and the model
        pipeline = make_pipeline(tfidf_vectorizer, model)
        
        # Fit the model on the training data
        pipeline.fit(X_fold_train, y_fold_train)
        
        # Predict on the test data
        y_pred = pipeline.predict(X_fold_test)
        
        # Calculate accuracy for this fold and store it
        accuracy = accuracy_score(y_fold_test, y_pred)
        fold_accuracies.append(accuracy)
    
    # Calculating and printing the average accuracy across all folds
    average_accuracy = sum(fold_accuracies) / n_splits * 100
    print(f'Average {model_name} Accuracy Percentage: {average_accuracy:.4f} %')

Evaluating Logistic Regression
Average Logistic Regression Accuracy Percentage: 93.0367 %
Evaluating Naive Bayes
Average Naive Bayes Accuracy Percentage: 82.1723 %
Evaluating SVM
Average SVM Accuracy Percentage: 94.1121 %
Evaluating Decision Tree
Average Decision Tree Accuracy Percentage: 85.6815 %
Evaluating Random Forest
Average Random Forest Accuracy Percentage: 93.1481 %
Evaluating Gradient Boosting
Average Gradient Boosting Accuracy Percentage: 92.0740 %


## By conclusion, for our use cause SVM will be the best model to select, because it's giving the highest of accuracy compared to all other models

### To test the model, please give the file path in the below given code

In [14]:
test_file = '/Users/milan/Desktop/data/Others/18582961_8.html'
test_extracted_from_file = extract_text_from_html(test_file)
noise_removed_text = remove_noise(test_extracted_from_file)
tokenised_text = word_tokenize(noise_removed_text)
lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in tokenised_text])
predicted_value = best_model_pipeline.predict([lemmatized_text])
print("Given document is classfied as:","Balance Sheets" if predicted_value == 0 else ("Cash Flow" if predicted_value == 1 else ("Income Statement" if predicted_value == 2 else("Notes" if predicted_value == 3 else "Others"))))

Given document is classfied as: Others
