<a href="https://colab.research.google.com/github/JayaKrishanS/Table-Classification-from-Financial-Statements/blob/main/model_building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [1]:
#importing necessary libraries

import pandas as pd
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
# Fetch the WordNet corpus, which the WordNetLemmatizer used later in this notebook needs
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

**Loading Data**

In [3]:
# Read the extracted table text and its category label from the CSV export

csv_path = '/content/Extracted_words.csv'
dataset = pd.read_csv(csv_path)
dataset

Unnamed: 0,Words,Category
0,As at As at Particulars Audited Audited A ASSE...,Balance Sheets
1,Statement of assets and liabilities Particular...,Balance Sheets
2,No Particulars Standalone Consolidated for the...,Balance Sheets
3,Particulars As atMarch As atMarch Audited Audi...,Balance Sheets
4,i Borrowings ii Other financial liabilities b ...,Balance Sheets
...,...,...
2520,Symbol Typeof security Book Closure bothdays i...,Others
2521,Subsidiaries Subsidiaries Subsidiaries Country...,Others
2522,Year Ended Description Audited Commercial Vehi...,Others
2523,Quarter Ended Year Ended Particulars Mar Mar Y...,Others


**Data Preparation**

In [4]:
# Count the missing entries in each column
dataset.isnull().sum()

Words       22
Category     0
dtype: int64

In [5]:
# Drop every row that has at least one missing value (mutates `dataset` in place)
dataset.dropna(axis='index', how='any', inplace=True)

In [6]:
# Summarise row count, columns and dtypes after the rows with nulls were dropped
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2503 entries, 0 to 2524
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Words     2503 non-null   object
 1   Category  2503 non-null   object
dtypes: object(2)
memory usage: 58.7+ KB


In [7]:
## Encode each category name as an integer class id
category_codes = {
    'Balance Sheets': 0,
    'Cash Flow': 1,
    'Income Statement': 2,
    'Notes': 3,
    'Others': 4,
}
# Rows whose label is not listed above are left untouched
for category_name, category_id in category_codes.items():
    dataset.loc[dataset['Category'] == category_name, 'Category'] = category_id

In [8]:
# Cast the text column to str and the encoded label column to int
for column_name, target_type in (('Words', str), ('Category', int)):
    dataset[column_name] = dataset[column_name].astype(target_type)

In [9]:
# Confirm the column dtypes after the conversion above
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2503 entries, 0 to 2524
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Words     2503 non-null   object
 1   Category  2503 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 58.7+ KB


In [10]:
# Lower-case the text so that the same word in different casing is treated as a
# single token by the later text-processing steps
dataset['Words'] = dataset['Words'].str.lower()

In [11]:
# Keep only words longer than three characters (words of three letters or fewer are dropped)
def _keep_long_words(text):
    """Return `text` without its whitespace-separated words of length <= 3."""
    return ' '.join(word for word in text.split() if len(word) > 3)

dataset['Words'] = dataset['Words'].apply(_keep_long_words)

In [12]:
# Lemmatize every word, i.e. reduce it to its base (dictionary) form

lemmatizer = WordNetLemmatizer()

def _lemmatize_text(text):
    """Lemmatize each whitespace-separated word of `text` and re-join with single spaces."""
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

dataset['Words'] = dataset['Words'].apply(_lemmatize_text)

In [13]:
# Inspect the cleaned text together with the numeric labels
dataset

Unnamed: 0,Words,Category
0,particular audited audited asset current asset...,0
1,statement asset liability particular march mar...,0
2,particular standalone consolidated year ended ...,0
3,particular atmarch atmarch audited audited equ...,0
4,borrowing other financial liability provision ...,0
...,...,...
2520,symbol typeof security book closure bothdays i...,4
2521,subsidiary subsidiary subsidiary country ofinc...,4
2522,year ended description audited commercial vehi...,4
2523,quarter ended year ended particular growth gro...,4


**Feature Engineering and Feature Selection**

In [14]:
# x: cleaned text of each table, y: its integer class id
x, y = dataset['Words'], dataset['Category']

In [15]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, so rare classes may be unevenly
# represented in the test set - consider `stratify=y`.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Shapes of the full, training and test feature series, in that order
tuple(part.shape for part in (x, x_train, x_test))

((2503,), (2002,), (501,))

In [33]:
# TF-IDF vectorizer: English stop words are ignored and a term only has to occur
# in a single document to be kept
feature_extraction = TfidfVectorizer(stop_words='english', min_df=1)

# Learn the vocabulary / IDF weights from the training text only, then apply the
# fitted vectorizer unchanged to the test text
x_train_features = feature_extraction.fit_transform(x_train)
x_test_features = feature_extraction.transform(x_test)

**Model Building**

In [34]:
# Candidate classifiers, all trained and scored on the same TF-IDF features.
# The randomised tree-based estimators are seeded (same seed as the data split)
# so that the accuracy ranking is reproducible from run to run.
classification_models = [
    RandomForestClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    SVC(),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(random_state=42),
    xgb.XGBClassifier()
]

results = []

# The loop variable is intentionally not named `model`: that global name is used
# later for the classifier that gets pickled, and re-running this cell out of
# order must not silently replace it with the last candidate of this list.
for candidate in classification_models:
    model_name = candidate.__class__.__name__
    candidate.fit(x_train_features, y_train)
    y_pred = candidate.predict(x_test_features)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({'Model': model_name, 'Accuracy': accuracy})

# One row per model, best test accuracy first
results_df = pd.DataFrame.from_records(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)
results_df

Unnamed: 0,Model,Accuracy
2,SVC,0.948104
0,RandomForestClassifier,0.94012
5,XGBClassifier,0.938124
1,GradientBoostingClassifier,0.928144
4,DecisionTreeClassifier,0.906188
3,KNeighborsClassifier,0.806387


**Cross Validating Top 2 Models**

In [36]:
# Initialize the Random Forest classifier.
# It is seeded so that, together with the seeded folds below, the
# cross-validation scores are reproducible between runs.
rf = RandomForestClassifier(random_state=42)

# Number of folds for cross-validation
n_folds = 5

# StratifiedKFold keeps the class proportions of y_train in every fold
cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# Performing cross-validation on the training features only (one accuracy per fold)
cv_scores = cross_val_score(rf, x_train_features, y_train, cv=cv, scoring='accuracy')

# Printing cross-validation scores
print("Cross-validation scores:", cv_scores)

# Calculating the mean and standard deviation of the cross-validation scores
mean_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()

print("Mean Cross-validation score:", mean_cv_score)
print("Standard Deviation of Cross-validation score:", std_cv_score)


Cross-validation scores: [0.92518703 0.94264339 0.935      0.95       0.925     ]
Mean Cross-validation score: 0.9355660847880299
Standard Deviation of Cross-validation score: 0.009778686074895715


In [31]:
# Cross-validate the SVM on the training features with stratified, shuffled folds
n_folds = 5
SVM_model = SVC()
cv_SVM = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

# One accuracy value per fold
cv_scores_SVM = cross_val_score(
    SVM_model,
    x_train_features,
    y_train,
    cv=cv_SVM,
    scoring='accuracy',
)

# Summary statistics over the folds
mean_cv_score_SVM = cv_scores_SVM.mean()
std_cv_score_SVM = cv_scores_SVM.std()

# Report the per-fold scores first, then their mean and spread
print("Cross-validation scores:", cv_scores_SVM)
print("Mean Cross-validation score:", mean_cv_score_SVM)
print("Standard Deviation of Cross-validation score:", std_cv_score_SVM)

Cross-validation scores: [0.91770574 0.94014963 0.935      0.945      0.9125    ]
Mean Cross-validation score: 0.9300710723192018
Standard Deviation of Cross-validation score: 0.012730974714387562


In [22]:
# Final model: a Random Forest.
# It is seeded (same seed as the data split) so that retraining on the same data
# yields the same classifier and test metrics instead of a slightly different
# forest on every run.

model = RandomForestClassifier(random_state=42)

In [23]:
# Train the final model on the TF-IDF features of the training split
model.fit(x_train_features, y_train)

In [24]:
# Predict the class id of every sample in the held-out test split
y_pred_Rf = model.predict(x_test_features)

In [25]:
# Fraction of test samples whose predicted class matches the true class
print("Accuracy : ", accuracy_score(y_test, y_pred_Rf))

Accuracy :  0.9441117764471058


In [26]:
# Per-class precision, recall, F1 and support on the test split
print(classification_report(y_test, y_pred_Rf))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97        53
           1       1.00      1.00      1.00         7
           2       0.98      0.97      0.98        61
           3       0.94      0.90      0.92       134
           4       0.93      0.96      0.94       246

    accuracy                           0.94       501
   macro avg       0.96      0.96      0.96       501
weighted avg       0.94      0.94      0.94       501



**Pickling the model for further application**

In [32]:
# Serialise the trained classifier to model.pkl.
# The `with` block closes the file even if pickle.dump raises, so a partially
# written file is never left open.
with open("model.pkl","wb") as file:
    pickle.dump(model,file)

In [28]:
# Serialise the fitted TF-IDF vectorizer to vectorizer.pkl so the same
# vocabulary and weights can be applied at prediction time.
# The `with` block closes *this* file handle; the previous code closed the
# model's handle instead and left vectorizer.pkl open.
with open("vectorizer.pkl","wb") as file2:
    pickle.dump(feature_extraction, file2)