In [1]:
!pip install fitz



In [2]:
!pip install frontend



In [3]:
!pip install PyMuPDF




In [4]:
import fitz  # PyMuPDF for reading PDFs
import pandas as pd  # Pandas for handling data
import re  # Regular expression for text parsing

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    document = fitz.open(file_path)
    text = ""
    for page in document:
        text += page.get_text()
    document.close()
    return text

# Function to parse financial data from text
def parse_financial_data(text):
    pattern = re.compile(r'(\d{1,2}\.\d+)\s+([\w\s]+)\s+(\d+,\d+|\d+)\s+(\d+,\d+|\d+)')
    matches = pattern.findall(text)
    data = pd.DataFrame(matches, columns=["Note No.", "Description", "Amount 2021", "Amount 2020"])
    return data

# Function to process a PDF file and save the data to an Excel file
def process_pdf_to_excel(pdf_path, excel_path):
    text = extract_text_from_pdf(pdf_path)
    financial_data = parse_financial_data(text)
    financial_data.to_excel(excel_path, index=False)

# Example usage
pdf_path = '/content/Infosys_2021.pdf'
excel_path = '/content/sample_data/2021.xlsx'
process_pdf_to_excel(pdf_path, excel_path)


In [9]:
# @title Install libraries for setting up for tarning using LLM
# Install transformers and any other necessary libraries
!pip install transformers torch --upgrade




In [5]:
# @title Import Libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Check if GPU is available and set it as the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [6]:
# @title Load a Pre-trained Model and Tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the model to the GPU if available
model.to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [7]:
# @title Load data
import pandas as pd

# Paths to the Excel files
file_paths = [
    '/content/Infosys_Financial_Data_2021.xlsx',
    '/content/Infosys_Financial_Data_2022.xlsx',
    '/content/Infosys_Financial_Data_2023.xlsx',
    '/content/SA_FY23_Annual_Financial_Statement.xlsx'
]

# Load each file into a DataFrame
dataframes = [pd.read_excel(path) for path in file_paths]

# Optionally combine them into a single DataFrame if needed
full_data = pd.concat(dataframes, ignore_index=True)



In [8]:
# @title Preprocess the Data
# Convert 'Amount 2021' to numeric, removing commas
full_data['Amount 2021'] = pd.to_numeric(full_data['Amount 2021'].replace(',', '', regex=True))

# Now create labels
labels = full_data['Amount 2021'].apply(lambda x: 1 if x > 1000 else 0).tolist()  # Binary classification example


In [9]:
# Assume 'Description' is a column with text data and 'Amount 2021' is the feature to predict or classify upon
texts = full_data['Description'].tolist()
labels = full_data['Amount 2021'].apply(lambda x: 1 if x > 1000 else 0).tolist()  # Example binary classification based on amount


In [10]:
# @title Encode the Data
# Assume tokenizer is already loaded
encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
labels = torch.tensor(labels).to(device)


In [11]:
# @title Create Dataset and DataLoader
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Create a dataset
dataset = TensorDataset(input_ids, attention_mask, labels)

# Create a dataloader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [12]:
# @title Define the Training Loop
from torch.optim import AdamW
from torch.utils.data import DataLoader, RandomSampler

# Assuming the model and tokenizer have been loaded and set up as described earlier

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
model.train()  # Set the model to training mode
for epoch in range(4):  # Let's train for 4 epochs
    total_loss = 0
    for batch in dataloader:
        b_input_ids, b_attention_mask, b_labels = batch

        # Clear any previously calculated gradients before performing a backward pass
        model.zero_grad()

        # Perform a forward pass. The model outputs the loss and logits when given labels
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Perform a backward pass to calculate gradients
        loss.backward()

        # Update parameters
        optimizer.step()

    # After the completion of each epoch, print the average loss
    print(f'Epoch {epoch + 1} | Average Loss: {total_loss / len(dataloader)}')


Epoch 1 | Average Loss: 0.6321606899009031
Epoch 2 | Average Loss: 0.5715881586074829
Epoch 3 | Average Loss: 0.5523177262614755
Epoch 4 | Average Loss: 0.539220021051519


In [13]:
# @title Evaluate the Model
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (assuming 'full_data' is your complete dataset)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    full_data['Description'],
    full_data['Amount 2021'].apply(lambda x: 1 if x > 1000 else 0),
    test_size=0.1  # 10% of the data for testing
)

# Tokenize the test data
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=128)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.tolist())
)
test_loader = DataLoader(test_dataset, batch_size=8)


In [14]:
import numpy as np

In [15]:
# Evaluate the model
model.eval()  # Set the model to evaluation mode

total_eval_accuracy = 0
total_eval_loss = 0

for batch in test_loader:
    b_input_ids, b_attention_mask, b_labels = batch
    b_input_ids = b_input_ids.to(device)
    b_attention_mask = b_attention_mask.to(device)
    b_labels = b_labels.to(device)

    with torch.no_grad():  # No need to compute gradients in the evaluation phase
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)

    # Move logits and labels to CPU
    logits = outputs.logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculate the loss and accuracy
    loss = outputs.loss
    total_eval_loss += loss.item()

    # Calculate the accuracy for this batch of test sentences, and accumulate it over all batches.
    preds = np.argmax(logits, axis=1)
    total_eval_accuracy += np.sum(preds == label_ids)

# Report the final accuracy for this test run.
avg_test_accuracy = total_eval_accuracy / len(test_loader.dataset)
print("  Accuracy: {0:.2f}".format(avg_test_accuracy))
print("  Test Loss: {0:.2f}".format(total_eval_loss / len(test_loader)))


  Accuracy: 0.64
  Test Loss: 0.69


In [16]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# @title Saving the Model
model_path = '/content/drive/My Drive/My Models/bert_model'
tokenizer_path = '/content/drive/My Drive/My Models/bert_model'

model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)


('/content/drive/My Drive/My Models/bert_model/tokenizer_config.json',
 '/content/drive/My Drive/My Models/bert_model/special_tokens_map.json',
 '/content/drive/My Drive/My Models/bert_model/vocab.txt',
 '/content/drive/My Drive/My Models/bert_model/added_tokens.json',
 '/content/drive/My Drive/My Models/bert_model/tokenizer.json')

In [37]:
!pip install transformers torch --upgrade


Collecting transformers
  Downloading transformers-4.43.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting torch
  Downloading torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12

In [18]:
import torch

# Explicitly save the PyTorch model state dictionary
torch.save(model.state_dict(), './model/pytorch_model.bin')

# Check if the file is saved correctly
!ls -l ./model


total 856388
-rw-r--r-- 1 root root       727 Aug  6 18:25 config.json
-rw-r--r-- 1 root root 437958648 Aug  6 18:25 model.safetensors
-rw-r--r-- 1 root root 438015150 Aug  6 18:36 pytorch_model.bin
-rw-r--r-- 1 root root       125 Aug  6 18:25 special_tokens_map.json
-rw-r--r-- 1 root root      1190 Aug  6 18:25 tokenizer_config.json
-rw-r--r-- 1 root root    711649 Aug  6 18:25 tokenizer.json
-rw-r--r-- 1 root root    231508 Aug  6 18:25 vocab.txt


In [19]:
# @title Download in local machine
from google.colab import files
files.download('./model/config.json')
files.download('./model/pytorch_model.bin')
files.download('./model/tokenizer_config.json')
files.download('./model/vocab.txt')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
import shutil
import os

model_dir = './model'
drive_path = '/content/drive/My Drive/My Models/bert_model'

# Copy files from the local Colab environment to Google Drive, overwriting if necessary
shutil.copytree(model_dir, drive_path, dirs_exist_ok=True)


'/content/drive/My Drive/My Models/bert_model'