In [28]:
import PyPDF2
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


def extract_transactions_from_pdf(pdf_file_path):
    transactions = []
    
    with open(pdf_file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text = page.extract_text()
            
            # Define patterns for extracting different fields
            date_pattern = r'[A-Z][a-z]{2} \d{1,2}, \d{4}' # Match date in format "Apr 28, 2024"
            type_pattern = r'(DEBIT|CREDIT)'
            time_pattern = r'(\d{2}:\d{2} (?:AM|PM))' 
            amount_pattern = r'₹\d+(?:,\d+)*\.?\d+'
            description_pattern = r'(?:AM|PM)\n([\s\S]*?)(?=\nTransaction ID)'
            
            # Extract data using regex patterns
            dates = re.findall(date_pattern, text)
            types = re.findall(type_pattern, text)
            times = re.findall(time_pattern, text)
            amounts = re.findall(amount_pattern, text)
            descriptions = re.findall(description_pattern, text)
            
            cleaned_descriptions = []
            for transaction in descriptions:
                # Split the string into parts based on '\n' separator
                parts = transaction.split('\n')
                details = parts[2].strip()
                # Append the cleaned transaction to the list
                cleaned_descriptions.append(details)
            
            # Combine extracted data into a list of dictionaries
            for date, typ, time, amount, description in zip(dates, types, times, amounts, cleaned_descriptions):
                transaction_data = {
                    "Date": date,
                    "Type": typ,
                    "Time": time,
                    "Amount": amount,
                    "Description": description
                }
                transactions.append(transaction_data)
    
    return transactions

# Path to the PDF file
pdf_file_path = "./pavan.pdf"

# Extract transactions from the PDF
transactions = extract_transactions_from_pdf(pdf_file_path)

# Convert transactions to a DataFrame
df = pd.DataFrame(transactions)

# Print the DataFrame
print(df)

             Date    Type      Time  Amount  \
0    May 01, 2024   DEBIT  01:25 PM     ₹20   
1    May 01, 2024   DEBIT  10:52 AM     ₹30   
2    Apr 30, 2024   DEBIT  05:14 PM     ₹40   
3    Apr 30, 2024   DEBIT  01:27 PM     ₹80   
4    Apr 30, 2024   DEBIT  01:14 PM     ₹20   
..            ...     ...       ...     ...   
651  Feb 01, 2024   DEBIT  05:43 PM     ₹35   
652  Feb 01, 2024  CREDIT  02:22 PM  ₹1,000   
653  Feb 01, 2024  CREDIT  02:19 PM     ₹49   
654  Feb 01, 2024   DEBIT  10:35 AM     ₹15   
655  Feb 01, 2024   DEBIT  10:21 AM     ₹20   

                                       Description  
0                                 Paid to Saif Ali  
1                         Paid to Ss general store  
2                             Paid to Deepak Kumar  
3    Paid to NASIR ENTERPRISE FRUIT AND VEGETABLES  
4                      Paid to SURJIT KUMAR SANDHU  
..                                             ...  
651                       Paid to Ss general store  
652        

In [24]:
df.head()

Unnamed: 0,Date,Type,Time,Amount,Description
0,"May 01, 2024",DEBIT,01:25 PM,₹20,Paid to Saif Ali
1,"May 01, 2024",DEBIT,10:52 AM,₹30,Paid to Ss general store
2,"Apr 30, 2024",DEBIT,05:14 PM,₹40,Paid to Deepak Kumar
3,"Apr 30, 2024",DEBIT,01:27 PM,₹80,Paid to NASIR ENTERPRISE FRUIT AND VEGETABLES
4,"Apr 30, 2024",DEBIT,01:14 PM,₹20,Paid to SURJIT KUMAR SANDHU


In [25]:
import pandas as pd

# Assuming df is your DataFrame and 'Date' is the column containing dates
df['Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
df['Amount'] = df['Amount'].str.replace('₹', '').str.replace(',', '').astype(float)

In [13]:
df.describe()

Unnamed: 0,Date,Amount
count,656,656.0
mean,2024-03-18 20:27:04.390243840,663.004299
min,2024-02-01 00:00:00,10.0
25%,2024-02-24 00:00:00,20.0
50%,2024-03-18 00:00:00,70.0
75%,2024-04-14 00:00:00,200.0
max,2024-05-01 00:00:00,40000.0
std,,2781.050797


In [14]:
df.head()

Unnamed: 0,Date,Type,Time,Amount,Description
0,2024-05-01,DEBIT,01:25 PM,20.0,Paid to Saif Ali
1,2024-05-01,DEBIT,10:52 AM,30.0,Paid to Ss general store
2,2024-04-30,DEBIT,05:14 PM,40.0,Paid to Deepak Kumar
3,2024-04-30,DEBIT,01:27 PM,80.0,Paid to NASIR ENTERPRISE FRUIT AND VEGETABLES
4,2024-04-30,DEBIT,01:14 PM,20.0,Paid to SURJIT KUMAR SANDHU


In [3]:
import pdfplumber
import re
import pandas as pd

def extract_transactions_from_pdf(pdf_file_path):
    transactions = []
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # Define patterns for extracting different fields
            date_pattern = r'\w{3}\s\d{1,2},\s\d{4}'  # Match date in format "Apr 28, 2024"
            type_pattern = r'(DEBIT|CREDIT)'
            amount_pattern = r'₹\d+(?:,\d+)*\.\d+'
            payee_pattern = r'(?:AM|PM)\n(.*?)(?=\nTransaction ID)'

            # Extract data using regex patterns
            dates = re.findall(date_pattern, text)
            types = re.findall(type_pattern, text)
            amounts = re.findall(amount_pattern, text)
            payees = re.findall(payee_pattern, text, re.DOTALL)

            # Combine extracted data into a list of dictionaries
            for date, type_, amount, payee in zip(dates, types, amounts, payees):
                transaction_data = {
                    "Date": date,
                    "Type": type_,
                    "Amount": amount,
                    "Payee": payee.strip()
                }
                transactions.append(transaction_data)
    return transactions

# Path to the PDF file
pdf_file_path = "./pavan.pdf"

# Extract transactions from the PDF
transactions = extract_transactions_from_pdf(pdf_file_path)

# Convert transactions to a DataFrame
df = pd.DataFrame(transactions)

# Print the DataFrame
print(df)

Empty DataFrame
Columns: []
Index: []


In [2]:
df.head()

Unnamed: 0,Date,Type,Time,Amount,Description
0,Date Transaction Details Type Amount,May,1,2024,Paid to Saif Ali DEBIT ₹20
1,Paid by XX1214,May,1,2024,Paid to Ss general store DEBIT ₹30
2,Paid by XX1214,Apr,30,2024,Paid to Deepak Kumar DEBIT ₹40
3,Paid by XX1214,Apr,30,2024,Paid to NASIR ENTERPRISE FRUIT AND VEGETABLES ...
4,Paid by XX1214,Apr,30,2024,Paid to SURJIT KUMAR SANDHU DEBIT ₹20
