# Extract info from PDFs

## Import libraries

In [5]:
import os
import re
import pypdf
import pandas as pd

## View PDF text

### View actual PDF

In [6]:
# Function to print the content of a PDF document
def print_pdf_content(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        for page_num in range(len(reader.pages)):
            print(f"Page {page_num + 1}:\n")
            page = reader.pages[page_num]
            page_text = page.extract_text()
            print(page_text)
            print("\n" + "#" * 100 + "\n")  # Print a separator between pages


# Now call the function to print the content of the document
print_pdf_content(r'C:\Users\Isaac\Documents\1 - Excel Python Environment\PDFs CC extracts\PDFs\creditcard.pdf')

Page 1:

Credit Card information Payment due on: 27 JUL 2020
Jim Smith
123 Cherry Lane st.
956004 Marietta.
Atlanta / GA
 DateDescription Bonus Amount
24 JUN 2019 07:12 Shell Marietta - Marie's $0.99 $64.10
 25 JUN 2019 17:55 Downson's Market - Atlanta $0.59 $87.22
 25 JUN 2019 09:22 KMART - GA $0.80 $106.90
 2 JUL 2019 18:12 Florida Shadow Hotel $1.25 $90.00
 3 JUL 2019 21:12 Cafe de'Orient - FL $0.02 $12.20
 3 JUL 2019 12:54 Yuen Shirts $1.01 $59.10
 10 JUL 2019 08:10 Delta Airlines $2.00 $120.00
 11 JUL 2019 19:01 Publix - Marietta $0.40 $82.99
 13 JUL 2019 08:11 Jiffy Lube $0.10 $40.20
 15 JUL 2019 07:40 Starbucks Roswell Road $0.04 $8.10
 TOTAL AMOUNT $7.20 $670.81Customer Number 12302139
Card Number 112321***3
Card Limit 1500
Pay Date 27 JUL 2020
Amount $670.81

####################################################################################################



## Functions definition

### process_pdf

In [7]:
import pypdf

# Step 1: Initialize and Read PDF
def read_pdf(file_path):
    print(f"\nProcessing file: {file_path}\n")
    text_content = []
    with open(file_path, 'rb') as file:
        pdf_reader = pypdf.PdfReader(file)  
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text_content.append(page.extract_text() if page.extract_text() else '')  # Handle None return
    return text_content

# Step 2: Concatenate and Preprocess Text
def preprocess_text(text_pages):
    full_text = ' '.join(text_pages)
    # Normalize text (e.g., remove excessive whitespace, lowercase)
    full_text = ' '.join(full_text.split()).lower()
    return full_text

# Main function to process the PDF
def process_pdf(file_path):
    print(f"\n==== Processing PDF: {file_path} ====\n")
    text_content = read_pdf(file_path)
    processed_text = preprocess_text(text_content)
    return processed_text

### extract_transactions

In [8]:
import re
import pandas as pd
from datetime import datetime

def extract_transactions(text):
    # Updated pattern to capture new date format and two amounts (cash back and charge amount)
    pattern = r'(\d{2} [a-z]{3} \d{4}) (\d{2}:\d{2}) (.*?) \$([\d\.]+) \$([\d\.]+)'
    transactions = re.findall(pattern, text.replace('\n', ' '), flags=re.DOTALL)
    
    transactions_data = []
    for transaction in transactions:
        date, time, description, cash_back, charge_amount = transaction
        # Convert date from "dd MMM yyyy" format to "dd/mm/yyyy"
        date = datetime.strptime(date, '%d %b %Y').strftime('%d/%m/%Y')
        
        # Append two separate records for cash back and charge amount
        if float(cash_back) > 0:
            transactions_data.append({
                'Date': date,
                'Description': description.strip(),
                'Amount': f'-${cash_back}',
                'Type': 'Cash Back'
            })
        if float(charge_amount) > 0:
            transactions_data.append({
                'Date': date,
                'Description': description.strip(),
                'Amount': f'${charge_amount}',
                'Type': 'Charge'
            })
    
    transactions_df = pd.DataFrame(transactions_data)
    
    # Convert 'Amount' to numeric after removing '$' and handling negative for cash back
    if not transactions_df.empty:
        transactions_df['Amount'] = pd.to_numeric(transactions_df['Amount'].replace(r'[\$,]', '', regex=True), errors='coerce')
    
    return transactions_df


## testing Process PDF & search for patterns

In [9]:
processed_text = process_pdf(r'C:\Users\Isaac\Documents\1 - Excel Python Environment\PDFs CC extracts\PDFs\creditcard.pdf')

import textwrap

wrapped_content = textwrap.fill(processed_text, width=100)  # Wrap content to 50 characters
print(f"Content (snippet):\n{wrapped_content}\n{'=' * 80}\n")


==== Processing PDF: C:\Users\Isaac\Documents\1 - Excel Python Environment\PDFs CC extracts\PDFs\creditcard.pdf ====


Processing file: C:\Users\Isaac\Documents\1 - Excel Python Environment\PDFs CC extracts\PDFs\creditcard.pdf

Content (snippet):
credit card information payment due on: 27 jul 2020 jim smith 123 cherry lane st. 956004 marietta.
atlanta / ga datedescription bonus amount 24 jun 2019 07:12 shell marietta - marie's $0.99 $64.10 25
jun 2019 17:55 downson's market - atlanta $0.59 $87.22 25 jun 2019 09:22 kmart - ga $0.80 $106.90 2
jul 2019 18:12 florida shadow hotel $1.25 $90.00 3 jul 2019 21:12 cafe de'orient - fl $0.02 $12.20 3
jul 2019 12:54 yuen shirts $1.01 $59.10 10 jul 2019 08:10 delta airlines $2.00 $120.00 11 jul 2019
19:01 publix - marietta $0.40 $82.99 13 jul 2019 08:11 jiffy lube $0.10 $40.20 15 jul 2019 07:40
starbucks roswell road $0.04 $8.10 total amount $7.20 $670.81customer number 12302139 card number
112321***3 card limit 1500 pay date 27 jul 2020 amount $6

# Working code

In [35]:
transactions_df = extract_transactions(processed_text)
print(transactions_df)

          Date                 Description  Amount       Type
0   24/06/2019    shell marietta - marie's   -0.99  Cash Back
1   24/06/2019    shell marietta - marie's   64.10     Charge
2   25/06/2019  downson's market - atlanta   -0.59  Cash Back
3   25/06/2019  downson's market - atlanta   87.22     Charge
4   25/06/2019                  kmart - ga   -0.80  Cash Back
5   25/06/2019                  kmart - ga  106.90     Charge
6   10/07/2019              delta airlines   -2.00  Cash Back
7   10/07/2019              delta airlines  120.00     Charge
8   11/07/2019           publix - marietta   -0.40  Cash Back
9   11/07/2019           publix - marietta   82.99     Charge
10  13/07/2019                  jiffy lube   -0.10  Cash Back
11  13/07/2019                  jiffy lube   40.20     Charge
12  15/07/2019      starbucks roswell road   -0.04  Cash Back
13  15/07/2019      starbucks roswell road    8.10     Charge
