In [75]:
import pandas as pd
import re
from google.colab import drive
import os

In [76]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under that mount point.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
# Load the OCR output table (one row per invoice file).
ocr_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/invoice_project/ocr_invoice_texts.csv",
)

In [78]:
# Preview the first five rows of the OCR table.
ocr_df.head(n=5)

Unnamed: 0,filename,extracted_text
0,invoice_0.pdf,Invoice #INVO297\n\nDate: 1979-07-29\nVendor: ...
1,invoice_1.pdf,Invoice #INV4575\n\nDate: 2019-08-31\n\nVendor...
2,invoice_3.pdf,Invoice #INV8579\n\nDate: 2017-02-20\nVendor: ...
3,invoice_4.pdf,Invoice #INV5127\n\nDate: 1982-07-24\n\nVendor...
4,invoice_2.pdf,Invoice #INV9890\n\nDate: 2023-02-16\n\nVendor...


In [79]:
def _parse_amount(raw):
    """Convert a captured money string such as ``"1,234.50"`` to a float.

    Returns None when the capture is not a valid number (for example ``"."``
    or ``"1.2.3"``), so one garbled figure does not abort the whole invoice.
    """
    # Remove thousands separators, then drop any trailing period that the
    # greedy character class picked up (e.g. "Total: $1,050.75.").
    cleaned = raw.replace(',', '').rstrip('.')
    try:
        return float(cleaned)
    except ValueError:
        return None


def extract_fields(text):
    """Pull the structured invoice fields out of one OCR'd text blob.

    Each field is located with its own regular expression and parsed
    independently, so a field that is missing or malformed becomes None
    without affecting the other fields.

    Args:
        text: The OCR text of a single invoice. Anything that is not a
            ``str`` (e.g. NaN for an empty CSV cell) is treated as
            "no text".

    Returns:
        dict with keys ``invoice_id``, ``date``, ``vendor``, ``item``
        (stripped strings or None) and ``amount``, ``tax``, ``total``
        (floats or None), in that order.
    """
    fields = {
        "invoice_id": None, "date": None, "vendor": None,
        "item": None, "amount": None, "tax": None, "total": None,
    }

    # re.search would raise TypeError on non-string input; report it as an
    # invoice with no extractable fields instead.
    if not isinstance(text, str):
        return fields

    text_patterns = {
        "invoice_id": r'Invoice\s*#\s*(INV\w+)',
        "date": r'Date:\s*([0-9]{4}-[0-9]{2}-[0-9]{2})',
        "vendor": r'Vendor:\s*([^\n]+)',
        "item": r'Item:\s*([^\n]+)',
    }
    # NOTE(review): commas are treated as thousands separators; confirm no
    # invoices use a decimal comma.
    numeric_patterns = {
        "amount": r'Amount:\s*\$?(-?[\d.,]+)',
        "tax": r'Tax:\s*\$?(-?[\d.,]+)',
        "total": r'Total:\s*\$?(-?[\d.,]+)',
    }

    for name, pattern in text_patterns.items():
        match = re.search(pattern, text)
        if match:
            fields[name] = match.group(1).strip()

    for name, pattern in numeric_patterns.items():
        match = re.search(pattern, text)
        if match:
            fields[name] = _parse_amount(match.group(1))

    return fields


In [80]:
# Parse every OCR text into a dict of invoice fields (one dict per row).
extracted_data = ocr_df["extracted_text"].apply(extract_fields)
# Reuse ocr_df's index so the filename assignment below lines up row for row
# even if ocr_df has been filtered or re-sorted before this cell runs.
extracted_df = pd.DataFrame(list(extracted_data), index=ocr_df.index)
extracted_df["filename"] = ocr_df["filename"]

In [81]:
# Load the invoice metadata table (its fraud labels are used below).
metadata_df = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/invoice_project/invoice_metadata.csv",
)

In [82]:
# Keep only the join key and the label. Take an explicit copy because the
# next cell writes to the "filename" column of this subset.
metadata_df = metadata_df[["filename", "is_fraud"]].copy()

In [83]:
# Reduce each metadata path to its bare file name so it can be matched
# against the "filename" column of the OCR table.
metadata_df["filename"] = [
    os.path.basename(path) for path in metadata_df["filename"]
]

In [84]:
# Inner join on filename: only invoices present in both tables are kept.
final_df = pd.merge(
    left=extracted_df,
    right=metadata_df,
    how="inner",
    on="filename",
)

In [85]:
# Write the merged table back to Drive without the pandas row index.
final_df.to_csv(
    path_or_buf="/content/drive/MyDrive/invoice_project/cleaned_invoice_data.csv",
    index=False,
)