In [1]:
!pip install faker fpdf2 pandas

Collecting faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Collecting fpdf2
  Downloading fpdf2-2.8.3-py2.py3-none-any.whl.metadata (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fpdf2-2.8.3-py2.py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.7/245.7 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fpdf2, faker
Successfully installed faker-37.3.0 fpdf2-2.8.3


In [2]:
from faker import Faker
from fpdf import FPDF
import pandas as pd
import random
import os

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
pdf_output_dir = "/content/drive/MyDrive/invoice_project/invoices"
os.makedirs(pdf_output_dir, exist_ok=True)

In [5]:
metadata_path = "/content/drive/MyDrive/invoice_project/invoice_metadata.csv"

In [6]:
fake = Faker()

In [7]:
invoice_items = [
    "Consulting Fee", "Software License", "Cloud Storage", "Laptop", "Monitor",
    "Keyboard", "Mouse", "External Hard Drive", "Website Development", "SEO Services",
    "Digital Marketing", "Graphic Design", "Logo Design", "Domain Registration",
    "Web Hosting", "IT Support", "Software Maintenance", "Data Backup Service",
    "Email Hosting", "Training Session", "Online Course", "Subscription Renewal",
    "Legal Consultation", "Accounting Service", "Tax Filing", "Project Management",
    "Photography Session", "Printing Service", "Brochure Design", "Business Cards",
    "Office Chair", "Office Desk", "Ergonomic Keyboard", "Video Editing",
    "Server Hosting", "Mobile App Development", "Social Media Management",
    "Ad Campaign", "Freelance Writing", "Technical Documentation",
    "Market Research Report", "CRM Setup", "Analytics Dashboard",
    "Cloud Migration", "Pen Drive", "Phone Charger", "Team Meeting Room Booking",
    "Product Demo", "Prototype Development", "Customer Support Plan"
]

In [8]:
def generate_invoice(idx, is_fraud=False):
    vendor = fake.company()
    invoice_id = fake.bothify(text='INV####')
    date = fake.date()
    item = random.choice(invoice_items)
    amount = round(random.uniform(100, 1000), 2)
    tax = round(amount * 0.18, 2)
    total = amount + tax

    if is_fraud:
        anomaly_type = random.choice(["negative_tax", "inflated_amount", "missing_vendor"])
        if anomaly_type == "negative_tax":
            tax = -abs(tax)
        elif anomaly_type == "inflated_amount":
            amount *= 5
            total = amount + tax
        elif anomaly_type == "missing_vendor":
            vendor = ""

    # Create PDF
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    pdf.cell(200, 10, txt=f"Invoice #{invoice_id}", ln=True, align='C')
    pdf.ln(10)
    pdf.cell(100, 10, txt=f"Date: {date}", ln=True)
    pdf.cell(100, 10, txt=f"Vendor: {vendor}", ln=True)
    pdf.cell(100, 10, txt=f"Item: {item}", ln=True)
    pdf.cell(100, 10, txt=f"Amount: ${amount:.2f}", ln=True)
    pdf.cell(100, 10, txt=f"Tax: ${tax:.2f}", ln=True)
    pdf.cell(100, 10, txt=f"Total: ${total:.2f}", ln=True)

    filename = f"{pdf_output_dir}/invoice_{idx}.pdf"
    pdf.output(filename)

    return {
        "filename": filename,
        "invoice_id": invoice_id,
        "date": date,
        "vendor": vendor,
        "item": item,
        "amount": amount,
        "tax": tax,
        "total": total,
        "is_fraud": int(is_fraud)
    }

In [9]:
# Generating dataset
num_samples = 100
fraud_ratio = 0.3

In [10]:
data = []
for i in range(num_samples):
    is_fraud = random.random() < fraud_ratio
    invoice_data = generate_invoice(i, is_fraud=is_fraud)
    data.append(invoice_data)

  pdf.set_font("Arial", size=12)
  pdf.cell(200, 10, txt=f"Invoice #{invoice_id}", ln=True, align='C')
  pdf.cell(200, 10, txt=f"Invoice #{invoice_id}", ln=True, align='C')
  pdf.cell(100, 10, txt=f"Date: {date}", ln=True)
  pdf.cell(100, 10, txt=f"Date: {date}", ln=True)
  pdf.cell(100, 10, txt=f"Vendor: {vendor}", ln=True)
  pdf.cell(100, 10, txt=f"Vendor: {vendor}", ln=True)
  pdf.cell(100, 10, txt=f"Item: {item}", ln=True)
  pdf.cell(100, 10, txt=f"Item: {item}", ln=True)
  pdf.cell(100, 10, txt=f"Amount: ${amount:.2f}", ln=True)
  pdf.cell(100, 10, txt=f"Amount: ${amount:.2f}", ln=True)
  pdf.cell(100, 10, txt=f"Tax: ${tax:.2f}", ln=True)
  pdf.cell(100, 10, txt=f"Tax: ${tax:.2f}", ln=True)
  pdf.cell(100, 10, txt=f"Total: ${total:.2f}", ln=True)
  pdf.cell(100, 10, txt=f"Total: ${total:.2f}", ln=True)


In [11]:
df = pd.DataFrame(data)
df.to_csv(metadata_path, index=False)