In [1]:
import pandas as pd
import numpy as np
import random
import re  # Make sure re is imported at the top level
from datetime import datetime, timedelta
import csv

# Set random seed for reproducibility
np.random.seed(42)

# Configuration
NUM_TRANSACTIONS = 500
START_DATE = datetime(2024, 1, 1)
END_DATE = datetime(2025, 5, 1)
INITIAL_BALANCE = 50000

# Define categories and their corresponding remark templates
CATEGORIES = {
    "Insurance": {
        "Life": [
            "Premium payment for life insurance policy #{policy_num}",
            "Annual renewal of life insurance with {company}",
            "Term life insurance premium - policy #{policy_num}",
            "Life coverage plan payment to {company}",
            "Life insurance installment #{installment_num} of {total_installments}"
        ],
        "Health": [
            "Health insurance premium for {period}",
            "Medical insurance payment to {company}",
            "Health coverage renewal #{policy_num}",
            "Family health plan premium",
            "Annual health insurance payment"
        ],
        "Motor": [
            "Car insurance premium for {car_model}",
            "Auto insurance renewal - policy #{policy_num}",
            "Vehicle insurance payment for {period}",
            "Two-wheeler insurance for {bike_model}",
            "Commercial vehicle insurance - {vehicle_type}"
        ],
        "Home": [
            "Home insurance premium - property at {address}",
            "Housing protection plan payment",
            "Property insurance renewal",
            "Building insurance annual payment",
            "Home contents insurance premium"
        ],
        "Travel": [
            "Travel insurance for {destination} trip",
            "International travel coverage plan",
            "Holiday insurance package",
            "Business trip insurance coverage",
            "Annual multi-trip insurance renewal"
        ]
    },
    "Bills": [
        "Electricity bill payment for {month}",
        "Water utility bill - {month}",
        "Mobile phone bill - {number}",
        "Internet service payment to {provider}",
        "Gas bill for {month}",
        "Cable TV subscription renewal",
        "Landline phone bill payment"
    ],
    "Shopping": [
        "Purchase at {store} - clothing",
        "Grocery shopping at {supermarket}",
        "Electronics purchase - {item}",
        "Online order from {website}",
        "Furniture purchase at {store}",
        "Home appliance - {item}",
        "Bookstore purchase"
    ],
    "Dining": [
        "Dinner at {restaurant}",
        "Lunch payment - {place}",
        "Coffee and snacks at {cafe}",
        "Food delivery from {service}",
        "Breakfast at {place}"
    ],
    "Transportation": [
        "Fuel payment at {station}",
        "Public transport monthly pass",
        "Taxi fare - {service}",
        "Car service and maintenance",
        "Parking fee payment",
        "Highway toll payment",
        "Ride-sharing service - {service}"
    ],
    "Entertainment": [
        "Movie tickets at {cinema}",
        "Concert tickets - {event}",
        "Streaming service subscription - {service}",
        "Theme park entry fees",
        "Sports event tickets - {event}",
        "Gaming subscription renewal"
    ],
    "Healthcare": [
        "Doctor's consultation fee - Dr. {name}",
        "Pharmacy purchase at {pharmacy}",
        "Hospital bill payment - {hospital}",
        "Dental treatment at {clinic}",
        "Laboratory test charges - {lab}",
        "Physiotherapy session payment"
    ],
    "Education": [
        "Tuition fee payment - {institution}",
        "Book purchase for {course} course",
        "Online course subscription - {platform}",
        "School supplies purchase",
        "Educational workshop fee - {topic}",
        "Student loan repayment"
    ],
    "Investment": [
        "Mutual fund investment - {fund_name}",
        "Stock purchase - {stock_symbol}",
        "Fixed deposit creation",
        "Retirement fund contribution",
        "Bond purchase - {bond_type}",
        "Dividend received from {company}"
    ],
    "Salary": [
        "Monthly salary from {employer}",
        "Bonus payment - {period}",
        "Commission payment",
        "Overtime payment",
        "Contract payment - {project}"
    ]
}

# Template data for remark generation
TEMPLATE_DATA = {
    "company": ["Prudential", "AXA", "MetLife", "Aviva", "Allianz", "Zurich", "Liberty", "Progressive", "Cigna", "Aetna"],
    "policy_num": [f"POL{random.randint(10000, 99999)}" for _ in range(20)],
    "installment_num": list(range(1, 13)),
    "total_installments": [12, 24, 36, 48],
    "period": ["Jan-Mar 2024", "Apr-Jun 2024", "Jul-Sep 2024", "Oct-Dec 2024", "Jan-Mar 2025", "2024-2025", "H1 2024", "H2 2024"],
    "car_model": ["Honda Civic", "Toyota Camry", "Ford Focus", "Hyundai Elantra", "Nissan Altima", "Mazda 3", "BMW 3 Series", "Audi A4"],
    "bike_model": ["Honda CBR", "Yamaha YZF", "Kawasaki Ninja", "Ducati Monster", "Harley Davidson", "Royal Enfield", "Suzuki Hayabusa"],
    "vehicle_type": ["Truck", "Van", "Bus", "Commercial SUV", "Delivery Vehicle"],
    "address": ["123 Main St", "456 Oak Ave", "789 Pine Rd", "101 Maple Dr", "202 Cedar Ln", "303 Elm Blvd", "404 Birch Ct"],
    "destination": ["Europe", "Asia", "North America", "Australia", "Africa", "South America", "Caribbean", "Middle East"],
    "month": ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"],
    "number": [f"+1-555-{random.randint(1000, 9999)}" for _ in range(10)],
    "provider": ["AT&T", "Verizon", "T-Mobile", "Comcast", "Spectrum", "Cox", "Dish", "DirecTV"],
    "store": ["H&M", "Zara", "Macy's", "Nordstrom", "Target", "Walmart", "Best Buy", "IKEA", "Home Depot", "Lowe's"],
    "supermarket": ["Kroger", "Safeway", "Whole Foods", "Trader Joe's", "Albertsons", "Publix", "Aldi", "Costco"],
    "item": ["Laptop", "Smartphone", "Refrigerator", "Washing Machine", "TV", "Microwave", "Vacuum Cleaner", "Air Purifier"],
    "website": ["Amazon", "eBay", "Walmart.com", "Target.com", "BestBuy.com", "Wayfair", "Etsy", "Overstock"],
    "restaurant": ["Olive Garden", "Cheesecake Factory", "Outback Steakhouse", "Applebee's", "Red Lobster", "TGI Fridays", "Chili's"],
    "place": ["Starbucks", "Panera Bread", "Subway", "McDonald's", "Burger King", "Chipotle", "Panda Express", "Local Diner"],
    "cafe": ["Starbucks", "Costa Coffee", "Dunkin'", "Tim Hortons", "Peet's Coffee", "Blue Bottle", "Local Cafe"],
    "service": ["DoorDash", "Uber Eats", "Grubhub", "Postmates", "Instacart", "Uber", "Lyft", "Didi", "Ola"],
    "station": ["Shell", "BP", "Exxon", "Chevron", "Texaco", "Marathon", "Mobil", "Sunoco"],
    "cinema": ["AMC", "Regal", "Cinemark", "IMAX", "Alamo Drafthouse", "Landmark Theatres", "Local Cinema"],
    "event": ["Taylor Swift Concert", "NBA Game", "NFL Match", "MLB Game", "Broadway Show", "Music Festival", "Comedy Show"],
    "name": ["Smith", "Johnson", "Williams", "Brown", "Jones", "Miller", "Davis", "Garcia", "Rodriguez", "Wilson"],
    "pharmacy": ["CVS", "Walgreens", "Rite Aid", "Duane Reade", "Walmart Pharmacy", "Target Pharmacy", "Local Pharmacy"],
    "hospital": ["General Hospital", "Memorial Hospital", "University Medical Center", "Community Hospital", "Regional Medical Center"],
    "clinic": ["Smile Dental", "Perfect Smile", "Dental Associates", "Family Dental", "City Dental Clinic"],
    "lab": ["Quest Diagnostics", "LabCorp", "BioReference", "Sonic Healthcare", "Medical Labs", "Diagnostic Center"],
    "institution": ["University of California", "State University", "Community College", "Technical Institute", "Business School"],
    "course": ["Mathematics", "Computer Science", "Biology", "Physics", "Literature", "History", "Economics", "Psychology"],
    "platform": ["Coursera", "Udemy", "edX", "Skillshare", "LinkedIn Learning", "Masterclass", "Khan Academy"],
    "topic": ["Data Science", "Digital Marketing", "Leadership", "Project Management", "Creative Writing", "Web Development"],
    "fund_name": ["Vanguard Total Market", "Fidelity 500 Index", "T Rowe Price Growth", "BlackRock Global", "PIMCO Income Fund"],
    "stock_symbol": ["AAPL", "MSFT", "GOOGL", "AMZN", "META", "TSLA", "JPM", "JNJ", "PG", "V"],
    "bond_type": ["Treasury", "Municipal", "Corporate", "High-Yield", "International"],
    "employer": ["ABC Corporation", "XYZ Industries", "Global Solutions", "Tech Innovations", "National Services", "United Enterprises"],
    "project": ["Marketing Campaign", "Software Development", "Consulting Project", "Design Work", "Research Study", "Training Program"]
}

def random_date(start_date, end_date):
    """Generate a random date between start_date and end_date"""
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    return start_date + timedelta(days=random_number_of_days)

def format_date(date):
    """Format date as DD/MM/YYYY"""
    return date.strftime("%d/%m/%Y")

def generate_reference_number():
    """Generate a unique reference number for transactions"""
    return f"REF{random.randint(100000, 999999)}"

def generate_amount(category, sub_category=None):
    """Generate appropriate amount based on category"""
    if category == "Salary":
        return random.uniform(2500, 7500)
    elif category == "Investment" and sub_category and "Dividend" in sub_category:
        return random.uniform(50, 500)
    elif category == "Insurance":
        if sub_category == "Life":
            return random.uniform(100, 1000)
        elif sub_category == "Health":
            return random.uniform(150, 800)
        elif sub_category == "Motor":
            return random.uniform(200, 600)
        elif sub_category == "Home":
            return random.uniform(100, 400)
        elif sub_category == "Travel":
            return random.uniform(50, 300)
        else:
            return random.uniform(100, 500)
    elif category == "Bills":
        return random.uniform(50, 300)
    elif category == "Shopping":
        return random.uniform(20, 500)
    elif category == "Dining":
        return random.uniform(15, 200)
    elif category == "Transportation":
        return random.uniform(10, 150)
    elif category == "Entertainment":
        return random.uniform(20, 200)
    elif category == "Healthcare":
        return random.uniform(30, 600)
    elif category == "Education":
        return random.uniform(50, 1000)
    elif category == "Investment":
        return random.uniform(500, 5000)
    else:
        return random.uniform(10, 1000)

def generate_remark(category, sub_category=None):
    """Generate a detailed remark based on category and sub-category"""
    if category == "Insurance" and sub_category:
        template = random.choice(CATEGORIES["Insurance"][sub_category])
    else:
        template = random.choice(CATEGORIES.get(category, ["Payment - miscellaneous"]))

    # Fixed: Properly extract variable names from the template
    # Find all variables in the format {variable_name}
    variables = re.findall(r'\{(.+?)\}', template)

    # Replace each variable with a random value from the template data
    for var_name in variables:
        if var_name in TEMPLATE_DATA:
            template = template.replace(f"{{{var_name}}}", str(random.choice(TEMPLATE_DATA[var_name])))

    return template

def generate_transaction_data():
    """Generate a comprehensive transaction dataset"""
    transactions = []
    balance = INITIAL_BALANCE

    # Generate transactions
    dates = sorted([random_date(START_DATE, END_DATE) for _ in range(NUM_TRANSACTIONS)])

    for date in dates:
        is_deposit = random.random() < 0.3  # 30% chance of being a deposit

        if is_deposit:
            categories = ["Salary", "Investment"]
            weights = [0.7, 0.3]  # 70% salary, 30% investment
            category = random.choices(categories, weights=weights)[0]

            if category == "Investment" and random.random() < 0.4:  # 40% chance of dividend
                sub_category = "Dividend received from {company}"
            else:
                sub_category = None

            withdrawal = 0
            deposit = round(generate_amount(category, sub_category), 2)
            balance += deposit
        else:
            # For withdrawals, select a category
            categories_list = list(CATEGORIES.keys())
            if "Insurance" in categories_list:
                categories_list.remove("Insurance")
                # Add back insurance sub-categories as separate options
                for sub in CATEGORIES["Insurance"]:
                    categories_list.append(f"Insurance:{sub}")

            category_choice = random.choice(categories_list)

            if ":" in category_choice:
                category, sub_category = category_choice.split(":")
            else:
                category, sub_category = category_choice, None

            deposit = 0
            withdrawal = round(generate_amount(category, sub_category), 2)
            balance -= withdrawal

        # Generate remark
        remark = generate_remark(category, sub_category)

        # Create transaction record
        transaction = {
            "Date": format_date(date),
            "Remark": remark,
            "RefNo": generate_reference_number(),
            "ValueDate": format_date(date + timedelta(days=random.randint(0, 2))),
            "Withdrawal": withdrawal if withdrawal > 0 else "",
            "Deposit": deposit if deposit > 0 else "",
            "Balance": round(balance, 2),
            "Category": category if sub_category is None else f"{category}-{sub_category}"
        }

        transactions.append(transaction)

    return transactions

def save_to_csv(transactions, filename="transaction_dataset.csv"):
    """Save transactions to CSV file"""
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ["Date", "Remark", "RefNo", "ValueDate", "Withdrawal", "Deposit", "Balance", "Category"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        for transaction in transactions:
            writer.writerow(transaction)

    print(f"Dataset generated and saved to {filename}")
    return filename

if __name__ == "__main__":
    # Generate transactions
    print(f"Generating {NUM_TRANSACTIONS} transactions...")
    transactions = generate_transaction_data()

    # Save to CSV
    csv_file = save_to_csv(transactions)

    # Display sample of the data
    df = pd.read_csv(csv_file)
    print("\nSample of generated data:")
    print(df.head())

    # Display statistics
    print("\nTransaction category distribution:")
    print(df["Category"].value_counts())

    print(f"\nTotal withdrawals: ${df['Withdrawal'].sum():.2f}")
    print(f"Total deposits: ${df['Deposit'].sum():.2f}")
    print(f"Final balance: ${df['Balance'].iloc[-1]:.2f}")

Generating 500 transactions...
Dataset generated and saved to transaction_dataset.csv

Sample of generated data:
         Date                                 Remark      RefNo   ValueDate  \
0  01/01/2024      Laboratory test charges - LabCorp  REF130234  01/01/2024   
1  01/01/2024                     Bookstore purchase  REF515922  02/01/2024   
2  01/01/2024                 Stock purchase - GOOGL  REF877078  02/01/2024   
3  03/01/2024  Contract payment - Consulting Project  REF812750  04/01/2024   
4  03/01/2024                     Commission payment  REF918059  03/01/2024   

   Withdrawal  Deposit   Balance    Category  
0      564.37      NaN  49435.63  Healthcare  
1      164.69      NaN  49270.94    Shopping  
2         NaN  3665.31  52936.25  Investment  
3     2938.36      NaN  49997.89      Salary  
4         NaN  5453.38  55451.27      Salary  

Transaction category distribution:
Category
Salary                                         154
Investment                        