In [10]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Parameters
# Seed for Python's `random` module so the generated dataset is reproducible
# under Restart & Run All. Set to None to draw a fresh dataset on every run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

total_records = 100000  # number of transactions to generate
sender_account_ids = [str(i) for i in range(10001, 10101)]  # "10001" .. "10100"
transaction_types = ['Debit', 'Credit', 'Payment', 'Transfer']
recipient_banks = [f"B{i:04d}" for i in range(1, 11)]  # "B0001" .. "B0010"
fraud_limit = 5000  # upper bound for the randomly drawn fraud count
fraud_sender_limit = 45  # how many sender accounts are sampled as fraud-capable
fraud_hours = list(range(22, 24)) + list(range(0, 6))  # hours 22:00-05:59
fraud_days = [4, 5, 6]  # datetime.weekday() values: Friday, Saturday, Sunday

# Per-column accumulators; each generated record appends one value to every list.
transaction_ids = []
timestamps = []
sender_accounts = []
transaction_types_list = []
in_out_list = []
amounts = []
recipient_bank_list = []
fraud_list = []

# Helper functions
def generate_timestamp(start_date=datetime(2020, 1, 1), end_date=datetime(2023, 12, 31)):
    """Return a random timestamp with whole-second resolution.

    The result is ``start_date`` plus a whole number of days drawn uniformly
    from ``0 .. (end_date - start_date).days`` (inclusive) plus a number of
    seconds drawn uniformly from ``0 .. 86399``. With midnight-aligned bounds
    this covers every second from ``start_date`` through the end of
    ``end_date``'s day.

    Args:
        start_date: Base datetime the random offsets are added to.
            Defaults to 2020-01-01 00:00:00.
        end_date: Datetime bounding the day offset. Defaults to
            2023-12-31 00:00:00.

    Returns:
        A ``datetime`` built from ``start_date`` and the random offsets.

    Raises:
        ValueError: If ``end_date`` is earlier than ``start_date``.
    """
    if end_date < start_date:
        raise ValueError("end_date must not be earlier than start_date")
    random_days = random.randint(0, (end_date - start_date).days)
    random_seconds = random.randint(0, 86400 - 1)  # 86400 seconds in a day
    return start_date + timedelta(days=random_days, seconds=random_seconds)

def generate_amount():
    """Draw a transaction amount between 10 and 10000, rounded to 2 decimals."""
    lower_bound, upper_bound = 10, 10000
    raw_amount = random.uniform(lower_bound, upper_bound)
    return round(raw_amount, 2)

# Pick the sender accounts that are allowed to produce fraudulent
# transactions: fraud_sender_limit distinct IDs sampled from sender_account_ids.
fraud_sender_accounts = random.sample(sender_account_ids, k=fraud_sender_limit)

# Draw the fraud count: a random integer from 1 through fraud_limit (inclusive).
fraud_count = random.randrange(1, fraud_limit + 1)

# Generate dataset
# A set gives constant-time membership tests for the per-record fraud check.
fraud_sender_lookup = set(fraud_sender_accounts)

for i in range(1, total_records + 1):
    # Transaction ID: "T" followed by the record number zero-padded to 5 digits.
    # NOTE: once i reaches 100000 the ID grows to six digits ("T100000").
    transaction_id = f"T{str(i).zfill(5)}"

    # Timestamp
    timestamp = generate_timestamp()

    # Sender Account
    sender_account = random.choice(sender_account_ids)

    # Transaction Type
    transaction_type = random.choice(transaction_types)

    # IN_OUT: Debit/Transfer are 'In' half of the time; everything else is 'Out'.
    in_out = 'In' if transaction_type in ['Debit', 'Transfer'] and random.random() < 0.5 else 'Out'

    # Amount
    amount = generate_amount()

    # Recipient Bank
    recipient_bank = random.choice(recipient_banks)

    # Fraud logic: only fraud-capable senders, only in high-risk hours/days,
    # and only while the fraud budget (fraud_count) has not been used up.
    # Previously fraud_count was decremented but never checked, so the cap
    # had no effect on how many records were flagged.
    fraud = 0
    if fraud_count > 0 and sender_account in fraud_sender_lookup:
        high_risk = timestamp.hour in fraud_hours or timestamp.weekday() in fraud_days
        if high_risk and random.random() < 0.2:  # 20% chance of fraud in high-risk hours/days
            fraud = 1
            fraud_count -= 1  # consume one unit of the fraud budget

    # Append data
    transaction_ids.append(transaction_id)
    timestamps.append(timestamp.strftime('%Y-%m-%d %H:%M:%S'))
    sender_accounts.append(sender_account)
    transaction_types_list.append(transaction_type)
    in_out_list.append(in_out)
    amounts.append(amount)
    recipient_bank_list.append(recipient_bank)
    fraud_list.append(fraud)

# Assemble the per-column lists into a single table (column order is the
# order the keys are listed here).
column_data = {
    'Transaction ID': transaction_ids,
    'Timestamp': timestamps,
    'Sender Account': sender_accounts,
    'Transaction Type': transaction_types_list,
    'IN_OUT': in_out_list,
    'Amount': amounts,
    'Recipient Bank': recipient_bank_list,
    'Fraud': fraud_list,
}
df = pd.DataFrame(column_data)

# Write the table to disk without the row index, then echo the path as the
# cell's output.
output_path = "transaction_datasetv4.csv"
df.to_csv(output_path, index=False)

output_path


'transaction_datasetv4.csv'

In [11]:
import pandas as pd
# Reload the saved CSV for inspection. With default type inference pandas
# would read the 'Sender Account' IDs (written as strings) back as integers
# and leave 'Timestamp' as plain text, so both are restored explicitly.
pd.read_csv(
    'transaction_datasetv4.csv',
    dtype={'Sender Account': str},
    parse_dates=['Timestamp'],
)

Unnamed: 0,Transaction ID,Timestamp,Sender Account,Transaction Type,IN_OUT,Amount,Recipient Bank,Fraud
0,T00001,2021-09-08 02:59:20,10043,Payment,Out,20.45,B0003,0
1,T00002,2020-10-03 15:14:58,10005,Transfer,Out,5029.81,B0007,1
2,T00003,2022-05-18 08:13:53,10051,Debit,In,8274.73,B0007,0
3,T00004,2020-05-31 13:27:06,10098,Credit,Out,2959.03,B0002,0
4,T00005,2023-03-08 00:59:40,10052,Payment,Out,9920.37,B0005,1
...,...,...,...,...,...,...,...,...
99995,T99996,2020-11-24 15:48:44,10051,Transfer,Out,643.09,B0007,0
99996,T99997,2021-11-22 10:15:59,10073,Debit,Out,6925.02,B0007,0
99997,T99998,2021-09-15 20:58:08,10097,Payment,Out,3619.03,B0001,0
99998,T99999,2023-04-17 19:04:20,10035,Payment,Out,2767.67,B0008,0
