In [1]:
import pandas as pd
import random
from datetime import datetime, timedelta

In [2]:
#parameters
# Fixed seed for the stdlib `random` module so that Restart & Run All
# regenerates exactly the same dataset on every run.
random_seed = 42
random.seed(random_seed)

# Number of synthetic rows to generate.
n_rows = 20000
# Vocabularies sampled for the `verb` and `object` columns.
verbs = ['bought', 'returned', 'sold', 'ordered', 'paid', 'cancelled']
objects = ['book', 'product', 'service', 'phone', 'laptop', 'accessory']
# Date range from which the `time` column is drawn.
start_date = datetime(2022, 1, 1)
end_date = datetime(2025, 12, 31)

In [7]:
#random date helper
def random_date(start, end):
  """Pick a random day between ``start`` and ``end`` and format it as text.

  A whole-day offset is drawn with ``random.randint`` from
  ``0`` to ``(end - start).days`` (both bounds included) and added to ``start``.

  Args:
    start: datetime marking the earliest possible result.
    end: datetime marking the latest possible result.

  Returns:
    The chosen date as a 'YYYY-MM-DD' string.
  """
  span_days = (end - start).days
  offset = timedelta(days=random.randint(0, span_days))
  picked = start + offset
  return picked.strftime('%Y-%m-%d')

Explanations:

The main columns (subject, verb, object) are always filled.

Additional columns (time, amount) are filled with the following probabilities:
- time: 80% filled, 20% set to [PAD].
- amount: 70% filled, 30% set to -1.

The mask indicates which columns are filled (1) and which are missing (0):
[1, 1, 1, mask_time, mask_sum].

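For the amount, when it is filled:
- Positive values for actions other than returned.
- Negative values for returned.
- The magnitude is always between 100 and 5000, so the -1 placeholder never collides with a real returned amount.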

In [9]:
#data generation
# Each row is [subject, verb, object, time, amount, mask]; rows are collected
# in a plain list and turned into a DataFrame afterwards.
data = []
for _ in range(n_rows):
  #SVO main columns (always filled)
  subject = f'User {random.randint(1000, 99999):05d}'
  verb = random.choice(verbs)
  obj = random.choice(objects)

  #presence flags for the additional columns: 1 = filled, 0 = missing
  #(random.choices returns a one-element list, unpacked here)
  mask_time, = random.choices([0, 1], weights=[0.2, 0.8])
  mask_sum, = random.choices([0, 1], weights=[0.3, 0.7])

  #time: a random date string, or the '[PAD]' placeholder when missing
  if mask_time:
    time = random_date(start_date, end_date)
  else:
    time = '[PAD]'

  #amount: -1 when missing; otherwise 100..5000, negated for returns
  if not mask_sum:
    amount = -1
  else:
    amount = random.randint(100, 5000)
    if verb == 'returned':
      amount = -amount

  #mask formation (the first three are 1, the rest are random)
  mask = [1, 1, 1, mask_time, mask_sum]

  data.append([subject, verb, obj, time, amount, mask])

[PAD] is a placeholder label that marks a missing value (important because the model we are building must be flexible about whether the additional columns are present)

In [12]:
# Assemble the generated rows into a DataFrame with named columns.
df = pd.DataFrame(
  data,
  columns=['subject', 'verb', 'object', 'time', 'amount', 'mask'],
)

In [15]:
# Preview the first five rows to sanity-check the generated columns.
df.head(n=5)

Unnamed: 0,subject,verb,object,time,amount,mask
0,User 40578,bought,book,2023-01-11,1524,"[1, 1, 1, 1, 1]"
1,User 94768,paid,laptop,2022-11-23,-1,"[1, 1, 1, 1, 0]"
2,User 65142,paid,service,2023-03-25,-1,"[1, 1, 1, 1, 0]"
3,User 10499,returned,book,2023-12-19,-1,"[1, 1, 1, 1, 0]"
4,User 89538,ordered,phone,2025-07-13,3949,"[1, 1, 1, 1, 1]"


In [16]:
# Write the dataset to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf='transactions.csv', index=False)