In [5]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
# Set a random seed for reproducibility
np.random.seed(42)

In [9]:
# Function to generate random datetime objects
def random_date(start_date, end_date):
    delta = end_date - start_date
    random_days = np.random.randint(0, delta.days)
    random_hours = np.random.randint(0, 23)
    random_minutes = np.random.randint(0, 59)
    
    random_date = start_date + timedelta(days=random_days, hours=random_hours, minutes=random_minutes)
    return random_date

In [13]:
# Create User attribute table
num_users = 1000
user_ids = np.arange(1, num_users + 1)
user_names = ["User_" + str(i) for i in user_ids]
ages = np.random.randint(18, 70, size=num_users)
countries = np.random.choice(['USA', 'Canada', 'UK', 'Germany', 'France', 'Japan'], num_users)



# Define start and end date range
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 4, 1)

# Generate a list of random datetime objects
random_datetime_list = [random_date(start_date, end_date) for _ in range(num_users)]

users_df = pd.DataFrame({
    'user_id': user_ids,
    'user_name': user_names,
    'created': random_datetime_list,
    'age': ages,
    'country': countries
})
users_df.head()

Unnamed: 0,user_id,user_name,created,age,country
0,1,User_1,2023-10-14 08:49:00,57,USA
1,2,User_2,2021-12-16 07:07:00,41,France
2,3,User_3,2023-09-03 19:46:00,20,Germany
3,4,User_4,2020-09-07 06:10:00,39,USA
4,5,User_5,2023-01-12 15:03:00,19,Germany


In [15]:
users_df.shape

(1000, 5)

In [14]:
users_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_users", if_exists='replace')

  users_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_users", if_exists='replace')
100%|██████████| 1/1 [00:00<00:00, 13400.33it/s]


In [16]:
# Create Item attribute table
num_items = 700
item_ids = np.arange(1, num_items + 1)
item_names = ["Item_" + str(i) for i in item_ids]
prices = np.round(np.random.uniform(5.99, 999.99, size=num_items), 2)
categories = np.random.choice(['Electronics', 'Fashion', 'Home', 'Toys', 'Books'], num_items)

# Define start and end date range
start_date = datetime(2020, 1, 1)
end_date = datetime(2024, 4, 1)

# Generate a list of random datetime objects
created_list = [random_date(start_date, end_date) for _ in range(num_items)]

items_df = pd.DataFrame({
    'item_id': item_ids,
    'item_name': item_names,
    'price': prices,
    'category': categories,
    'created': created_list
})
items_df.head()

Unnamed: 0,item_id,item_name,price,category,created
0,1,Item_1,524.46,Fashion,2023-04-06 15:37:00
1,2,Item_2,795.71,Fashion,2023-09-28 21:21:00
2,3,Item_3,885.06,Fashion,2021-06-11 15:34:00
3,4,Item_4,348.7,Electronics,2021-03-19 12:45:00
4,5,Item_5,465.06,Books,2022-04-19 11:54:00


In [35]:
items_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_items", if_exists='replace')

  items_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_items", if_exists='replace')
100%|██████████| 1/1 [00:00<00:00, 11428.62it/s]


In [19]:
# Create Transaction table
num_transactions = 500
assert num_transactions < num_items
transaction_ids = np.arange(1, num_transactions + 1)
buyer_ids = np.random.choice(user_ids, num_transactions)
purchased_item_ids = np.random.choice(item_ids, num_transactions)
quantities = np.random.randint(1, 5, size=num_transactions)
dates = np.random.choice(
    pd.date_range(start='2023-01-01', periods=365, freq='D'),
    num_transactions
  )


transactions_df = pd.DataFrame({
    'transaction_id': transaction_ids,
    'user_id': buyer_ids,
    'item_id': purchased_item_ids,
    'quantity': quantities,
    # 'date': dates
})
transactions_df.head()

Unnamed: 0,transaction_id,user_id,item_id,quantity
0,1,100,256,1
1,2,658,172,4
2,3,9,649,3
3,4,574,498,4
4,5,697,471,4


In [38]:
transactions_df.item_id.nunique()

370

In [22]:
random_interval_list = [
    created + timedelta(days=np.random.randint(0, 30), hours=np.random.randint(0, 23), minutes=np.random.randint(0, 59))
    for created in created_list 
]

In [30]:
df_item_sold = pd.DataFrame({"item_id": item_ids, "sold_time": random_interval_list})

In [43]:
transactions_df = pd.merge(transactions_df, items_df[['item_id', 'created']], on='item_id', how='left')

In [46]:
transactions_df['sold_time'] = transactions_df.created.apply(lambda x: x+timedelta(
                days=np.random.randint(0, 30),
                hours=np.random.randint(0, 23),
                minutes=np.random.randint(0, 59)
            ) 
)
transactions_df = transactions_df.drop('created', axis=1)

In [47]:
transactions_df.isna().sum()

transaction_id    0
user_id           0
item_id           0
quantity          0
sold_time         0
dtype: int64

In [48]:
transactions_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_transactions", if_exists='replace')

  transactions_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_transactions", if_exists='replace')
100%|██████████| 1/1 [00:00<00:00, 13189.64it/s]


In [49]:
# Create User event log table
num_events = 1500
event_types = np.random.choice(['app start', 'item view', 'item like'], num_events)
log_user_ids = np.random.choice(user_ids, num_events)
log_item_ids = np.random.choice(item_ids, num_events)
event_times = np.random.choice(
    pd.date_range(start='2023-01-01', periods=365*24, freq='H'),
    num_events
  )

event_log_df = pd.DataFrame({
    'event_id': np.arange(1, num_events + 1),
    'event_type': event_types,
    'user_id': log_user_ids,
    'item_id': log_item_ids,
    'event_time': event_times
})
event_log_df.head()

  pd.date_range(start='2023-01-01', periods=365*24, freq='H'),


Unnamed: 0,event_id,event_type,user_id,item_id,event_time
0,1,item view,302,680,2023-06-18 01:00:00
1,2,item view,964,181,2023-10-07 04:00:00
2,3,item view,516,130,2023-06-21 09:00:00
3,4,app start,734,149,2023-09-30 18:00:00
4,5,item like,962,472,2023-12-16 20:00:00


In [50]:
event_log_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_event_logs", if_exists='replace')

  event_log_df.to_gbq("mercari-ml-crm-jp-dev.z_yilin.llm_query_experiment_event_logs", if_exists='replace')
100%|██████████| 1/1 [00:00<00:00, 13934.56it/s]


In [None]:
# # Save to CSV
# users_df.to_csv('user_attributes.csv', index=False)
# items_df.to_csv('item_attributes.csv', index=False)
# transactions_df.to_csv('transactions.csv', index=False)
# event_log_df.to_csv('event_logs.csv', index=False)

In [52]:
print("Data generation completed and saved to BQ dataset: mercari-ml-crm-jp-dev.z_yilin.")

Data generation completed and saved to BQ dataset: mercari-ml-crm-jp-dev.z_yilin.
