In [16]:
import pandas as pd
import numpy as np
from datetime import datetime

# Configuration: size and shape of the synthetic orders dataset.
num_rows = 10_000
num_users = 250
start_date = pd.to_datetime("2024-09-02")
end_date = pd.to_datetime("2024-10-27")
date_range = pd.date_range(start=start_date, end=end_date)  # candidate order dates between start_date and end_date
verticals_categories = ['Restaurant', 'Turbo', 'CPGs', 'Travel']
rows_per_category = num_rows // len(verticals_categories)  # 2500

np.random.seed(50)  # For reproducibility

# Generate columns
order_ids = np.arange(1, num_rows + 1)  # sequential ids 1..num_rows, hence unique
dates = np.random.choice(date_range, size=num_rows)  # sampled with replacement
user_ids = np.random.choice(np.arange(1, num_users + 1), size=num_rows)  # sampled with replacement
# Cycle through the categories until exactly num_rows labels exist. Unlike
# `verticals_categories * rows_per_category`, this still produces num_rows
# labels when num_rows is not a multiple of the category count, so the
# DataFrame columns below always have matching lengths. When it is a multiple
# (as with the values above) the array is identical to the list-repeat form.
verticals = np.resize(verticals_categories, num_rows)
np.random.shuffle(verticals)  # in-place shuffle so categories are not in a fixed cycle

# Create DataFrame
raw_data = pd.DataFrame({
    'ORDER_ID': order_ids,
    'DATE': dates,
    'USER_ID': user_ids,
    'VERTICAL': verticals
})

raw_data


Unnamed: 0,ORDER_ID,DATE,USER_ID,VERTICAL
0,1,2024-10-20,17,Restaurant
1,2,2024-10-04,55,Travel
2,3,2024-09-13,60,Travel
3,4,2024-10-17,135,Restaurant
4,5,2024-10-05,140,Restaurant
...,...,...,...,...
9995,9996,2024-09-03,80,Restaurant
9996,9997,2024-09-23,203,Restaurant
9997,9998,2024-10-22,56,Turbo
9998,9999,2024-09-08,15,Restaurant


In [17]:
# Sanity check: fraction of orders that falls in each vertical.
(
    raw_data['VERTICAL']
    .value_counts(normalize=True)
)


VERTICAL
Restaurant    0.25
Travel        0.25
Turbo         0.25
CPGs          0.25
Name: proportion, dtype: float64

In [18]:
# Sanity check: number of distinct (non-null) user ids present in the data.
raw_data['USER_ID'].nunique(dropna=True)

250

In [19]:
# Sanity check: True when no ORDER_ID value is repeated.
raw_data.loc[:, 'ORDER_ID'].is_unique

True

In [25]:
# Export to CSV. The path is relative to the kernel's working directory;
# index=False keeps the DataFrame's row index out of the file.
raw_data.to_csv(path_or_buf='../data/raw_data.csv', index=False)