In [107]:
import itertools
import os

import numpy as np
import pandas as pd

In [108]:
# Number of rows generated for every table.
LENGTH = 2_000_000
# Inclusive upper bound of the ID range that foreign keys are sampled from.
FK_LENGTH = 5_000
# Destination directory for the generated CSV files. The trailing slash is
# kept on purpose: other cells build file paths from this exact string.
output_folder = f'data_{LENGTH}/'

# Seed NumPy's global RNG so that Restart & Run All regenerates the same data.
SEED = 42
np.random.seed(SEED)

In [109]:
# Create the output directory (including parents). exist_ok=True makes the
# cell safe to re-run and removes the gap between "check" and "create".
os.makedirs(output_folder, exist_ok=True)

## Generate Customers Dataset

In [110]:
# Seed pools read from disk: the English governorate names and the distinct
# person names that the random draws in this notebook sample from.
governments_names = (
    pd.read_csv('Seeds/governments.csv')["governorate_name_en"]
    .values
)
available_names = (
    pd.read_csv('Seeds/names.csv')['name']
    .unique()
)

In [111]:
# Customer columns: every random column is an independent draw of LENGTH values.
names = np.random.choice(available_names, size=LENGTH)

# Sequential primary keys 1..LENGTH.
IDs = np.arange(1, LENGTH + 1)
governments = np.random.choice(governments_names, size=LENGTH)
# randint's upper bound is exclusive, so ages fall in 18..80.
ages = np.random.randint(low=18, high=81, size=LENGTH)
# Gender is stored as a 0/1 code; the label mapping is not shown here.
gender = np.random.choice([0, 1], size=LENGTH)

In [112]:
# Assemble the customers table; the dict order fixes the column order.
customer_columns = {
    'ID': IDs,
    'governorate': governments,
    'age': ages,
    'name': names,
    'gender': gender,
}
df = pd.DataFrame(customer_columns)

df.head()

Unnamed: 0,ID,governorate,age,name,gender
0,1,Assiut,45,Diane Parks,0
1,2,Luxor,31,Joe Chambers,0
2,3,Qaliubiya,66,Jamie David,1
3,4,Matrouh,66,Felicia Cabrera,1
4,5,Assiut,35,John Hamilton,1


In [113]:
# os.path.join yields a clean path whether or not output_folder ends in a slash.
df.to_csv(os.path.join(output_folder, 'customers.csv'), index=False)

## Generate Suppliers Dataset

In [114]:
# Supplier keys run 1..LENGTH; base names come from the shared name pool.
suppliers_id = np.arange(1, LENGTH + 1)
suppliers_names = np.random.choice(available_names, size=LENGTH)

# Append each supplier's own ID to its name so that no two names collide.
id_suffixes = suppliers_id.astype(str)
suppliers_names = suppliers_names + id_suffixes

In [115]:
# Two-column suppliers table: primary key plus the uniquified name.
supplier_columns = {
    'ID': suppliers_id,
    'name': suppliers_names,
}
suppliers_df = pd.DataFrame(supplier_columns)

suppliers_df.head()

Unnamed: 0,ID,name
0,1,Joseph Smith1
1,2,Jorge Martin2
2,3,Melissa Payne3
3,4,Nancy Cox4
4,5,Krystal Warren5


In [116]:
# os.path.join yields a clean path whether or not output_folder ends in a slash
# (the previous './suppliers.csv' suffix only worked because of that slash).
suppliers_df.to_csv(os.path.join(output_folder, 'suppliers.csv'), index=False)

## Generate Products Dataset

In [117]:
# Distinct product names used as the sampling pool.
products_available_names = (
    pd.read_csv('Seeds/products.csv')["product_name"]
    .unique()
)

products_IDs = np.arange(1, LENGTH + 1)
products_names = np.random.choice(products_available_names, size=LENGTH)
# Upper bounds are exclusive: prices span 100..10000, categories 1..5000.
products_prices = np.random.randint(low=100, high=10001, size=LENGTH)
products_categories = np.random.randint(low=1, high=5001, size=LENGTH)
# NOTE: here `cid` holds each product's supplier foreign key (1..FK_LENGTH);
# the ratings section later rebinds the same name to customer ids.
cid = np.random.randint(low=1, high=FK_LENGTH + 1, size=LENGTH)



In [118]:
# Assemble the products table; `cid` supplies the supplier foreign key.
product_columns = {
    'ID': products_IDs,
    'product_name': products_names,
    'price': products_prices,
    'category': products_categories,
    'supplier_id': cid,
}
products_df = pd.DataFrame(product_columns)

products_df.head()

Unnamed: 0,ID,product_name,price,category,supplier_id
0,1,Soap Bar Ocean Mist,8427,3951,3124
1,2,Large Round Containers & Lids,5667,3266,1964
2,3,Intense Stain Whitening Toothpaste With Fluoride,7485,4105,4253
3,4,SmartBlend Lamb & Brown Rice Entree Wet Dog Food,2013,2825,483
4,5,Green Tea With Ginseng and Honey,6307,3180,623


In [119]:
# Shuffle the rows so the file is not ordered by ID, then renumber the index.
products_df = products_df.sample(frac=1).reset_index(drop=True)

# os.path.join yields a clean path whether or not output_folder ends in a slash.
products_df.to_csv(os.path.join(output_folder, 'products.csv'), index=False)

## Generate Orders Dataset

In [120]:
# Sequential order keys 1..LENGTH and a customer foreign key in 1..FK_LENGTH.
order_ids = np.arange(1, LENGTH + 1)
customer_ids = np.random.randint(1, FK_LENGTH + 1, LENGTH)
# One entry per calendar day in the range, formatted like 2020-01-01.
# strftime states the format explicitly instead of splitting the Timestamp's
# string representation; .tolist() keeps the pool a plain list of str.
available_dates = (
    pd.date_range(start='1/1/2010', end='1/1/2021')
    .strftime('%Y-%m-%d')
    .tolist()
)
orders_dates = np.random.choice(available_dates, LENGTH)
# Hour of day, 0..23 (upper bound exclusive).
orders_time = np.random.randint(0, 24, LENGTH)

In [121]:
# Assemble the orders table: key, customer foreign key, date string and hour.
order_columns = {
    'ID': order_ids,
    'customer_id': customer_ids,
    'date': orders_dates,
    'hour': orders_time,
}
order_df = pd.DataFrame(order_columns)

order_df.head(10)

Unnamed: 0,ID,customer_id,date,hour
0,1,4405,2017-01-28,9
1,2,682,2018-05-24,13
2,3,379,2012-11-03,12
3,4,3948,2020-03-30,13
4,5,2856,2019-01-29,14
5,6,1256,2015-06-16,16
6,7,1790,2014-09-08,19
7,8,4936,2015-10-03,1
8,9,3202,2017-12-23,4
9,10,728,2010-05-20,10


In [122]:
# Shuffle the rows so the file is not ordered by ID, then renumber the index.
order_df = order_df.sample(frac=1).reset_index(drop=True)

# os.path.join yields a clean path whether or not output_folder ends in a slash.
order_df.to_csv(os.path.join(output_folder, 'orders.csv'), index=False)

## Generate Order Items

In [123]:
# Draw 2000 IDs from 1..FK_LENGTH for each side. Sampling is with replacement,
# so np.unique leaves at most 2000 distinct (sorted) IDs per side.
products_ids = np.random.choice(np.arange(1, FK_LENGTH + 1), 2000)
products_ids = np.unique(products_ids)

orders_ids = np.random.choice(np.arange(1, FK_LENGTH + 1), 2000)
orders_ids = np.unique(orders_ids)

# Every (order_id, product_id) combination of the sampled IDs; taking pairs
# from the Cartesian product guarantees that no pair occurs twice.
order_items = list(itertools.product(orders_ids, products_ids))

# Shuffle in place, then keep at most LENGTH pairs.
np.random.shuffle(order_items)

order_items = order_items[:LENGTH]

In [124]:
# Split the (order_id, product_id) pairs into two parallel lists.
oid = [order_id for order_id, _ in order_items]
pid = [product_id for _, product_id in order_items]
# One quantity per pair; the upper bound is exclusive, so values are 1..99.
quantity = np.random.randint(low=1, high=100, size=len(order_items))

In [125]:
# oid_mapper = {val: i+1 for i, val in enumerate(np.unique(oid))}
# pid_mapper = {val: i+1 for i, val in enumerate(np.unique(pid))}
# oid = [oid_mapper[order_id] for order_id in oid]
# pid = [pid_mapper[product_id] for product_id in pid]


In [126]:
# Assemble the order-items table from the parallel columns.
order_item_columns = {
    'order_id': oid,
    'product_id': pid,
    'quantity': quantity,
}
order_items_df = pd.DataFrame(order_item_columns)
# Show which order ids actually occur (sorted, de-duplicated).
print(np.unique(order_items_df['order_id']))
order_items_df.head()

[   8   12   17 ... 4995 4996 4999]


Unnamed: 0,order_id,product_id,quantity
0,3942,123,75
1,4631,4694,9
2,3219,4135,18
3,1120,2983,12
4,3554,1527,16


In [127]:
# os.path.join yields a clean path whether or not output_folder ends in a slash.
order_items_df.to_csv(os.path.join(output_folder, 'order_items.csv'), index=False)

## Generate Ratings Dataset

In [128]:
# Draw 2000 IDs from 1..FK_LENGTH for each side. Sampling is with replacement,
# so np.unique leaves at most 2000 distinct (sorted) IDs per side.
customers_ids = np.random.choice(np.arange(1, FK_LENGTH + 1), 2000)
customers_ids = np.unique(customers_ids)

products_ids = np.random.choice(np.arange(1, FK_LENGTH + 1), 2000)
products_ids = np.unique(products_ids)

# Every (customer_id, product_id) combination of the sampled IDs; taking pairs
# from the Cartesian product guarantees that no pair occurs twice.
customers_products = list(itertools.product(customers_ids, products_ids))

# Shuffle the pairs in place.
np.random.shuffle(customers_products)

In [129]:
# Keep at most LENGTH of the shuffled pairs (no-op when fewer are available).
customers_products = customers_products[0:LENGTH]

In [130]:
# One rating (1..5, upper bound exclusive) per available pair. The count comes
# from the pairs themselves rather than LENGTH: when the sampled IDs yield
# fewer than LENGTH combinations, a LENGTH-sized array would not line up with
# cid/pid and the DataFrame constructor would raise.
ratings = np.random.randint(1, 6, len(customers_products))
# Split the (customer_id, product_id) pairs into two parallel lists.
cid = [x[0] for x in customers_products]
pid = [x[1] for x in customers_products]

In [131]:
# cid_mapper = {val: i+1 for i, val in enumerate(np.unique(cid))}
# pid_mapper = {val: i+1 for i, val in enumerate(np.unique(pid))}
# cid = [cid_mapper[customer_id] for customer_id in cid]
# pid = [pid_mapper[product_id] for product_id in pid]

In [132]:
# Sanity check: the distinct customer ids present in the ratings sample.
unique_customer_ids = np.unique(cid)
print(unique_customer_ids)

[   1    2    5 ... 4995 4997 4998]


In [133]:
# Assemble the ratings table from the parallel columns.
rating_columns = {
    'product_id': pid,
    'customer_id': cid,
    'rating': ratings,
}
ratings_df = pd.DataFrame(rating_columns)

ratings_df.head()

Unnamed: 0,product_id,customer_id,rating
0,2169,4856,5
1,1664,341,5
2,2822,899,5
3,3767,4539,3
4,4010,4816,3


In [134]:
# os.path.join yields a clean path whether or not output_folder ends in a slash.
ratings_df.to_csv(os.path.join(output_folder, 'ratings.csv'), index=False)

In [135]:
# Print the row count of every generated table, one per line.
for table in (ratings_df, order_df, order_items_df, products_df, suppliers_df, df):
    print(len(table))

2000000
2000000
2000000
2000000
2000000
2000000
