In [55]:
import numpy as np
import pandas as pd
import os

In [56]:
# Number of rows generated for every table; it is also embedded in the output folder name.
LENGTH = 100000
output_folder = f'data_{LENGTH}/'

# Seed NumPy's global RNG so that Restart Kernel -> Run All regenerates the same datasets.
# Every np.random.* call in this notebook draws from this global generator.
SEED = 42
np.random.seed(SEED)

In [57]:
# Create the output folder (and any missing parents); exist_ok=True makes the
# call a no-op when the folder is already there, so no separate existence check
# (and no check-then-create race) is needed.
os.makedirs(output_folder, exist_ok=True)

## Generate Customers Dataset

In [58]:
# Seed vocabularies that the generated tables sample from.
governorates_seed = pd.read_csv('Seeds/governments.csv')
governments_names = governorates_seed["governorate_name_en"].values  # raw column values, repeats kept

names_seed = pd.read_csv('Seeds/names.csv')
available_names = names_seed['name'].unique()  # distinct names only

In [59]:
# Customer primary keys: 1..LENGTH inclusive.
IDs = np.arange(1, LENGTH + 1)

# Every column is drawn independently and with replacement, so names repeat across customers.
names = np.random.choice(available_names, size=LENGTH)
governments = np.random.choice(governments_names, size=LENGTH)
ages = np.random.randint(low=18, high=81, size=LENGTH)  # 18..80; `high` is exclusive
gender = np.random.choice([0, 1], size=LENGTH)  # binary flag; the meaning of 0/1 is not defined in this notebook

In [60]:
# Assemble the customers table; keyword order fixes the column order.
df = pd.DataFrame(dict(
    ID=IDs,
    governorate=governments,
    age=ages,
    name=names,
    gender=gender,
))

df.head()

Unnamed: 0,ID,governorate,age,name,gender
0,1,Luxor,34,Nicholas Jacobson,1
1,2,Matrouh,61,Jasmine Barrera,0
2,3,Suez,56,Angela Taylor,0
3,4,New Valley,67,Suzanne Wallace,0
4,5,Luxor,52,Kevin Green,1


In [61]:
# Write the customers table. os.path.join avoids the doubled separator that plain
# concatenation produces (output_folder already ends with '/'); index=False keeps
# the pandas row index out of the CSV.
df.to_csv(os.path.join(output_folder, 'customers.csv'), index=False)

## Generate Suppliers Dataset

In [62]:
# Supplier primary keys: 1..LENGTH inclusive.
suppliers_id = np.arange(1, LENGTH + 1)

# Draw a base name per supplier, then append that supplier's ID so every name is unique.
suppliers_names = np.random.choice(available_names, LENGTH) + suppliers_id.astype(str)

In [63]:
# Two-column suppliers table: numeric ID plus the uniquified name.
suppliers_df = pd.DataFrame(dict(ID=suppliers_id, name=suppliers_names))

suppliers_df.head()

Unnamed: 0,ID,name
0,1,Todd Summers1
1,2,Michael Gomez2
2,3,Kimberly Odom3
3,4,Bernard Pennington4
4,5,Lisa Hunter5


In [64]:
# Write the suppliers table. The previous './suppliers.csv' suffix produced a
# 'data_N/./suppliers.csv' path; os.path.join builds the plain 'data_N/suppliers.csv'.
suppliers_df.to_csv(os.path.join(output_folder, 'suppliers.csv'), index=False)

## Generate Products Dataset

In [65]:
# Distinct product names available for sampling.
products_available_names = pd.read_csv('Seeds/products.csv')["product_name"].unique()

# Product primary keys: 1..LENGTH inclusive.
products_IDs = np.arange(1, LENGTH + 1)
products_names = np.random.choice(products_available_names, LENGTH)
products_prices = np.random.randint(100, 10001, LENGTH)     # 100..10000; upper bound is exclusive
products_categories = np.random.randint(1, 5001, LENGTH)    # category IDs 1..5000

# Supplier foreign keys. Suppliers are generated with IDs 1..LENGTH, so cap the
# upper bound at LENGTH: otherwise a run with LENGTH < 10000 would emit
# supplier_ids that reference no supplier. For LENGTH >= 10000 the range is the
# same 1..10000 as before.
max_supplier_id = min(10000, LENGTH)
cid = np.random.randint(1, max_supplier_id + 1, LENGTH)



In [66]:
# Assemble the products table; `cid` holds the supplier foreign keys drawn above.
products_df = pd.DataFrame(dict(
    ID=products_IDs,
    product_name=products_names,
    price=products_prices,
    category=products_categories,
    supplier_id=cid,
))

products_df.head()

Unnamed: 0,ID,product_name,price,category,supplier_id
0,1,Pinot Noir Rose,7944,3448,6538
1,2,Pet-Ritz Regular Pie Crusts,2790,2362,3849
2,3,No Sugar Added Apple Pie,8870,4602,2607
3,4,Soy & Dairy Free Plain Unsweetened Almond Milk...,5275,2912,1309
4,5,Protein Granola Apple Crisp,3207,2237,1166


In [67]:
# Shuffle the rows (sample(frac=1) returns every row in random order) and
# renumber the index so the CSV is not sorted by ID.
products_df = products_df.sample(frac=1).reset_index(drop=True)

# os.path.join avoids the doubled separator that concatenating '/products.csv'
# onto a folder name ending in '/' produces.
products_df.to_csv(os.path.join(output_folder, 'products.csv'), index=False)

## Generate Orders Dataset

In [68]:
# Order primary keys 1..LENGTH, each assigned to a random customer ID in 1..LENGTH.
order_ids = np.arange(1, LENGTH + 1)
customer_ids = np.random.randint(1, LENGTH + 1, size=LENGTH)

# Every calendar day from 2010-01-01 through 2021-01-01, formatted like 2020-01-01.
available_dates = (
    pd.date_range(start='1/1/2010', end='1/1/2021')
    .strftime('%Y-%m-%d')
    .tolist()
)
orders_dates = np.random.choice(available_dates, size=LENGTH)
orders_time = np.random.randint(0, 24, size=LENGTH)  # hour of day, 0..23

In [69]:
# Assemble the orders table: key, owning customer, order date and hour of day.
order_df = pd.DataFrame(dict(
    ID=order_ids,
    customer_id=customer_ids,
    date=orders_dates,
    hour=orders_time,
))

order_df.head(10)

Unnamed: 0,ID,customer_id,date,hour
0,1,65663,2010-04-14,12
1,2,23651,2017-06-18,0
2,3,20710,2015-10-24,4
3,4,38809,2013-04-13,13
4,5,42997,2016-04-09,21
5,6,60572,2017-04-16,0
6,7,64032,2011-05-31,9
7,8,8799,2019-10-24,20
8,9,28909,2020-01-12,11
9,10,45055,2015-06-13,8


In [70]:
# Shuffle the rows (sample(frac=1) returns every row in random order) and
# renumber the index so the CSV is not sorted by ID.
order_df = order_df.sample(frac=1).reset_index(drop=True)

# os.path.join avoids the doubled separator that concatenating '/orders.csv'
# onto a folder name ending in '/' produces.
order_df.to_csv(os.path.join(output_folder, 'orders.csv'), index=False)

In [71]:
# Each order item pairs a randomly chosen order ID with a randomly chosen product ID.
# NOTE(review): the draws are independent, so the same (order_id, product_id) pair
# can appear more than once — confirm that is acceptable for the target schema.
oid = np.random.choice(order_ids, size=LENGTH)
pid = np.random.choice(products_IDs, size=LENGTH)
quantity = np.random.randint(low=1, high=100, size=LENGTH)  # 1..99; `high` is exclusive

In [72]:
# Assemble the order-items table linking orders to products with a quantity.
order_items_df = pd.DataFrame(dict(
    order_id=oid,
    product_id=pid,
    quantity=quantity,
))

order_items_df.head()

Unnamed: 0,order_id,product_id,quantity
0,40940,55315,53
1,69606,23797,57
2,19227,16220,15
3,49716,79854,30
4,88689,98635,19


In [73]:
# Write the order-items table. os.path.join avoids the doubled separator that
# plain concatenation produces (output_folder already ends with '/').
order_items_df.to_csv(os.path.join(output_folder, 'order_items.csv'), index=False)

## Generate Ratings Dataset

In [74]:
import itertools

# Number of IDs drawn (with replacement) for the customer pool and for the product pool.
RATING_POOL_SIZE = 2000

# np.unique removes the repeats that drawing with replacement produces (and sorts the IDs).
customers_ids = np.unique(np.random.choice(np.arange(1, LENGTH + 1), RATING_POOL_SIZE))
products_ids = np.unique(np.random.choice(np.arange(1, LENGTH + 1), RATING_POOL_SIZE))

# Sample LENGTH distinct (customer, product) pairs without building the full
# Cartesian product: shuffling every pair as Python tuples costs far more time
# and memory than the LENGTH pairs that are actually kept.
n_products = len(products_ids)
n_pairs = len(customers_ids) * n_products
if n_pairs < LENGTH:
    # Fail here with a clear message instead of a length mismatch when the
    # ratings table is assembled.
    raise ValueError(
        f'only {n_pairs} distinct (customer, product) pairs are available, '
        f'but {LENGTH} ratings were requested')

# Flat index i encodes the pair (i // n_products, i % n_products); drawing the
# indices without replacement guarantees every pair is unique, in random order.
flat_pair_idx = np.random.choice(n_pairs, size=LENGTH, replace=False)
customer_pos, product_pos = np.divmod(flat_pair_idx, n_products)

# List of (customer_id, product_id) tuples, the shape later cells index into.
customers_products = list(zip(customers_ids[customer_pos].tolist(),
                              products_ids[product_pos].tolist()))

In [75]:
# Keep at most LENGTH pairs so the pair list lines up with the LENGTH ratings drawn next.
customers_products = customers_products[:LENGTH]

In [76]:
# One rating in 1..5 per pair (`high` is exclusive).
ratings = np.random.randint(low=1, high=6, size=LENGTH)

# Split the (customer, product) pairs into two parallel ID lists.
# NOTE: this rebinds `cid` and `pid`, which earlier cells used for supplier IDs
# and order-item product IDs respectively.
cid = [customer for customer, _ in customers_products]
pid = [product for _, product in customers_products]


In [77]:
# Assemble the ratings table: one row per (product, customer) pair with its rating.
ratings_df = pd.DataFrame(dict(
    product_id=pid,
    customer_id=cid,
    rating=ratings,
))

ratings_df.head()

Unnamed: 0,product_id,customer_id,rating
0,52463,10469,2
1,37705,36574,5
2,69988,61572,1
3,3200,78961,3
4,64164,15665,3


In [78]:
# Write the ratings table. os.path.join avoids the doubled separator that plain
# concatenation produces (output_folder already ends with '/').
ratings_df.to_csv(os.path.join(output_folder, 'ratings.csv'), index=False)