In [2]:
import itertools

import numpy as np
import pandas as pd

In [3]:
# Number of rows generated for each of the large tables (customers, products,
# orders, order items, ratings).
LENGTH = 1000000

# Seed NumPy's global generator so that every np.random.* call in this notebook
# (and DataFrame.sample, which falls back to that generator when no
# random_state is passed) produces the same data on a fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)

## Generate Customers Dataset

In [9]:
# Load the governorate seed file and keep only the English-name column as a
# NumPy array; it is the sampling pool for each customer's governorate.
governorates_seed = pd.read_csv('Seeds/governments.csv')
governments_names = governorates_seed["governorate_name_en"].values

In [7]:
# Pool of distinct person names to sample from (the suppliers section reuses it).
available_names = pd.read_csv('Seeds/names.csv')['name'].unique()

# Sequential primary keys 1..LENGTH; no randomness involved.
IDs = np.arange(1, LENGTH + 1)

# One independent draw per customer for every attribute. np.random.choice
# samples with replacement by default, so names and governorates repeat.
# The random calls stay in their original order so that a seeded run consumes
# the random stream identically.
names = np.random.choice(available_names, size=LENGTH)
governments = np.random.choice(governments_names, size=LENGTH)
ages = np.random.randint(low=18, high=81, size=LENGTH)  # high is exclusive -> ages 18..80
gender = np.random.choice([0, 1], size=LENGTH)  # 0/1 flag; the meaning of each value is not defined here

In [23]:
# Assemble the customers table; the dict's insertion order fixes the column order.
customer_columns = {
    'ID': IDs,
    'governorate': governments,
    'age': ages,
    'name': names,
    'gender': gender,
}
df = pd.DataFrame(customer_columns)

# Preview the first five rows.
df.head()

Unnamed: 0,ID,governorate,age,name,gender
0,1,Giza,52,Todd Oneal,0
1,2,Sohag,74,Mrs. Courtney Martin,0
2,3,Fayoum,57,John Stone,1
3,4,Qaliubiya,30,Jennifer Farrell,1
4,5,Matrouh,60,Daniel Hall,1


In [25]:
# Persist the customers table; index=False keeps the positional index out of the CSV.
df.to_csv(path_or_buf='Data/customer.csv', index=False)

## Generate Suppliers Dataset

In [91]:
# Supplier keys run 1..10000. Each supplier gets a name drawn (with
# replacement) from the same name pool that the customers section uses.
suppliers_id = np.arange(1, 10001)
suppliers_names = np.random.choice(available_names, size=len(suppliers_id))

In [92]:
# Two-column suppliers table: numeric key plus display name.
supplier_columns = {
    'ID': suppliers_id,
    'name': suppliers_names,
}
suppliers_df = pd.DataFrame(supplier_columns)

# Preview the first five rows.
suppliers_df.head()

Unnamed: 0,ID,name
0,1,Ann Snyder
1,2,Mr. Andrew Mcdonald III
2,3,Shannon Cook
3,4,Karen Santiago
4,5,Megan Krueger


In [93]:
# Persist the suppliers table; index=False keeps the positional index out of the CSV.
suppliers_df.to_csv(path_or_buf='Data/suppliers.csv', index=False)

## Generate Products Dataset

In [94]:
# Distinct product names used as the sampling pool.
# NOTE(review): this reads 'Data/products.csv', the same path the shuffled
# products table is written to further down. After the first run the pool is
# therefore derived from previously generated output rather than from a
# pristine seed (the other seed files live under 'Seeds/'). Confirm whether the
# product-name seed should be kept in a separate file.
products_available_names = pd.read_csv('Data/products.csv')["product_name"].unique()

# Sequential primary keys 1..LENGTH.
products_IDs = np.arange(1, LENGTH + 1)

# Independent random attributes per product (randint's upper bound is exclusive).
products_names = np.random.choice(products_available_names, LENGTH)
products_prices = np.random.randint(100, 10001, LENGTH)   # 100..10000
products_categories = np.random.randint(1, 5001, LENGTH)  # category ids 1..5000

# Draw supplier foreign keys from the supplier IDs that were actually
# generated, instead of repeating the literal range 1..10000. Every product
# then references an existing supplier even if the supplier count changes,
# and this mirrors how order items sample from order_ids / products_IDs.
cid = np.random.choice(suppliers_id, LENGTH)



In [95]:
# Assemble the products table; `cid` holds the supplier id chosen for each product.
product_columns = {
    'ID': products_IDs,
    'product_name': products_names,
    'price': products_prices,
    'category': products_categories,
    'supplier_id': cid,
}
products_df = pd.DataFrame(product_columns)

# Preview the first five rows.
products_df.head()

Unnamed: 0,ID,product_name,price,category,supplier_id
0,1,Nutri-Grain Crunch Strawberry Parfait Breakfas...,1934,4288,6779
1,2,Thermal Creations Heat Tamer Leave-In Spray,7999,2326,6239
2,3,Frozen Greek Yogurt Bars With Toffee & Caramel...,5368,1513,2505
3,4,All Natural Cheese Deli Slices - Swiss/Colby/C...,5907,427,3056
4,5,Light White Rice Loaf,7642,4662,8544


In [96]:
# Randomise the row order: sampling with frac=1 returns every row exactly once
# in random order, and the index is then renumbered from 0.
products_df = products_df.sample(frac=1)
products_df = products_df.reset_index(drop=True)

# NOTE(review): this overwrites 'Data/products.csv', which an earlier cell
# reads as the product-name pool.
products_df.to_csv(path_or_buf='Data/products.csv', index=False)

## Generate Orders Dataset

In [76]:
# Sequential order keys; each order is assigned a random customer id in 1..LENGTH.
order_ids = np.arange(1, LENGTH + 1)
customer_ids = np.random.randint(low=1, high=LENGTH + 1, size=LENGTH)

# Every calendar day from 2010-01-01 through 2021-01-01 (both ends included),
# rendered as 'YYYY-MM-DD' strings, e.g. 2020-01-01.
available_dates = (
    pd.date_range(start='1/1/2010', end='1/1/2021')
    .strftime('%Y-%m-%d')
    .tolist()
)

# Random order date and hour of day (0..23) for every order.
orders_dates = np.random.choice(available_dates, size=LENGTH)
orders_time = np.random.randint(low=0, high=24, size=LENGTH)

In [79]:
# Assemble the orders table: key, owning customer, order date and hour.
order_columns = {
    'ID': order_ids,
    'customer_id': customer_ids,
    'date': orders_dates,
    'hour': orders_time,
}
order_df = pd.DataFrame(order_columns)

# Preview the first ten rows.
order_df.head(10)

Unnamed: 0,ID,customer_id,date,hour
0,1,452894,2017-09-04,6
1,2,704430,2017-03-22,14
2,3,429175,2013-07-23,1
3,4,470052,2018-04-07,9
4,5,679263,2016-02-20,1
5,6,66271,2020-06-19,17
6,7,349714,2014-12-16,9
7,8,810783,2012-09-12,18
8,9,200602,2019-05-13,19
9,10,406556,2012-05-23,9


In [82]:
# Randomise the row order: sampling with frac=1 returns every row exactly once
# in random order, and the index is then renumbered from 0.
order_df = order_df.sample(frac=1)
order_df = order_df.reset_index(drop=True)

# Persist the shuffled orders; index=False keeps the positional index out of the CSV.
order_df.to_csv(path_or_buf='Data/orders.csv', index=False)

In [84]:
# Order items: each row links a randomly chosen order id to a randomly chosen
# product id. The two draws are independent and with replacement, so the same
# (order, product) pair can occur more than once.
oid = np.random.choice(order_ids, size=LENGTH)
pid = np.random.choice(products_IDs, size=LENGTH)

# NOTE(review): randint's upper bound is exclusive, so quantities are 1..99.
# Other ranges in this notebook use N + 1 to include N; confirm whether 100
# was meant to be reachable.
quantity = np.random.randint(low=1, high=100, size=LENGTH)

In [86]:
# Assemble the order-items table: order reference, product reference, quantity.
order_item_columns = {
    'order_id': oid,
    'product_id': pid,
    'quantity': quantity,
}
order_items_df = pd.DataFrame(order_item_columns)

# Preview the first five rows.
order_items_df.head()

Unnamed: 0,order_id,product_id,quantity
0,361298,668799,10
1,148982,766232,46
2,439528,936138,84
3,701935,131536,37
4,718555,746586,69


In [87]:
# Persist the order items; index=False keeps the positional index out of the CSV.
order_items_df.to_csv(path_or_buf='Data/order_items.csv', index=False)

## Generate Ratings Dataset

In [16]:
# Number of ids drawn (with replacement) for each side of the rating pairs.
# Duplicates are removed afterwards, so slightly fewer may remain per side.
RATING_POOL_SIZE = 2000

# Sorted, de-duplicated pools of customer ids and product ids that take part
# in ratings.
customers_ids = np.unique(np.random.choice(np.arange(1, LENGTH + 1), RATING_POOL_SIZE))
products_ids = np.unique(np.random.choice(np.arange(1, LENGTH + 1), RATING_POOL_SIZE))

# Every (customer, product) combination corresponds to exactly one flat index
#   k = customer_position * len(products_ids) + product_position
# in the range 0 .. n_pairs - 1.
n_pairs = customers_ids.size * products_ids.size

# The ratings table needs LENGTH distinct pairs. Fail here with a clear message
# instead of with a column-length mismatch when the DataFrame is built later.
assert n_pairs >= LENGTH, (
    "only %d distinct (customer, product) pairs available but LENGTH=%d; "
    "increase RATING_POOL_SIZE" % (n_pairs, LENGTH)
)

# Sampling flat indices without replacement yields LENGTH distinct pairs in
# random order. This is equivalent in distribution to building the full
# cartesian product, shuffling it and keeping the first LENGTH entries, but it
# avoids materialising and shuffling millions of Python tuples.
pair_idx = np.random.choice(n_pairs, size=LENGTH, replace=False)
customer_pos, product_pos = np.divmod(pair_idx, products_ids.size)

# Keep the list-of-(customer_id, product_id)-tuples shape that later cells
# slice and unpack.
customers_products = list(zip(customers_ids[customer_pos], products_ids[product_pos]))

In [17]:
# Keep at most the first LENGTH (customer_id, product_id) pairs so the pair
# list lines up with the LENGTH ratings generated in the next cell.
customers_products = customers_products[:LENGTH]

In [18]:
# One star rating per row; randint's upper bound is exclusive -> values 1..5.
ratings = np.random.randint(low=1, high=6, size=LENGTH)

# Split the (customer_id, product_id) tuples into two parallel lists.
# Note: `cid` and `pid` were used earlier in the notebook for supplier ids and
# order-item product ids; they are rebound here.
cid = [customer for customer, _ in customers_products]
pid = [product for _, product in customers_products]


In [20]:
# Assemble the ratings table: rated product, rating customer, and the score.
rating_columns = {
    'product_id': pid,
    'customer_id': cid,
    'rating': ratings,
}
ratings_df = pd.DataFrame(rating_columns)

# Preview the first five rows.
ratings_df.head()

Unnamed: 0,product_id,customer_id,rating
0,981665,610090,4
1,980389,163175,4
2,98275,114595,2
3,432819,5618,3
4,189251,534728,2


In [90]:
# Persist the ratings table; index=False keeps the positional index out of the CSV.
ratings_df.to_csv(path_or_buf='Data/ratings.csv', index=False)