In [1]:
from faker import Faker
from faker.providers import internet
import pandas as pd
import numpy as np

# Seed every random source used in this notebook so that a fresh
# "Restart & Run All" regenerates the same fake dataset: Faker's shared
# generator, and NumPy's global state (which pandas' DataFrame.sample()
# draws from when no random_state is passed).
random_seed = 42
Faker.seed(random_seed)
np.random.seed(random_seed)

# Australian-locale generator with the internet provider registered
# (domain_name() and email() are used further down).
faker = Faker(['en_AU'])
faker.add_provider(internet)

In [2]:
# Single source of truth for the size of this hand-built branch table.
# Every column list and the id range are derived from it, so they cannot
# drift apart (pd.DataFrame raises when column lengths differ).
num_branches_old = 10

branches_cities = [faker.city() for _ in range(num_branches_old)]
branches_states = [faker.state() for _ in range(num_branches_old)]
branches_managers = [faker.name() for _ in range(num_branches_old)]
branches_phones = [faker.phone_number() for _ in range(num_branches_old)]
# Email local part is the manager's name with periods stripped and spaces
# turned into dots; the domain is a freshly generated one per manager.
branches_emails = [
    f"{branches_manager.replace('.','').replace(' ','.')}@{faker.domain_name()}"
    for branches_manager in branches_managers
]

# convert to dataframe
branches_df = pd.DataFrame({
    'branch_id': range(1, num_branches_old + 1),  # ids are 1-based
    'branch_city': branches_cities,
    'branch_state': branches_states,
    'branch_manager': branches_managers,
    'branch_phone': branches_phones,
    'branch_email': branches_emails
})
branches_df.head()

Unnamed: 0,branch_id,branch_city,branch_state,branch_manager,branch_phone,branch_email
0,1,Lake Scott,New South Wales,Robert Brown,(08).7061.7897,Robert.Brown@jordan.edu
1,2,Hooverfurt,South Australia,Manuel Thomas,6791 9327,Manuel.Thomas@gomez.edu
2,3,Riverashire,Tasmania,Jessica Ramirez,94669518,Jessica.Ramirez@travis.edu.au
3,4,New Tommy,Victoria,Amy Guerrero,+61-8-4976-1941,Amy.Guerrero@hill-ferguson.net.au
4,5,Martinton,Western Australia,Derek Rose,+61 461 239 340,Derek.Rose@gonzales.edu


In [3]:
# Demo: a column definition maps a column name to a zero-argument generator
# that is called each time a new value is needed.
columns_dict = dict(name=faker.name)
for _ in range(2):
    sample_name = columns_dict["name"]()
    print(sample_name)

Monica Mcbride
Karen Walker


In [4]:
# Column name -> zero-argument Faker generator used to fill that column.
branch_df_column_definition = dict(
    branch_name=faker.city,
    branch_address=faker.address,
    manager_name=faker.name,
    phone_number=faker.phone_number,
)

def fake_df_from_column_definition(column_definition:dict, num_rows:int = 10) -> pd.DataFrame:
    """Build a DataFrame of generated rows from a column definition.

    Args:
        column_definition: Mapping of column name to a zero-argument callable.
            Each callable is invoked once per row, in the mapping's order, to
            produce that row's value for the column.
        num_rows: Number of rows to generate (default 10).

    Returns:
        A DataFrame with one column per key of ``column_definition`` (in the
        mapping's order) and ``num_rows`` rows indexed 0..num_rows-1.
    """
    # Collect plain dict records and build the frame once. Growing a frame
    # with ``df.loc[i] = ...`` copies data on every insert, which is
    # quadratic and becomes very slow for large tables (e.g. 100_000 rows).
    records = [
        {column: generator() for column, generator in column_definition.items()}
        for _ in range(num_rows)
    ]
    # Passing ``columns`` keeps the expected header even when num_rows == 0.
    return pd.DataFrame(records, columns=list(column_definition))

In [5]:
# Generate the branch table (20 rows) from its column definition.
branch_df = fake_df_from_column_definition(
    column_definition=branch_df_column_definition,
    num_rows=20,
)

In [6]:
num_sales_reps = 100

# Each sales rep gets a generated name, a unique numeric id, and the name of
# a branch drawn at random from branch_df.
sales_rep_df_column_definition = dict(
    sales_rep_name=faker.name,
    sales_rep_id=lambda: faker.unique.random_int(min=100000, max=9999999),
    branch_name=lambda: branch_df.sample()["branch_name"].values[0],
)

sales_rep_df = fake_df_from_column_definition(
    column_definition=sales_rep_df_column_definition,
    num_rows=num_sales_reps,
)
sales_rep_df

Unnamed: 0,sales_rep_name,sales_rep_id,branch_name
0,Jeffrey Kennedy,7518239,Bensonhaven
1,Michelle Murillo,2921072,South Normaland
2,Kerry Kerr,8728349,Greenborough
3,Crystal Lucero,8973986,Evanshaven
4,Mrs. Brittany Walker,1372412,Bensonhaven
...,...,...,...
95,Derek Adams,6226897,Lake Billy
96,Monique Parks,3959447,West William
97,Karen Solomon,5975265,Nathanmouth
98,Gary Cruz,5910505,Christineland


In [7]:
num_product_categories = 10

# A category is a random word paired with a block of random text as its
# description.
products_category_df_column_definition = dict(
    product_category=faker.word,
    product_category_description=faker.text,
)

products_category_df = fake_df_from_column_definition(
    column_definition=products_category_df_column_definition,
    num_rows=num_product_categories,
)


In [8]:
# Display the generated product-category table as this cell's output.
products_category_df

Unnamed: 0,product_category,product_category_description
0,ipsum,Deserunt animi maiores quis ratione magnam min...
1,sunt,Nisi iure repudiandae vitae. Modi provident co...
2,cumque,Dolorum deserunt earum dignissimos voluptate m...
3,hic,Et esse quam vitae est. Pariatur aliquam vero ...
4,qui,Maxime tempore laborum quaerat qui numquam. Be...
5,necessitatibus,Quos doloremque perspiciatis cumque sequi cumq...
6,ullam,Voluptatem aliquam saepe voluptas. Et possimus...
7,dolore,Pariatur suscipit minima alias soluta. Accusan...
8,quia,Deleniti laborum veritatis assumenda id. Ut qu...
9,perferendis,Unde numquam eligendi in maxime quo.\nNon quod...


In [9]:
num_products = 100

# Products: a unique integer id, a positive price with two digits on each
# side of the decimal point, and an expiry date between one month and two
# years from now.
product_df_column_definition = dict(
    product_id=faker.unique.pyint,
    product_price=lambda: faker.pyfloat(left_digits=2, right_digits=2, positive=True),
    product_expiry_date=lambda: faker.date_between(start_date='+1m', end_date='+2y'),
)

product_df = fake_df_from_column_definition(
    column_definition=product_df_column_definition,
    num_rows=num_products,
)
product_df

Unnamed: 0,product_id,product_price,product_expiry_date
0,1964,38.96,2024-02-12
1,5305,34.64,2024-09-24
2,3470,55.60,2024-10-12
3,8876,55.87,2025-11-20
4,5468,61.44,2024-08-08
...,...,...,...
95,1215,74.43,2025-10-07
96,3945,65.13,2025-10-30
97,7566,32.86,2025-07-22
98,9462,17.28,2025-08-06


In [10]:
num_customers = 1000

# Customers: a unique integer id plus generated contact details.
customer_df_column_definition  = {
    "customer_id": faker.unique.pyint,
    "customer_name": faker.name,
    "customer_email": faker.email,
    "customer_phone_number": faker.phone_number,
}

customer_df = fake_df_from_column_definition(customer_df_column_definition, num_customers)

num_sales_data = 100000

# Extract the foreign-key columns once, outside the per-row generators.
# Calling DataFrame.sample() inside each lambda would sample a whole frame
# and re-select the column three times for every one of the num_sales_data
# rows; drawing from a pre-extracted array picks uniformly from the same
# values at a fraction of the cost.
sales_rep_ids = sales_rep_df["sales_rep_id"].to_numpy()
customer_ids = customer_df["customer_id"].to_numpy()
product_ids = product_df["product_id"].to_numpy()

# Each sale references an existing sales rep, customer and product by id,
# with a quantity of 1-10 and a sale date within the last year.
sales_data_df_column_definition  = {
    "sales_id": faker.unique.uuid4,
    "sales_rep_id": lambda: np.random.choice(sales_rep_ids),
    "customer_id": lambda: np.random.choice(customer_ids),
    "product_id": lambda: np.random.choice(product_ids),
    "quantity": lambda: faker.pyint(min_value=1, max_value=10),
    "date_of_sale": lambda: faker.date_between(start_date='-1y', end_date='today'),
}

sales_data_df = fake_df_from_column_definition(sales_data_df_column_definition, num_sales_data)

In [11]:
# Display the generated sales table as this cell's output (the rendered
# output below is elided with "..." rows).
sales_data_df

Unnamed: 0,sales_id,sales_rep_id,customer_id,product_id,quantity,date_of_sale
0,fb7353d8-8140-4680-9d8d-b45aa5a66d2a,7290278,6469,7724,3,2023-08-26
1,9fadfd49-3e1b-482c-9510-da413f349c95,3959447,4165,3903,1,2023-05-11
2,5c1c19d3-2c0e-4c52-ab30-6211d387515e,547362,9795,3804,8,2023-10-19
3,4a9f0664-e32c-4d86-9bf0-42fe80558516,9376631,275,3028,9,2023-06-08
4,585d74aa-e399-4ff9-99f7-20018096f5c5,4778864,2360,7416,4,2023-04-11
...,...,...,...,...,...,...
99995,b0b782ee-1543-4e60-a2c6-44b54e4a82f2,4562608,8880,8891,6,2023-05-03
99996,87f5b5d9-524e-4e4e-9a48-3c02c9b91458,9360609,3761,3331,4,2023-12-10
99997,ae80b082-feed-406a-84ef-e2a6fd10d800,2248252,4083,2405,5,2023-02-15
99998,398ee202-cded-43fb-85af-00762889930a,2221365,304,490,7,2023-08-27


In [12]:
# Write every generated table to its own CSV file, without the index column.
# Insertion order of the mapping is the order in which files are written.
csv_exports = {
    "branches_old.csv": branches_df,
    "branches.csv": branch_df,
    "sales_rep.csv": sales_rep_df,
    "products_category.csv": products_category_df,
    "product.csv": product_df,
    "customer.csv": customer_df,
    "sales_data.csv": sales_data_df,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)
