In [1]:
import pandas as pd
import numpy as np

In [2]:
from faker import Faker
import random
import datetime
import os 

In [3]:
data_dir = os.path.join("..", "Datasets", "db")
customer_t = pd.read_csv(os.path.join(data_dir, "Customers.csv"))
location_t = pd.read_csv(os.path.join(data_dir, "Locations.csv"))
order_detail_t = pd.read_csv(os.path.join(data_dir, "Order_Details.csv"))
order_t = pd.read_csv(os.path.join(data_dir, "Orders.csv"))
product_t = pd.read_csv(os.path.join(data_dir, "Products.csv"))

fake = Faker()

## `1) Manual Generation`

### 1- customers

add column:
- customer email
- customer password (prefer encrypted)
- customer username

add new data entries (custom way):
- id 
- customer segment

In [4]:
customer_t.head()

Unnamed: 0,customer_id,customer_name,segment
0,CG-12520,Claire Gute,Consumer
1,DV-13045,Darrin Van Huff,Corporate
2,SO-20335,Sean O'Donnell,Consumer
3,BH-11710,Brosina Hoffman,Consumer
4,AA-10480,Andrew Allen,Consumer


**`Data Enrichment`**

In [5]:
emails = []
passwords = []
usernames = []
num_unique = customer_t.shape[0]

for i in range(num_unique):
    emails.append(fake.email())
    passwords.append(fake.password())
    usernames.append(fake.user_name())

emails = list(set(emails))
passwords = list(set(passwords))

while len(emails) < num_unique:
    emails.append(fake.email())
    emails = list(set(emails))
while len(passwords) < num_unique:
    passwords.append(fake.password())
    passwords = list(set(passwords))

customer_t["email"] = emails
customer_t["password"] = passwords
customer_t["username"] = usernames


print("number of supposed unique values: {}".format(customer_t.shape[0]))
print("number of unique emails: {}".format(len(emails)))
print("number of unique passwords: {}".format(len(passwords)))
print("number of unique user names: {}".format(len(usernames)))
display(customer_t.head(2))

number of supposed unique values: 793
number of unique emails: 793
number of unique passwords: 793
number of unique user names: 793


Unnamed: 0,customer_id,customer_name,segment,email,password,username
0,CG-12520,Claire Gute,Consumer,hufferic@example.org,H0DTF^lN!m,stacey06
1,DV-13045,Darrin Van Huff,Corporate,ijordan@example.net,ebl_D1k&%8,nicholasmorrison


**`data generation`**

In [6]:
# customer_id
c_code_pt_1 = 'RS'
rows_c_code_pt_1 = customer_t[customer_t["customer_id"].str.contains('RS', na=False)]
rows_c_code_pt_1_ids = rows_c_code_pt_1["customer_id"].to_list()
c_code_pt_2 = '{:05}'.format(random.randrange(1, 10**5))  # 5-digit number
new_c_code = c_code_pt_1 + "-" + c_code_pt_2
while new_c_code in rows_c_code_pt_1_ids:
    c_code_pt_2 = '{:05}'.format(random.randrange(1, 10**5))
    new_c_code = c_code_pt_1 + "-" + c_code_pt_2
print("new customer ID: {}".format(new_c_code))


# name
new_c_name = fake.name()
print('new customer name: {}'.format(new_c_name))


# customer segment
n_items = 10
customer_seg_dist = round(customer_t["segment"].value_counts() / customer_t["segment"].shape[0] * n_items)
segment_pool = []
for r, r_id  in list(zip(customer_seg_dist, customer_seg_dist.index)):
    for occurance in range(int(r)):
        segment_pool.append(r_id)
selected_index = random.randrange(0, len(segment_pool))
new_c_segment = segment_pool[selected_index]
print("segment pool item -> {}".format(new_c_segment))


# email, password, username
print("email: {} / password: {} / username: {}".format(fake.email(), fake.password(), fake.user_name()))


new customer ID: RS-83639
new customer name: Maria Mullins
segment pool item -> Corporate
email: newmanjerry@example.net / password: $hkFO6Qsn3 / username: ryanbutler


### 2- locations

add new data entries (custom way):
- country 
- city
- state
- postal_code


default values:
- region ("Unknown")
- location_id (max_id + 1)


In [7]:
location_t.head()

Unnamed: 0,country,city,state,postal_code,region,location_id
0,United States,Henderson,Kentucky,42420,South,0
1,United States,Los Angeles,California,90036,West,1
2,United States,Fort Lauderdale,Florida,33311,South,2
3,United States,Los Angeles,California,90032,West,3
4,United States,Concord,North Carolina,28027,South,4


**`data generation`**

In [8]:
from geopy.geocoders import Nominatim

In [9]:
# - processing random location
geolocator = Nominatim(user_agent="geoapiExercises")
Latitude = str(fake.latitude())
Longitude = str(fake.longitude())
lat, lng = fake.local_latlng()[0:2]
location = geolocator.reverse(lat+","+lng)
address = location.raw['address']

# city, state, country, zipcode
new_city = address.get('city', '')
new_state = address.get('state', '')
new_country = address.get('country', '')
new_zipcode = address.get('postcode')

# region
state_region_info = location_t.groupby(["state", "region"]).count()[["country"]].\
                                    rename(columns={"country": "count"}).reset_index()
def get_region(state_i):
    new_state_region_matches = state_region_info[state_region_info["state"]==state_i]
    if new_state_region_matches.shape[0] > 0:
        new_region = new_state_region_matches.sample(1).iloc[0]["region"]
    else:
        new_region = "Unknown"
    return new_region

# location id
fil_1 = location_t["country"] == new_country
fil_2 = location_t["state"] == new_state
fil_3 = location_t["postal_code"] == new_zipcode
fil_4 = location_t["city"] == new_city
location_match_num = location_t[fil_1 & fil_2 & fil_3 & fil_4].shape[0]
print("found {} matches in DB".format(location_match_num))
if location_match_num == 0:
    new_location_id = location_t["location_id"].max() + 1
    new_region = get_region(new_state)    
    new_location = (new_country, new_city, new_state, new_zipcode, new_region, new_location_id)
    print("new location entry - ID {}: {}".format(new_location_id, new_location))
else:
    new_location_id = None
    print("no new location: {}".format(new_location))


found 0 matches in DB
new location entry - ID 632: ('United States', '', 'Florida', '33573', 'South', 632)


---

### 3- orders

add new data entries (custom way):
- order_id
- ship mode
- order_date
- ship_date
- customer_id

In [10]:
order_t.head()

Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_id
0,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520
1,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045
2,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335
3,CA-2014-115812,2014-06-09,2014-06-14,Standard Class,BH-11710
4,CA-2017-114412,2017-04-15,2017-04-20,Standard Class,AA-10480


**`data generation`**

In [11]:
import datetime

In [12]:
# ship mode + order_date + ship_date
ship_modes = order_t["ship_mode"].unique()
new_ship_mode = ship_modes[random.randrange(0, len(ship_modes))]
order_date = fake.date_between(start_date='-10y', end_date='today')
if new_ship_mode == "Same Day":
    ship_date = order_date 
elif new_ship_mode == "First Class":
    ship_date = order_date + datetime.timedelta(days=random.randrange(1, 6))
elif new_ship_mode == "Second Class":
    ship_date = order_date + datetime.timedelta(days=random.randrange(7, 15))
else:
    ship_date = order_date + datetime.timedelta(days=random.randrange(16, 30))

    
# order_id
order_id_pt_1_l = list(order_t["order_id"].apply(lambda x: x.split("-")[0]).unique())
order_id_pt_2_l = list(order_t["order_id"].apply(lambda x: x.split("-")[1]).unique())
order_id_pt_3_l = list(order_t["order_id"].apply(lambda x: x.split("-")[2]).unique())
selected_index_pt1 = random.randrange(0, len(order_id_pt_1_l))
new_pt1 = order_id_pt_1_l[selected_index_pt1] # new_order_id_pt1, 
new_pt2 = str(order_date.year) # new_order_id_pt2 
random_six_digit = '{:06}'.format(random.randrange(1, 10**6))
while random_six_digit in order_id_pt_3_l:
    print('repeat')
    random_six_digit = '{:06}'.format(random.randrange(1, 10**6))
new_pt3 = random_six_digit # new_order_id_pt3
new_order_id = new_pt1 + "-" + new_pt2 + "-" + new_pt3  # new_order_id
    

# customer_id (FK)
customer_id_ind = random.randrange(0, customer_t['customer_id'].shape[0])
customer_id_selected = customer_t['customer_id'][customer_id_ind]


print("order_id: {}".format(new_order_id))
print("order_date: {} / ship_date: {} / ship_mode: {}".
                                            format(order_date, ship_date, new_ship_mode))
print("customer_id: {}".format(customer_id_selected))

order_id: CA-2018-786698
order_date: 2018-12-20 / ship_date: 2018-12-20 / ship_mode: Same Day
customer_id: GR-14560


### 4- products

add column:
- product price

**`data enrichment`**

In [13]:
order_detail_tccc = order_detail_t.copy() 
order_detail_tccc["price"] = order_detail_tccc["sales"] / order_detail_tccc["quantity"]
product_t["product_price"] = order_detail_tccc.groupby(["product_id"]).mean()["price"]
product_t.head()

  product_t["product_price"] = order_detail_tccc.groupby(["product_id"]).mean()["price"]


Unnamed: 0,product_code,product_name,category,sub_category,product_id,product_price
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,Furniture,Bookcases,0,103.14675
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs,1,209.416167
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,Office Supplies,Labels,2,6.683429
3,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,Furniture,Tables,3,256.804875
4,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,Office Supplies,Storage,4,12.3024


### 5- order details 

add new data entries (custom way):
- order_id, product_id, location_id
- sales, quantity, discount, profit
- order_detail_id


In [14]:
order_detail_t.head()

Unnamed: 0,order_detail_id,order_id,product_id,location_id,sales,quantity,discount,profit
0,1,CA-2016-152156,0,0,261.96,2,0.0,41.9136
1,2,CA-2016-152156,1,0,731.94,3,0.0,219.582
2,3,CA-2016-138688,2,1,14.62,2,0.0,6.8714
3,4,US-2015-108966,3,2,957.5775,5,0.45,-383.031
4,5,US-2015-108966,4,2,22.368,2,0.2,2.5164


**`data generation`**

In [15]:
# location_id, order_id, product_id
location_id_selected = new_location_id 
order_id_selected = new_order_id
product_id_selected = product_t["product_id"][random.randrange(0, product_t.shape[0])]
print("location_id: {} / order_id: {} / product_id: {}".
                  format(location_id_selected, order_id_selected, product_id_selected))


# quantity
n_items = 1000
quantity_dist = round(order_detail_t["quantity"].value_counts() / order_detail_t["quantity"].shape[0] * n_items)
col_pool = []   # make pool of choices based on probabilites
for r, r_id  in list(zip(quantity_dist, quantity_dist.index)):
    for occurance in range(int(r)):
        col_pool.append(r_id)
# select random index from pool
selected_index = random.randrange(0, len(col_pool))
new_quantity = col_pool[selected_index]
print("quantity: {}".format(new_quantity))


# sales, dicount, profit
sales_change_l = [-1000, -500, -250, -100, -50, -25, -10, -5, 1, 2, 4, 5, 8, 10, 15, 25, 37, 50, 75, 100, 150, 200, 300, 500, 1000]
new_expected_sales = float((product_t[product_t["product_id"]==product_id_selected]["product_price"] * new_quantity))
new_baseline_sales = new_expected_sales + random.choice(sales_change_l) * new_quantity
new_discount = round(1 - min(new_baseline_sales/new_expected_sales, 1), 2)
new_profit = new_baseline_sales - new_expected_sales
print("sales: {} / discount: {} / profit: {}".
                                format(new_baseline_sales, new_discount, new_profit))


# order_detail_id
new_order_detail_id = order_detail_t["order_detail_id"].max() + 1
print("order_detail_id: {}".format(new_order_detail_id))


location_id: 632 / order_id: CA-2018-786698 / product_id: 1364
quantity: 2
sales: 248.715 / discount: 0 / profit: 2.0
order_detail_id: 9995


## `2) Save Enriched Data`

In [16]:
display(product_t.head(2))
display(customer_t.head(2))

Unnamed: 0,product_code,product_name,category,sub_category,product_id,product_price
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,Furniture,Bookcases,0,103.14675
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",Furniture,Chairs,1,209.416167


Unnamed: 0,customer_id,customer_name,segment,email,password,username
0,CG-12520,Claire Gute,Consumer,hufferic@example.org,H0DTF^lN!m,stacey06
1,DV-13045,Darrin Van Huff,Corporate,ijordan@example.net,ebl_D1k&%8,nicholasmorrison


In [17]:
# SAVE
customer_t.to_csv("Datasets/db/Customers.csv", index=False)
product_t.to_csv("Datasets/db/Products.csv", index=False)

OSError: Cannot save file into a non-existent directory: 'Datasets/db'

## `3) Functional Generation`

In [None]:
customer_t = pd.read_csv("Datasets/db/Customers.csv")
location_t = pd.read_csv("Datasets/db/Locations.csv")
order_detail_t = pd.read_csv("Datasets/db/Order_Details.csv")
order_t = pd.read_csv("Datasets/db/Orders.csv")
product_t = pd.read_csv("Datasets/db/Products.csv")

### customer 

In [None]:
customer_t = pd.read_csv("Datasets/db/Customers.csv")

def generate_email():
    break_num = 100
    email = fake.email()
    i = 0 
    while email in customer_t["email"].to_list():
        email = fake.email()
        if i == break_num:
            break   
    return email

def generate_password():
    break_num = 100
    password = fake.password()
    i = 0 
    while password in customer_t["password"].to_list():
        password = fake.password()
        if i == break_num:
            break
    return password
            
def generate_username():
    break_num = 100
    username = fake.user_name()
    i = 0 
    while username in customer_t["username"].to_list():
        username = fake.user_name()
        if i == break_num:
            break
    return username

def generate_customer_id():
    c_code_pt_1 = 'RS'
    rows_c_code_pt_1 = customer_t[customer_t["customer_id"].str.contains('RS', na=False)]
    rows_c_code_pt_1_ids = rows_c_code_pt_1["customer_id"].to_list()

    c_code_pt_2 = '{:05}'.format(random.randrange(1, 10**5))  # 5-digit number

    new_c_code = c_code_pt_1 + "-" + c_code_pt_2

    while new_c_code in rows_c_code_pt_1_ids:
        c_code_pt_2 = '{:05}'.format(random.randrange(1, 10**5))
        new_c_code = c_code_pt_1 + "-" + c_code_pt_2
    return new_c_code

def generate_customer_name():
    return fake.name()

def generate_customer_segment():
    n_items = 10
    x = round(customer_t["segment"].value_counts() / customer_t["segment"].shape[0] * n_items)

    # make pool of choices based on probabilites
    segment_pool = []
    for r, r_id  in list(zip(x, x.index)):
        for occurance in range(int(r)):
            segment_pool.append(r_id)

    # select random index from pool
    selected_index = random.randrange(0, len(segment_pool))
    new_customer_segment = segment_pool[selected_index]
    return new_customer_segment

# main
def register_customer_TX():
    email = generate_email()
    password = generate_password()
    username = generate_username()

    customer_id = generate_customer_id()
    customer_name = generate_customer_name()
    customer_segment = generate_customer_segment()
    return email, password, username, customer_id, customer_name, customer_segment

In [None]:
register_customer_TX()

### locations

In [None]:
state_region_info = location_t.groupby(["state", "region"]).count()[["country"]].\
                                    rename(columns={"country": "count"}).reset_index()
def get_rand_location_data():
    geolocator = Nominatim(user_agent="geoapiExercises")
    lat, lng = fake.local_latlng()[0:2]
    location = geolocator.reverse(lat+","+lng)
    address = location.raw['address']
    
    city = address.get('city', '')
    state = address.get('state', '')
    country = address.get('country', '')
    postal_code = address.get('postcode')
    new_state_region_matches = state_region_info[state_region_info["state"]==state]
    if new_state_region_matches.shape[0] > 0:
        region = new_state_region_matches.sample(1).iloc[0]["region"]
    else:
        region = "Unknown"
    return city, state, country, postal_code, region

def verify_as_new_location(country, state, zipcode, city):
    fil_1 = location_t["country"] == country
    fil_2 = location_t["state"] == state
    fil_3 = location_t["postal_code"] == zipcode
    fil_4 = location_t["city"] == city
    location_match_num = location_t[fil_1 & fil_2 & fil_3 & fil_4].shape[0]
    if location_match_num == 0:
        return True
    return False

def generate_location_id():
    return location_t["location_id"].max() + 1

# main
def add_location_TX():
    break_num = 100
    
    # get location data
    city, state, country, postal_code, region = get_rand_location_data()
    
    # verify new location
    new_location_verifed= verify_as_new_location(city, state, country, postal_code)
    i = 0
    while not new_location_verifed:
        city, state, country, postal_code = get_rand_location_data()
        new_location_verifed = verify_as_new_location(city, state, country, postal_code)
        if i == break_num:
            break
    
    # generate location ID
    location_id = generate_location_id()

    return country, city, state, postal_code, region, location_id

In [18]:
add_location_TX()

NameError: name 'add_location_TX' is not defined

### orders + order details

In [19]:
import datetime

In [20]:
def make_order_date():
    return fake.date_between(start_date='-10y', end_date='today')

def generate_ship_info(order_date):
    ship_modes = order_t["ship_mode"].unique()
    new_ship_mode = ship_modes[random.randrange(0, len(ship_modes))]

    if new_ship_mode == "Same Day":
        ship_date = order_date 
    elif new_ship_mode == "First Class":
        ship_date = order_date +  datetime.timedelta(days=random.randrange(1, 6))
    elif new_ship_mode == "Second Class":
        ship_date = order_date +  datetime.timedelta(days=random.randrange(7, 15))
    else:
        ship_date = order_date + datetime.timedelta(days=random.randrange(16, 30))

    return new_ship_mode, ship_date

def generate_order_id(date):
    # order_id
    order_id_pt_1_l = list(order_t["order_id"].apply(lambda x: x.split("-")[0]).unique())
    order_id_pt_2_l = list(order_t["order_id"].apply(lambda x: x.split("-")[1]).unique())
    order_id_pt_3_l = list(order_t["order_id"].apply(lambda x: x.split("-")[2]).unique())

    selected_index_pt1 = random.randrange(0, len(order_id_pt_1_l))
    new_pt1 = order_id_pt_1_l[selected_index_pt1]

    new_pt2 = str(date.year)

    random_six_digit = '{:06}'.format(random.randrange(1, 10**6))
    while random_six_digit in order_id_pt_3_l:
        print('repeat')
        random_six_digit = '{:06}'.format(random.randrange(1, 10**6))
    new_pt3 = random_six_digit

    new_order_id = new_pt1 + "-" + new_pt2 + "-" + new_pt3
    return new_order_id

def get_customer_id():
    customer_id_ind = random.randrange(0, customer_t['customer_id'].shape[0])
    customer_id_selected = customer_t['customer_id'][customer_id_ind]
    return customer_id_selected

# main
def order_TX():
    order_date = make_order_date()
    ship_mode, ship_date = generate_ship_info(order_date)
    order_id = generate_order_id(date=order_date)
    customer_id = get_customer_id()
    return order_date, ship_date, ship_mode, order_id, customer_id
    

In [21]:
def get_related_ids():
    location_id_selected = location_t["location_id"][random.randrange(0, location_t.shape[0])]
    product_id_selected = product_t["product_id"][random.randrange(0, product_t.shape[0])]
    
    return location_id_selected, product_id_selected

def get_sales_data(product_id_selected):
    ## quantity
    n_items = 1000
    x = round(order_detail_t["quantity"].value_counts() / order_detail_t["quantity"].shape[0] * n_items)
    # make pool of choices based on probabilites
    col_pool = []
    for r, r_id  in list(zip(x, x.index)):
        for occurance in range(int(r)):
            col_pool.append(r_id)
    selected_index = random.randrange(0, len(col_pool))
    new_quantity = col_pool[selected_index]
    
    sales_change_l = [-1000, -500, -250, -100, -50, -25, -10, -5, 1, 2, 4, 5, 8, 
                      10, 15, 25, 37, 50, 75, 100, 150, 200, 300, 500, 1000]
    new_expected_sales = float((product_t[product_t["product_id"]==product_id_selected]["product_price"] * new_quantity))
    new_baseline_sales = new_expected_sales + random.choice(sales_change_l) * new_quantity
    new_discount = round(1 - min(new_baseline_sales/new_expected_sales, 1), 2)
    new_profit = new_baseline_sales - new_expected_sales 

    return new_quantity, new_baseline_sales, new_discount, new_profit

def generate_order_detail_id():
    return order_detail_t["order_detail_id"].max() + 1


def order_detail_TX(order_id):
    location_id, product_id = get_related_ids()
    quantity, sales, discount, profit = get_sales_data(product_id_selected=product_id)
    order_detail_id = generate_order_detail_id()
    
    return order_detail_id, order_id, product_id, location_id, sales, quantity, discount, profit

In [22]:
order_date, ship_date, ship_mode, order_id, customer_id = order_TX()
print("ORDER ID: {}".format(order_id))
print("-  ", end="")
print((order_date, ship_date, ship_mode, order_id, customer_id))
print()

order_detail_num = random.randrange(1, 5)
print('order detail --> {} entries'.format(order_detail_num))
for i in range(order_detail_num):
    print("-  ", end="")
    print(order_detail_TX(order_id))


ORDER ID: CA-2014-283659
-  (datetime.date(2014, 9, 10), datetime.date(2014, 9, 30), 'Standard Class', 'CA-2014-283659', 'SJ-20125')

order detail --> 4 entries
-  (9995, 'CA-2014-283659', 1857, 371, -1913.088, 8, 23.01, -2000.0)
-  (9995, 'CA-2014-283659', 603, 357, 1465.331, 7, 0, 1400.0)
-  (9995, 'CA-2014-283659', 1217, 120, 185.79166666666666, 5, 0, 125.0)
-  (9995, 'CA-2014-283659', 692, 180, 63.2148, 3, 0.32, -30.0)
