In [1]:
from faker import Faker

In [2]:
faker = Faker()
for _ in range(10):
    print(f"Datetime between: {faker.date_between(start_date='-18y', end_date='now')}")

Datetime between: 2010-11-16
Datetime between: 2010-08-01
Datetime between: 2006-11-05
Datetime between: 2009-08-19
Datetime between: 2014-12-12
Datetime between: 2021-08-21
Datetime between: 2010-01-21
Datetime between: 2020-07-02
Datetime between: 2021-01-03
Datetime between: 2006-05-24


## Payment Table

In [3]:
import random
import numpy as np

paymentmethods = ["Mastercard", "iDeal", "Visa", "PayPal", "Klarna"]

#Generate random payment method data, k = the number of orders
paymentdata = random.choices(paymentmethods, weights=(40, 10, 8, 24, 18), k=10)

print(paymentdata)

['iDeal', 'Mastercard', 'iDeal', 'Mastercard', 'iDeal', 'Mastercard', 'Mastercard', 'Klarna', 'PayPal', 'Mastercard']


### Transforming to dataframe in Pandas

In [4]:
import pandas as pd
paymentdf = pd.DataFrame (paymentmethods, columns = ['Payment_Method'])
paymentdf.head()

Unnamed: 0,Payment_Method
0,Mastercard
1,iDeal
2,Visa
3,PayPal
4,Klarna


In [5]:
#paymentdf.index += 1

### Add new column with payment terms for every payment method

In [6]:
#Create list of conditions
conditions = [
    (paymentdf['Payment_Method'] == 'Mastercard'),
    (paymentdf['Payment_Method'] == 'iDeal'),
    (paymentdf['Payment_Method'] == 'Visa'),
    (paymentdf['Payment_Method'] == 'PayPal'),
    (paymentdf['Payment_Method'] == 'Klarna')
    ]

#Create list of values that will be assigned to the conditions
values = ['End of month', 'Debit', 'End of month', 'Debit', 'Net 14']

#Create new column with values
paymentdf['Payment_Terms'] = np.select(conditions, values)

paymentdf.head(10)

Unnamed: 0,Payment_Method,Payment_Terms
0,Mastercard,End of month
1,iDeal,Debit
2,Visa,End of month
3,PayPal,Debit
4,Klarna,Net 14


In [7]:
paymentdf.to_csv('Payments.csv')

## Store Table

In [8]:
stores = [['Amsterdam','Oudezijds Voorburgwal', 'Netherlands'],
['Stockholm','Drottninggatan', 'Sweden'],
['London','Carnaby Street', 'United Kingdom'],
['Warsaw','Nowy Świat', 'Poland'],
['Rome','Via del Corso', 'Italy'],
['Madrid','Calle Preciados', 'Spain'],
['Webshop','http://www.sneaker-factory.com']
         ]


#Generate random store data, k = the number of orders
storedata = random.choices(stores, weights=(18, 8, 16, 5, 11, 7, 35), k=10)

print(storedata)

[['Webshop', 'http://www.sneaker-factory.com'], ['Amsterdam', 'Oudezijds Voorburgwal', 'Netherlands'], ['Amsterdam', 'Oudezijds Voorburgwal', 'Netherlands'], ['Rome', 'Via del Corso', 'Italy'], ['Webshop', 'http://www.sneaker-factory.com'], ['Webshop', 'http://www.sneaker-factory.com'], ['Amsterdam', 'Oudezijds Voorburgwal', 'Netherlands'], ['London', 'Carnaby Street', 'United Kingdom'], ['London', 'Carnaby Street', 'United Kingdom'], ['Madrid', 'Calle Preciados', 'Spain']]


In [9]:
storedf = pd.DataFrame (stores, columns = ['Store_Location', 'Address', 'Country'])

storedf.head(10)

Unnamed: 0,Store_Location,Address,Country
0,Amsterdam,Oudezijds Voorburgwal,Netherlands
1,Stockholm,Drottninggatan,Sweden
2,London,Carnaby Street,United Kingdom
3,Warsaw,Nowy Świat,Poland
4,Rome,Via del Corso,Italy
5,Madrid,Calle Preciados,Spain
6,Webshop,http://www.sneaker-factory.com,


In [10]:
#storedf.index += 1
storedf.index
storedf.head(10)

Unnamed: 0,Store_Location,Address,Country
0,Amsterdam,Oudezijds Voorburgwal,Netherlands
1,Stockholm,Drottninggatan,Sweden
2,London,Carnaby Street,United Kingdom
3,Warsaw,Nowy Świat,Poland
4,Rome,Via del Corso,Italy
5,Madrid,Calle Preciados,Spain
6,Webshop,http://www.sneaker-factory.com,


In [11]:
storedf.to_csv('Stores.csv')

## Discount Table

In [12]:
import datetime

faker = Faker()
discountlist = [] 
for _ in range(20):
    discountlist.append((faker.date_between(start_date='-18y', end_date='now')).strftime("%d %b"))
    
print(discountlist)

['17 Oct', '07 Nov', '16 Jul', '26 Jul', '01 Nov', '18 Feb', '14 May', '06 Aug', '21 Jul', '26 Mar', '08 Mar', '30 Aug', '16 Jan', '13 Jun', '19 Jan', '31 Mar', '14 Apr', '18 Aug', '29 May', '15 Mar']


In [13]:
for i in discountlist:
    if int(i[:2]) <= 7 and str(i[-3:]) == "Jan":
        print("New Year discount")
    elif i[-3:] == "Dec":
        print("Christmas discount")
    elif (int(i[:2]) >= 21 and str(i[-3:]) == "Mar") or (int(i[:2]) <= 3 and str(i[-3:]) == "Apr"):
        print("Spring deal")
    elif int(i[:2]) == 22 and str(i[-3:]) == "Apr":
        print("Earth day")
    else:
        print("No discount")

No discount
No discount
No discount
No discount
No discount
No discount
No discount
No discount
No discount
Spring deal
No discount
No discount
No discount
No discount
No discount
Spring deal
No discount
No discount
No discount
No discount


In [14]:
discount = [['Christmas', '20 Dec', '24 Dec', 15, 0],
            ['New Year', '02 Jan', '05 Jan', 0, 20],
            ['Earth Day', '22 Apr', '22 Apr', 10, 0],
            ['Spring deal', '18 Mar', '22 Mar', 10, 0]
           ]




## Product table

In [15]:
import itertools
import pandas as pd

size = []
for i in range(36,49):
    size.append(i)

color = ['Crimson', 'Navy', 'Olive', 'Baby Blue',
         'Cyan', 'Coral', 'Azure', 'Denim', 'Peach'
        ]

#A nested list with all product specifications is used to find all the possible permutations 
productcategories = [['SWEATZY_MEN','SWEATZY_WOMEN',
               'WOVE_MEN','WOVE_WOMEN',
               'BOOTWEAR_MEN','BOOTWEAR_WOMEN',
               'FROVE_MEN','FROVE_WOMEN',
               'SWOONIX_MEN','SWOONIX_WOMEN'
              ],
               ['Crimson', 'Navy', 'Olive', 'Baby Blue',
                'Cyan', 'Coral', 'Azure', 'Denim', 'Peach'],
               size
              ]

### Listing all permutations for products 

In [16]:
products = list(itertools.product(*productcategories))

productsdf = pd.DataFrame (products, columns = ['Product_Name', 'Product_Color', 'Product_Size'])
productsdf.head(5)


Unnamed: 0,Product_Name,Product_Color,Product_Size
0,SWEATZY_MEN,Crimson,36
1,SWEATZY_MEN,Crimson,37
2,SWEATZY_MEN,Crimson,38
3,SWEATZY_MEN,Crimson,39
4,SWEATZY_MEN,Crimson,40


In [17]:
productsdf.shape

(1170, 3)

In [18]:
kidssize = []
for i in range(26, 39):
    kidssize.append(i)

kidscategories = [['SWEATZY_KIDS', 'WOVE_KIDS', 'BOOTWEAR_KIDS', 'FROVE_KIDS', 'SWOONIX_KIDS'],
                ['Crimson', 'Navy', 'Olive', 'Baby Blue',
                 'Cyan', 'Coral', 'Azure', 'Denim', 'Peach'],
                kidssize
               ]
print(kidscategories)


[['SWEATZY_KIDS', 'WOVE_KIDS', 'BOOTWEAR_KIDS', 'FROVE_KIDS', 'SWOONIX_KIDS'], ['Crimson', 'Navy', 'Olive', 'Baby Blue', 'Cyan', 'Coral', 'Azure', 'Denim', 'Peach'], [26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]]


### Listing all permutations for the kids products

In [19]:
import itertools
kidsproducts = list(itertools.product(*kidscategories))


In [20]:

kidsproductsdf = pd.DataFrame (kidsproducts, columns = ['Product_Name', 'Product_Color', 'Product_Size'])
kidsproductsdf.head(5)
kidsproductsdf.shape

(585, 3)

In [21]:
kidsproductsdf.describe()

Unnamed: 0,Product_Size
count,585.0
mean,32.0
std,3.744859
min,26.0
25%,29.0
50%,32.0
75%,35.0
max,38.0


### Appending kids dataframe to other dataframe to create one products dataframe

In [22]:
productsdf = productsdf.append(kidsproductsdf)
productsdf.describe()


Unnamed: 0,Product_Size
count,1755.0
mean,38.666667
std,6.020205
min,26.0
25%,35.0
50%,39.0
75%,44.0
max,48.0


### Conditions for product price of different shoes

In [23]:
def productprice (row):
    if row['Product_Name'] == 'SWEATZY_KIDS' :
        return 39.95
    if row['Product_Name'] == 'SWEATZY_MEN' or row['Product_Name'] == 'SWEATZY_WOMEN':
        return 49.95
    if row['Product_Name'] == 'WOVE_KIDS' :
        return 49.95
    if row['Product_Name'] == 'WOVE_MEN' or row['Product_Name'] == 'WOVE_WOMEN' :
        return 59.95
    if row['Product_Name'] == 'BOOTWEAR_KIDS' :
        return 64.95
    if row['Product_Name'] == 'BOOTWEAR_MEN' or row['Product_Name'] == 'BOOTWEAR_WOMEN' :
        return 74.95
    if row['Product_Name'] == 'FROVE_KIDS' :
        return 49.95
    if row['Product_Name'] == 'FROVE_MEN' or row['Product_Name'] == 'FROVE_WOMEN' :
        return 59.95
    if row['Product_Name'] == 'SWOONIX_KIDS' :
        return 54.95
    if row['Product_Name'] == 'SWOONIX_MEN' or row['Product_Name'] == 'SWOONIX_WOMEN' :
        return 64.95


In [24]:
productsdf.apply (lambda row: productprice(row), axis=1)

0      49.95
1      49.95
2      49.95
3      49.95
4      49.95
       ...  
580    54.95
581    54.95
582    54.95
583    54.95
584    54.95
Length: 1755, dtype: float64

In [25]:
productsdf['Product_Price'] = productsdf.apply (lambda row: productprice(row), axis=1)
productsdf.head(1700)

Unnamed: 0,Product_Name,Product_Color,Product_Size,Product_Price
0,SWEATZY_MEN,Crimson,36,49.95
1,SWEATZY_MEN,Crimson,37,49.95
2,SWEATZY_MEN,Crimson,38,49.95
3,SWEATZY_MEN,Crimson,39,49.95
4,SWEATZY_MEN,Crimson,40,49.95
...,...,...,...,...
525,SWOONIX_KIDS,Cyan,31,54.95
526,SWOONIX_KIDS,Cyan,32,54.95
527,SWOONIX_KIDS,Cyan,33,54.95
528,SWOONIX_KIDS,Cyan,34,54.95


### Probabilities of shoe sizes

#### Men

In [28]:
sizes = []
for i in range(35,50):
    sizes.append(i)
print(sizes)

[35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [29]:
import scipy.stats as st

u = 43.00
o = 2.00

Zscores = []
for i in sizes:
    Z = ((i+0.5)-u)/o
    Zscores.append(Z)

print(Zscores)

[-3.75, -3.25, -2.75, -2.25, -1.75, -1.25, -0.75, -0.25, 0.25, 0.75, 1.25, 1.75, 2.25, 2.75, 3.25]


In [30]:
pvalues = []

for i in Zscores:
    p = st.norm.cdf(i)
    pvalues.append(p)
    
print(pvalues)

[8.841728520080377e-05, 0.0005770250423907659, 0.002979763235054555, 0.012224472655044696, 0.040059156863817086, 0.10564977366685535, 0.2266273523768682, 0.4012936743170763, 0.5987063256829237, 0.7733726476231317, 0.8943502263331446, 0.9599408431361829, 0.9877755273449553, 0.9970202367649454, 0.9994229749576092]


In [31]:
sizeprobability = []
for i in range(len(pvalues)):
    p_upper = pvalues[i]
    p_lower = pvalues[i-1]
    p = p_upper - p_lower
    sizeprobability.append(round(p, 4))

del sizeprobability[0]
print(sizeprobability)

[0.0005, 0.0024, 0.0092, 0.0278, 0.0656, 0.121, 0.1747, 0.1974, 0.1747, 0.121, 0.0656, 0.0278, 0.0092, 0.0024]


#### Men shoe size probabilties to dictionary

In [32]:
sizedict = {}
for key in size:
    for prob in sizeprobability:
        sizedict[key] = prob
        sizeprobability.remove(prob)
        break

print(sizedict)

{36: 0.0005, 37: 0.0024, 38: 0.0092, 39: 0.0278, 40: 0.0656, 41: 0.121, 42: 0.1747, 43: 0.1974, 44: 0.1747, 45: 0.121, 46: 0.0656, 47: 0.0278, 48: 0.0092}


#### Women

In [33]:
womensize = []
for i in range(35,46):
    womensize.append(i)
print(womensize)

wsize = []

for i in range(34, 47):
    wsize.append(i)
print(wsize)

[35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]
[34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]


In [34]:
u = 39.00
o = 2.00

Zscores = []
for i in wsize:
    Z = ((i+0.5)-u)/o
    Zscores.append(Z)

print(Zscores)

[-2.25, -1.75, -1.25, -0.75, -0.25, 0.25, 0.75, 1.25, 1.75, 2.25, 2.75, 3.25, 3.75]


In [35]:
pvalues = []

for i in Zscores:
    p = st.norm.cdf(i)
    pvalues.append(p)
    
print(pvalues)

[0.012224472655044696, 0.040059156863817086, 0.10564977366685535, 0.2266273523768682, 0.4012936743170763, 0.5987063256829237, 0.7733726476231317, 0.8943502263331446, 0.9599408431361829, 0.9877755273449553, 0.9970202367649454, 0.9994229749576092, 0.9999115827147992]


In [36]:
wsizeprobability = []
for i in range(len(pvalues)):
    p_upper = pvalues[i]
    p_lower = pvalues[i-1]
    p = p_upper - p_lower
    wsizeprobability.append(round(p, 4))

del wsizeprobability[0]
print(wsizeprobability)

[0.0278, 0.0656, 0.121, 0.1747, 0.1974, 0.1747, 0.121, 0.0656, 0.0278, 0.0092, 0.0024, 0.0005]


In [37]:
wsizedict = {}
for key in womensize:
    for prob in wsizeprobability:
        wsizedict[key] = prob
        wsizeprobability.remove(prob)
        break

print(wsizedict)

{35: 0.0278, 36: 0.0656, 37: 0.121, 38: 0.1747, 39: 0.1974, 40: 0.1747, 41: 0.121, 42: 0.0656, 43: 0.0278, 44: 0.0092, 45: 0.0024}


### Remove women shoes with size > 45

In [61]:
productsdf = productsdf.drop(productsdf[(productsdf.Product_Size > 45) & (productsdf.Product_Name.str.strip().str[-5:] == 'WOMEN')].index)
productsdf = productsdf.drop(productsdf[(productsdf.Product_Size < 39) & (productsdf.Product_Name.str.strip().str[-4:] == '_MEN')].index)


In [39]:
print(productsdf.iloc[:,0])#.strip()[-5:])

0       SWEATZY_MEN
1       SWEATZY_MEN
2       SWEATZY_MEN
3       SWEATZY_MEN
4       SWEATZY_MEN
           ...     
580    SWOONIX_KIDS
581    SWOONIX_KIDS
582    SWOONIX_KIDS
583    SWOONIX_KIDS
584    SWOONIX_KIDS
Name: Product_Name, Length: 1566, dtype: object


#### Kids

In [50]:
kidssizedict = {26:0.069,
               27:0.069,
               28:0.070,
               29:0.071,
               30:0.074,
               31:0.075,
               32:0.076,
               33:0.078,
               34:0.080,
               35:0.082,
               36:0.084,
               37:0.085,
               38:0.087}

### Function to apply probabilities to Women and Men shoes

In [62]:
def sizeprobability(sizep, productname):
    if productname.strip()[-4:] == '_MEN':
        return sizedict[sizep]
    if productname.strip()[-5:] == 'WOMEN':
        return wsizedict[sizep]
    if productname.strip()[-4:] == 'KIDS':
         return kidssizedict[sizep]

In [42]:
kidssizedict[28]


# print(sizeprobability(36, 'FROVE_KIDS'))

0.07

In [43]:
sizeprobability(26, 'FROVE_KIDS')

0.069

In [53]:
productsdf.head()

Unnamed: 0,Product_Name,Product_Color,Product_Size,Product_Price
0,SWEATZY_MEN,Crimson,36,49.95
1,SWEATZY_MEN,Crimson,37,49.95
2,SWEATZY_MEN,Crimson,38,49.95
3,SWEATZY_MEN,Crimson,39,49.95
4,SWEATZY_MEN,Crimson,40,49.95


In [63]:
productsdf['Probability'] = productsdf.apply(lambda x: sizeprobability(x['Product_Size'], x['Product_Name']), axis=1)


In [48]:
kidsproductsdf.describe()

Unnamed: 0,Product_Size
count,585.0
mean,32.0
std,3.744859
min,26.0
25%,29.0
50%,32.0
75%,35.0
max,38.0


In [64]:
productsdf.describe()

Unnamed: 0,Product_Size,Product_Price,Probability
count,1350.0,1350.0,1350.0
mean,38.766667,58.616667,0.090567
std,5.603762,9.396149,0.055464
min,26.0,39.95,0.0024
25%,35.0,49.95,0.0656
50%,39.5,59.95,0.076
75%,43.0,64.95,0.121
max,48.0,74.95,0.1974


In [66]:
display(productsdf)

Unnamed: 0,Product_Name,Product_Color,Product_Size,Product_Price,Probability
4,SWEATZY_MEN,Crimson,39,49.95,0.0278
5,SWEATZY_MEN,Crimson,40,49.95,0.0656
6,SWEATZY_MEN,Crimson,41,49.95,0.1210
7,SWEATZY_MEN,Crimson,42,49.95,0.1747
8,SWEATZY_MEN,Crimson,43,49.95,0.1974
...,...,...,...,...,...
581,SWOONIX_KIDS,Peach,34,54.95,0.0800
582,SWOONIX_KIDS,Peach,35,54.95,0.0820
583,SWOONIX_KIDS,Peach,36,54.95,0.0840
584,SWOONIX_KIDS,Peach,37,54.95,0.0850


### Changing index start from 0 to 1

In [None]:
# productsdf.index += 1

In [67]:
productsdf.to_csv('Products.csv')