In [1]:
import pandas as pd
import math
import random
import zipfile

# Settings

In [2]:
# file containing master data and respective settings
masterdata = 'masterdata.xlsx'

In [3]:
# cover the following years
start_date = '2020-01-01'
end_date = '2036-01-01'   # exclusive

### Seasonality

In [4]:
weeks = pd.read_excel(masterdata, sheet_name="weeks")
weeks["Weight"] = weeks["Weight"].rolling(2, min_periods=1).mean()  # smoothing
weeks["Weight"] = weeks["Weight"] / weeks["Weight"].sum() * len(weeks) # normalize
weeks.set_index(keys="Week", inplace=True)
weeks.sample()

Unnamed: 0_level_0,Weight,Reason,max shipping offset weeks
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
37,0.771429,,30


In [5]:
years = pd.read_excel(masterdata, sheet_name="years")
years.set_index(keys="Year", inplace=True)
years["Growth_pct"] = ((years["Growth"] + 100) / 100).cumprod()
years.sample()

Unnamed: 0_level_0,Growth,Growth_pct
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2027,5,1.080842


### Customers
To find out the available customer master data in S4, use this (remove comments when executing in Transaction DB02).
The related VKORG kan be found in table `KNVV`.
``` sql
SELECT but000.client, but000.partner, but000.name_org1, adrc.country, adrc.city1
FROM but000
JOIN but020 ON but000.client = but020.client AND but000.partner = but020.partner
JOIN adrc ON but020.client = adrc.client AND but020.addrnumber = adrc.addrnumber
JOIN but100 ON but000.client = but100.mandt AND but000.partner = but100.partner
WHERE but000.client = 220    -- partner id depends on the client
AND but000.bu_sort1 = '334'  -- any search term except 000, which also contains dirty student BP
AND but100.rltyp = 'FLCU00'  -- a customer role
ORDER BY but000.name_org1
```

Outside S4, we can only identify a customer by its name (the business partner number in S4 depends on the system and client).

In [6]:
customers = pd.read_excel(masterdata, sheet_name="customers")
customers.set_index("CNAME", inplace=True)
customers["DiscountWeight"] = customers["CWeight"]
customers["CWeight"] = customers["CWeight"] / customers["CWeight"].sum() * len(customers) # normalize
customers.sample()

Unnamed: 0_level_0,COUNTRY,CITY,CWeight,VKORG,VTWEG,SPART,DiscountWeight
CNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ostseerad,DE,Anklam,0.449438,DN00,WH,BI,5


Discounts (C03)

In [7]:
bins = [0, 10, 20, 100]  # np.inf would be the correct upper bound - 100 will do
labels = ['0', '10', '20']
discounts = {'0': 0, '10':3, '20':5}
customers["Discount%"] = pd.cut(customers["DiscountWeight"], bins=bins, labels=labels, right=False).map(discounts).astype(float)
customers.sample()

Unnamed: 0_level_0,COUNTRY,CITY,CWeight,VKORG,VTWEG,SPART,DiscountWeight,Discount%
CNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Peachtree Bikes,US,Atlanta,0.898876,UE00,WH,BI,10,3.0


### Materials

Find out available material master and related org units in S4
``` sql
SELECT mara.mandt, mara.matnr, mara.matkl, mara.mtart, mara.tragr AS "Transportgrp."
    , makt.maktx, mvke.vkorg, mvke.vtweg
    , marc.werks, marc.ladgr AS "Loadinggrp."
    , mbew.vprsv AS "std/avg", mbew.stprs AS "std price", mbew.verpr AS "avg price"  -- material cost
FROM mara
JOIN makt ON mara.mandt = makt.mandt AND mara.matnr = makt.matnr
JOIN mvke ON mara.mandt = mvke.mandt AND mara.matnr = mvke.matnr
JOIN marc ON mvke.mandt = marc.mandt AND mvke.matnr = marc.matnr AND mvke.dwerk = marc.werks
LEFT OUTER JOIN mbew ON marc.mandt = mbew.mandt AND marc.matnr = mbew.matnr AND marc.werks = mbew.bwkey
WHERE mara.mandt = 220              -- client
AND mara.mtart IN ('FERT', 'HAWA')  -- finished and trading goods
AND mara.matnr LIKE '%000%'         -- Global Bike group number
AND makt.spras = 'E'                -- English
AND mvke.vtweg = 'WH'               -- we are only here
ORDER BY mara.matnr, mvke.vkorg, mvke.vtweg
```

In [8]:
materials = pd.read_excel(masterdata, sheet_name="materials")
materials.set_index("MATNR", inplace=True)
materials.sample()

Unnamed: 0_level_0,MATKL,MTART,MAKTX,MWeight
MATNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
OHMT1000,SFTY,HAWA,Off Road Helmet,1.0


Additional weights depending on materials x customers

In [9]:
materials_customers_weights = pd.read_excel(masterdata, sheet_name="customers x materials")
# combine both weight definitions
mc = materials_customers_weights.set_index("CNAME").T
m = materials["MWeight"]
materials_customers_weights = mc.mul(m, axis='index')
# normalize per customer
materials_customers_weights = (materials_customers_weights / materials_customers_weights.sum() * len(materials_customers_weights)).T
materials_customers_weights.sample()

Unnamed: 0_level_0,BOTL1000,CAGE1000,DGRB2000,DGRR2000,DGRW2000,DXTR1000,DXTR2000,DXTR3000,EPAD1000,FAID1000,...,OHMT1000,ORMN1000,ORWN1000,PRTR1000,PRTR2000,PRTR3000,PUMP1000,RHMT1000,RKIT1000,SHRT1000
CNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Airport Bikes,0.781759,2.345277,0.390879,0.156352,0.390879,0.781759,1.563518,0.781759,0.390879,1.563518,...,0.781759,1.563518,0.781759,0.781759,1.563518,0.781759,3.908795,0.781759,0.781759,0.781759


# Generator

### Working days

Produces subsequent order dates
- only business days (no weekends)
- range is defined above

In [10]:
def gen_date():
    for d in pd.bdate_range(start=start_date, end=end_date, inclusive='left'):
        yield d

The shipping date is the order date plus an offset depending on seasonality (C12).

In [11]:
def shipping_date(order_date: pd.Timestamp):
    return order_date + pd.DateOffset(weeks = random.randint(1, weeks.loc[order_date.week, "max shipping offset weeks"]))

### Customers

Produce subsequent customers for a day
- number of customers drawn = number of orders for that day is 10 at average
- number of orders  tends to grow over time (C11); since this also affects quantities, we only use sqrt() of the effect here
- number of orders varies by seasonality (C01); since seasonality also affects quantities, we only use sqrt() of the effect here
- The S4 system will create its own internal document numbers, but we also need an additional private (external to S4) number.
  This allows us to recognize our orders when implementing the daily delta logic later.
  The order number contains the date of the order plus a subsequential number to make it unique.
- Customers are selected according to their CWeight (C02); this also affects the number of order positions, hence, we only apply sqrt() of the weights.

In [12]:
def year_factor(day: pd.Timestamp):
    f = years.loc[day.year]["Growth_pct"]
    return math.sqrt(f)

def season_factor(day: pd.Timestamp):
    return math.sqrt(weeks.loc[day.week]["Weight"])

def gen_ordernum_with_customer(day: pd.Timestamp):
    # number of orders
    nof_orders = math.ceil(10 * year_factor(day) * season_factor(day))
    # customers
    customers_sample = customers.sample(n=nof_orders, replace=True, weights=customers["CWeight"].apply(math.sqrt))
    gen_customer = (c for (i, c) in customers_sample.iterrows())
    # produce result
    for x in range(nof_orders):
        order_number = f"{day.strftime('%Y-%m-%d')}#{x:03d}"
        yield(order_number, next(gen_customer))

### Materials

- The average nunber of positions is approx. 10
- The number of positions varies by sqrt() of customer weights (C02) - see above
- Materials are selected according to their MWeight (C04, C06); again this also affects the quantity, why we apply sqrt()

In [13]:
def material_factor(customer: str):
    return materials_customers_weights.loc[customer].apply(math.sqrt)

In [14]:
def gen_material(day: pd.Timestamp, customer: str):
    nof_positions = math.ceil(10 * math.sqrt(customers.loc[customer]["CWeight"]))
    materials_sample = materials.sample(n=nof_positions, replace=True, weights=material_factor(customer))
    gen_material = (m for (i, m) in materials_sample.iterrows())
    # return result
    for pos in range(10, nof_positions * 10, 10):  # position numbers 10, 20, 30, ...
        yield(pos, next(gen_material))

### Quantity

- The average quantity is approx. 10
- The quantity depends on the sqrt() of seasonality (C01) - see above
- The quantity depends on the sqrt() material weights (C04, C06) - see above
- The quantity tends to grow over time (C11) - see above

In [15]:
def ret_quantity(day: pd.Timestamp, customer: str, material: str):
    mat_factor = math.sqrt(material_factor(customer).loc[material])
    quantity = math.ceil(10 * mat_factor * year_factor(day) * season_factor(day))
    return (quantity, 'EA')

### Generate
Create list of sales orders by asking the respective oracles for customers, materials...  
(one year with 100_000+ orders takes 4 minutes on my laptop)

In [16]:
order_list = []
for day in gen_date():
    for (order_number, customer) in gen_ordernum_with_customer(day):
        # shipping date is defined per order 
        shipping = shipping_date(day)
        for (position, material) in gen_material(day, str(customer.name)):  
            (quantity, uom) = ret_quantity(day, str(customer.name), str(material.name))
            customer.loc["CNAME"] = customer.name # was the index; restore value
            material.loc["MATNR"] = material.name # was the index; restore value
            o = pd.concat([
                pd.Series(order_number, index=["Ext order id"]),
                pd.Series(position, index=["Position"]),
                pd.Series(day, index=["Order Date"]),
                pd.Series(shipping, index=["Ship Date"]),
                customer.drop(labels=["CWeight"]),
                material.drop(labels=["MWeight"]),
                pd.Series([quantity, uom], index=["Quantity", "UoM"])
                ])
            order_list.append(o)

In [17]:
# convert to DataFrame
orders = pd.concat(order_list, axis='columns').T

In [18]:
orders.sample(3)

Unnamed: 0,Ext order id,Position,Order Date,Ship Date,COUNTRY,CITY,VKORG,VTWEG,SPART,DiscountWeight,Discount%,CNAME,MATKL,MTART,MAKTX,MATNR,Quantity,UoM
197807,2027-11-02#003,10,2027-11-02,2028-02-01,DE,Hamburg,DN00,WH,BI,10,3.0,Alster Cycling,UTIL,HAWA,Air Pump,PUMP1000,21,EA
224167,2028-10-31#000,70,2028-10-31,2029-02-20,US,Palo Alto,UW00,WH,BI,20,5.0,Silicon Valley Bikes,BIKES,FERT,Professional Touring Bike (red),PRTR3000,14,EA
307830,2031-11-11#012,90,2031-11-11,2031-11-18,DE,München,DS00,WH,BI,20,5.0,Bavaria Bikes,UTIL,FERT,GPS-Bike Computer Road 64GB Silver White,GRWL2000,14,EA


In [19]:
orders.shape

(428827, 18)

### Save

In [20]:
orders.columns

Index(['Ext order id', 'Position', 'Order Date', 'Ship Date', 'COUNTRY',
       'CITY', 'VKORG', 'VTWEG', 'SPART', 'DiscountWeight', 'Discount%',
       'CNAME', 'MATKL', 'MTART', 'MAKTX', 'MATNR', 'Quantity', 'UoM'],
      dtype='object')

The data file is made for import with ABAP into S/4:
- no header line
- tabulator as separator
- ABAP date format
- one file per year

In [21]:
columns = ['Ext order id', 'Position', 'Order Date', 'Ship Date', 'VKORG', 'VTWEG', 'SPART', 'CNAME', 'Discount%', 'MATNR', 'Quantity', 'UoM']
for year in orders["Order Date"].dt.year.unique():
    tsv_file = f'{year:04d}GlobalBikeOrders.tsv'
    zip_file = f'../data/{year:04d}GlobalBikeOrders.zip'
    compression_opts = dict(method='zip', archive_name=tsv_file) 
    (
        orders.loc[orders["Order Date"].dt.year == year]
        .to_csv(zip_file, columns=columns, index=False, header=False, sep='\t', date_format='%Y%m%d', compression=compression_opts)
    )
