In [1]:
import pandas as pd
import math
import zipfile
%xmode minimal

Exception reporting mode: Minimal


# Settings

In [2]:
# file containing master data and respective settings
masterdata = 'masterdata.xlsx'

In [3]:
# cover the following years
start_date = '2020-01-01'
end_date = '2036-01-01'

start_date = '2020-12-11' # test
end_date = '2021-01-06'   # test

### Seasonality

In [4]:
weeks = pd.read_excel(masterdata, sheet_name="weeks")
weeks["Weight"] = weeks["Weight"].rolling(2, min_periods=1).mean()  # smoothing
weeks["Weight_normalized"] = weeks["Weight"] / weeks["Weight"].sum() * len(weeks)
weeks.set_index(keys="Week", inplace=True)
weeks.sample()

Unnamed: 0_level_0,Weight,Reason,Weight_normalized
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
31,1.0,,0.385714


In [5]:
years = pd.read_excel(masterdata, sheet_name="years")
years.sample()

Unnamed: 0,Year,Growth
9,2029,4


### Customers
To find out the available customers and master data in S4, use this (remove comments when executing in Transaction DB02)
``` sql
SELECT but000.client, but000.partner, but000.name_org1, adrc.country, adrc.city1
FROM but000
JOIN but020 ON but000.client = but020.client AND but000.partner = but020.partner
JOIN adrc ON but020.client = adrc.client AND but020.addrnumber = adrc.addrnumber
JOIN but100 ON but000.client = but100.mandt AND but000.partner = but100.partner
WHERE but000.client = 202    -- partner id depends on the client
AND but000.bu_sort1 = '334'  -- any search term except 000, which also contains dirty student BP
AND but100.rltyp = 'FLCU00'  -- a customer role
ORDER BY but000.name_org1
```

Outside S4, we can only identify a customer by its name (the business partner number in S4 depends on the system and client).

In [6]:
customers = pd.read_excel(masterdata, sheet_name="customers")
customers.sample()

Unnamed: 0,CNAME,COUNTRY,CITY,CWeight,VKORG,VTWEG,SPART
8,Drahtesel,DE,Leipzig,8,DN00,WH,BI


Discounts (C03)

In [7]:
bins = [0, 10, 20, 100]  # np.inf would be the correct upper bound - 100 will do
labels = ['0', '10', '20']
discounts = {'0': 0, '10':0.03, '20':0.05}
customers["Discount%"] = pd.cut(customers["CWeight"], bins=bins, labels=labels, right=False).map(discounts).astype(float)
customers.sample()

Unnamed: 0,CNAME,COUNTRY,CITY,CWeight,VKORG,VTWEG,SPART,Discount%
5,Capital Bikes,DE,Berlin,15,DN00,WH,BI,0.03


### Materials

Find out available material master in S4
``` sql
SELECT mara.mandt, mara.matnr, mara.matkl, mara.mtart, makt.maktx
FROM mara
JOIN makt ON mara.mandt = makt.mandt AND mara.matnr = makt.matnr
WHERE mara.mandt = 202              -- client
AND mara.mtart IN ('FERT', 'HAWA')  -- finished and trading goods
AND mara.matnr LIKE '%888%'         -- any group
AND makt.spras = 'E'                -- English
ORDER BY mara.matnr
```

In [8]:
materials = pd.read_excel(masterdata, sheet_name="materials")
materials.sample()

Unnamed: 0,MATNR,MATKL,MTART,MAKTX,MWeight
0,BOTL1000,UTIL,HAWA,Water Bottle,1.0


# Generator

### Working days

Produces subsequent dates
- only business days (no weekends)
- range is defined above

In [9]:
def gen_date():
    for d in pd.bdate_range(start=start_date, end=end_date, inclusive='left'):
        yield d

### Customers

Produce subsequent customers for a day
- number of customers = number of orders for that day is 50 at average
- number of orders varies by seasonality (C01); since seasonality also affects quantities, we only use sqrt() of the effect here
- The S4 system will create its own internal document numbers, but we also need an additional private (external to S4) number.
  This allows us to recognize our orders when implementing the daily delta logic later.
  The order number contains the date of the order plus a subsequential number to make it unique.
- Customers are selected according to their CWeight (C02); this also affects the number of order positions, hence, we only apply sqrt() of the weights.

In [10]:
def season_factor(day: pd.Timestamp):
    return math.sqrt(weeks.loc[day.week]["Weight_normalized"])

def gen_ordernum_with_customer(day: pd.Timestamp):
    # number of orders
    nof_orders = math.ceil(50 * season_factor(day))
    # customers
    customers_sample = customers.sample(n=nof_orders, replace=True, weights=customers["CWeight"].apply(math.sqrt))
    gen_customer = (c for (i, c) in customers_sample.iterrows())
    # produce result
    for x in range(nof_orders):
        order_number = f"{day.strftime('%Y-%m-%d')}#{x:03d}"
        yield(order_number, next(gen_customer))

### Materials

- The average nunber of positions is approx. 10
- The number of positions varies by sqrt() of customer weights (C02) - see above
- Materials are selected according to their MWeight (C04); again this also affects the quantity, why we apply sqrt()

In [11]:
def gen_material(day, customer):
    # positions
    factor = math.sqrt(customer["CWeight"] / customers["CWeight"].sum() * len(customers))
    nof_positions = math.ceil(10 * factor)
    # materials
    materials_sample = materials.sample(n=nof_positions, replace=True, weights=materials["MWeight"].apply(math.sqrt))
    gen_material = (m for (i, m) in materials_sample.iterrows())
    # produce result
    for pos in range(10, nof_positions * 10, 10):  # position numbers 10, 20, 30, ...
        yield(pos, next(gen_material))

### Quantity

- The average quantity is approx. 10
- The quantity depends on the sqrt() of seasonality (C01) - see above
- The quantity depends on the sqrt() material weights (C04) - see above
- The quantity tends to grow over time (C07)

In [12]:
def ret_quantity(day, material):
    factor = math.sqrt(material["MWeight"] / materials["MWeight"].sum() * len(materials))
    growth = years["Growth"].cumsum().loc[years["Year"] == day.year]
    quantity = math.ceil(10 * season_factor(day) * factor * growth)
    return (quantity, 'ST')

### Generate
Create list of sales orders by asking the respective oracles for customers, materials...  
(one year with 100_000+ orders takes 4 minutes on my laptop)

In [13]:
order_list = []
for day in gen_date():
    for (order_number, customer) in gen_ordernum_with_customer(day):
        for (position, material) in gen_material(day, customer):
            (quantity, uom) = ret_quantity(day, material)
            o = pd.concat([
                pd.Series(order_number, index=["Ext order id"]),
                pd.Series(position, index=["Position"]),
                pd.Series(day, index=["Day"]),
                customer.drop(labels=["CWeight"]),
                material.drop(labels=["MWeight"]),
                pd.Series([quantity, uom], index=["Quantity", "UoM"])
                ])
            order_list.append(o)

In [14]:
# convert to DataFrame
orders = pd.concat(order_list, axis='columns').T

In [15]:
orders.sample(3)

Unnamed: 0,Ext order id,Position,Day,CNAME,COUNTRY,CITY,VKORG,VTWEG,SPART,Discount%,MATNR,MATKL,MTART,MAKTX,Quantity,UoM
658,2020-12-14#026,10,2020-12-14,Alster Cycling,DE,Hamburg,DN00,WH,BI,0.03,PRTR3000,BIKES,FERT,Professional Touring Bike (red),7,ST
4032,2020-12-28#018,10,2020-12-28,FahrPott,DE,Bochum,DN00,WH,BI,0.0,RHMT1000,SFTY,HAWA,Road Helmet,4,ST
5675,2021-01-05#032,90,2021-01-05,Philly Bikes,US,Philadelphia,UE00,WH,BI,0.03,GRBL2000,UTIL,FERT,GPS-Bike Computer Road 64GB Royal Blue,8,ST


In [16]:
orders.shape

(5793, 16)

### Save

In [17]:
orders.columns

Index(['Ext order id', 'Position', 'Day', 'CNAME', 'COUNTRY', 'CITY', 'VKORG',
       'VTWEG', 'SPART', 'Discount%', 'MATNR', 'MATKL', 'MTART', 'MAKTX',
       'Quantity', 'UoM'],
      dtype='object')

The data file is made for import with ABAP into S/4:
- no header line
- tabulator as separator
- ABAP date format
- one file per year

In [18]:
columns = ['Ext order id', 'Position', 'Day', 'VKORG', 'VTWEG', 'SPART', 'CNAME', 'Discount%', 'MATNR', 'Quantity', 'UoM']
with zipfile.ZipFile("../data/GlobalBikeABAP.zip", "w") as zf:
    for year in orders["Day"].dt.year.unique():
        file = f'{year:04d}GlobalBikeABAP.tsv'
        with zf.open(file, "w") as buffer:
            (
                orders.loc[orders["Day"].dt.year == year]
                .to_csv(buffer, columns=columns, index=False, header=False, sep='\t', date_format='%Y%m%d')
            )