In [129]:
import pandas as pd
import math
import zipfile

# Settings

In [130]:
# file containing master data and respective settings
masterdata = 'masterdata.xlsx'

In [131]:
# cover the following years
start_date = '2020-01-01'
end_date = '2036-01-01'

start_date = '2010-01-01' # test
end_date = '2011-02-01'   # test

### Seasonality

In [132]:
weeks = pd.read_excel(masterdata, sheet_name="weeks")
weeks["Weight"] = weeks["Weight"].rolling(2, min_periods=1).mean()  # smoothing
weeks["Weight"] = weeks["Weight"] / weeks["Weight"].sum() * len(weeks) # normalize
weeks.set_index(keys="Week", inplace=True)
weeks.sample()

Unnamed: 0_level_0,Weight,Reason
Week,Unnamed: 1_level_1,Unnamed: 2_level_1
6,1.735714,


In [133]:
years = pd.read_excel(masterdata, sheet_name="years")
years.set_index(keys="Year", inplace=True)
years["Growth_pct"] = ((years["Growth"] + 100) / 100).cumprod()
years.sample()

Unnamed: 0_level_0,Growth,Growth_pct
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018,0,1.0


### Customers
To find out the available customers and master data in S4, use this (remove comments when executing in Transaction DB02)
``` sql
SELECT but000.client, but000.partner, but000.name_org1, adrc.country, adrc.city1
FROM but000
JOIN but020 ON but000.client = but020.client AND but000.partner = but020.partner
JOIN adrc ON but020.client = adrc.client AND but020.addrnumber = adrc.addrnumber
JOIN but100 ON but000.client = but100.mandt AND but000.partner = but100.partner
WHERE but000.client = 202    -- partner id depends on the client
AND but000.bu_sort1 = '334'  -- any search term except 000, which also contains dirty student BP
AND but100.rltyp = 'FLCU00'  -- a customer role
ORDER BY but000.name_org1
```

Outside S4, we can only identify a customer by its name (the business partner number in S4 depends on the system and client).

In [134]:
customers = pd.read_excel(masterdata, sheet_name="customers")
customers.set_index("CNAME", inplace=True)
customers["DiscountWeight"] = customers["CWeight"]
customers["CWeight"] = customers["CWeight"] / customers["CWeight"].sum() * len(customers) # normalize
customers.sample()

Unnamed: 0_level_0,COUNTRY,CITY,CWeight,VKORG,VTWEG,SPART,DiscountWeight
CNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ostseerad,DE,Anklam,0.48583,DN00,WH,BI,5


Discounts (C03)

In [135]:
bins = [0, 10, 20, 100]  # np.inf would be the correct upper bound - 100 will do
labels = ['0', '10', '20']
discounts = {'0': 0, '10':3, '20':5}
customers["Discount%"] = pd.cut(customers["DiscountWeight"], bins=bins, labels=labels, right=False).map(discounts).astype(float)
customers.sample()

Unnamed: 0_level_0,COUNTRY,CITY,CWeight,VKORG,VTWEG,SPART,DiscountWeight,Discount%
CNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alster Cycling,DE,Hamburg,0.97166,DN00,WH,BI,10,3.0


### Materials

Find out available material master in S4
``` sql
SELECT mara.mandt, mara.matnr, mara.matkl, mara.mtart, makt.maktx
FROM mara
JOIN makt ON mara.mandt = makt.mandt AND mara.matnr = makt.matnr
WHERE mara.mandt = 202              -- client
AND mara.mtart IN ('FERT', 'HAWA')  -- finished and trading goods
AND mara.matnr LIKE '%888%'         -- any group
AND makt.spras = 'E'                -- English
ORDER BY mara.matnr
```

In [136]:
materials = pd.read_excel(masterdata, sheet_name="materials")
materials.set_index("MATNR", inplace=True)
materials.sample()

Unnamed: 0_level_0,MATKL,MTART,MAKTX,MWeight
MATNR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DXTR2000,BIKES,FERT,Deluxe Touring Bike (silver),2.0


Additional weights depending on materials x customers

In [137]:
materials_customers_weights = pd.read_excel(masterdata, sheet_name="customers x materials")
# combine both weight definitions
mc = materials_customers_weights.set_index("CNAME").T
m = materials["MWeight"]
materials_customers_weights = mc.mul(m, axis='index')
# normalize per customer
materials_customers_weights = (materials_customers_weights / materials_customers_weights.sum() * len(materials_customers_weights)).T
materials_customers_weights.sample()

Unnamed: 0_level_0,BOTL1000,CAGE1000,DGRB2000,DGRR2000,DGRW2000,DXTR1000,DXTR2000,DXTR3000,EPAD1000,FAID1000,...,ORBC1000,ORMN1000,ORWN1000,PRTR1000,PRTR2000,PRTR3000,PUMP1000,RHMT1000,RKIT1000,SHRT1000
CNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SoCal Bikes,0.788644,2.365931,0.394322,0.157729,0.394322,0.788644,1.577287,0.788644,0.394322,1.577287,...,0.788644,1.577287,0.788644,0.788644,1.577287,0.788644,3.943218,0.788644,0.788644,0.788644


# Generator

### Working days

Produces subsequent dates
- only business days (no weekends)
- range is defined above

In [138]:
def gen_date():
    for d in pd.bdate_range(start=start_date, end=end_date, inclusive='left'):
        yield d

### Customers

Produce subsequent customers for a day
- number of customers drawn = number of orders for that day is 10 at average
- number of orders  tends to grow over time (C11); since this also affects quantities, we only use sqrt() of the effect here
- number of orders varies by seasonality (C01); since seasonality also affects quantities, we only use sqrt() of the effect here
- The S4 system will create its own internal document numbers, but we also need an additional private (external to S4) number.
  This allows us to recognize our orders when implementing the daily delta logic later.
  The order number contains the date of the order plus a subsequential number to make it unique.
- Customers are selected according to their CWeight (C02); this also affects the number of order positions, hence, we only apply sqrt() of the weights.

In [139]:
def year_factor(day: pd.Timestamp):
    f = years.loc[day.year]["Growth_pct"]
    return math.sqrt(f)

def season_factor(day: pd.Timestamp):
    return math.sqrt(weeks.loc[day.week]["Weight"])

def gen_ordernum_with_customer(day: pd.Timestamp):
    # number of orders
    nof_orders = math.ceil(10 * year_factor(day) * season_factor(day))
    # customers
    customers_sample = customers.sample(n=nof_orders, replace=True, weights=customers["CWeight"].apply(math.sqrt))
    gen_customer = (c for (i, c) in customers_sample.iterrows())
    # produce result
    for x in range(nof_orders):
        order_number = f"{day.strftime('%Y-%m-%d')}#{x:03d}"
        yield(order_number, next(gen_customer))

### Materials

- The average nunber of positions is approx. 10
- The number of positions varies by sqrt() of customer weights (C02) - see above
- Materials are selected according to their MWeight (C04, C06); again this also affects the quantity, why we apply sqrt()

In [140]:
def material_factor(customer: str):
    return materials_customers_weights.loc[customer].apply(math.sqrt)

In [141]:
def gen_material(day: pd.Timestamp, customer: str):
    nof_positions = math.ceil(10 * math.sqrt(customers.loc[customer]["CWeight"]))
    materials_sample = materials.sample(n=nof_positions, replace=True, weights=material_factor(customer))
    gen_material = (m for (i, m) in materials_sample.iterrows())
    # return result
    for pos in range(10, nof_positions * 10, 10):  # position numbers 10, 20, 30, ...
        yield(pos, next(gen_material))

### Quantity

- The average quantity is approx. 10
- The quantity depends on the sqrt() of seasonality (C01) - see above
- The quantity depends on the sqrt() material weights (C04, C06) - see above
- The quantity tends to grow over time (C11) - see above

In [142]:
def ret_quantity(day: pd.Timestamp, customer: str, material: str):
    mat_factor = math.sqrt(material_factor(customer).loc[material])
    quantity = math.ceil(10 * mat_factor * year_factor(day) * season_factor(day))
    return (quantity, 'EA')

### Generate
Create list of sales orders by asking the respective oracles for customers, materials...  
(one year with 100_000+ orders takes 4 minutes on my laptop)

In [143]:
order_list = []
for day in gen_date():
    for (order_number, customer) in gen_ordernum_with_customer(day):
        for (position, material) in gen_material(day, str(customer.name)):  
            (quantity, uom) = ret_quantity(day, str(customer.name), str(material.name))
            customer.loc["CNAME"] = customer.name # was the index; restore value
            material.loc["MATNR"] = material.name # was the index; restore value
            o = pd.concat([
                pd.Series(order_number, index=["Ext order id"]),
                pd.Series(position, index=["Position"]),
                pd.Series(day, index=["Day"]),
                customer.drop(labels=["CWeight"]),
                material.drop(labels=["MWeight"]),
                pd.Series([quantity, uom], index=["Quantity", "UoM"])
                ])
            order_list.append(o)

In [144]:
# convert to DataFrame
orders = pd.concat(order_list, axis='columns').T

In [145]:
orders.sample(3)

Unnamed: 0,Ext order id,Position,Day,COUNTRY,CITY,VKORG,VTWEG,SPART,DiscountWeight,Discount%,CNAME,MATKL,MTART,MAKTX,MATNR,Quantity,UoM
9946,2010-05-20#003,30,2010-05-20,DE,Bochum,DN00,WH,BI,8,0.0,FahrPott,UTIL,HAWA,Water Bottle Cage,CAGE1000,8,EA
17182,2010-09-30#013,80,2010-09-30,US,Chicago,UE00,WH,BI,10,3.0,Windy City Bikes,UTIL,HAWA,Water Bottle Cage,CAGE1000,22,EA
26219,2011-01-18#009,60,2011-01-18,US,Irvine,UW00,WH,BI,10,3.0,SoCal Bikes,BIKES,FERT,Professional Touring Bike (black),PRTR1000,12,EA


In [146]:
orders.shape

(27501, 17)

### Save

In [147]:
orders.columns

Index(['Ext order id', 'Position', 'Day', 'COUNTRY', 'CITY', 'VKORG', 'VTWEG',
       'SPART', 'DiscountWeight', 'Discount%', 'CNAME', 'MATKL', 'MTART',
       'MAKTX', 'MATNR', 'Quantity', 'UoM'],
      dtype='object')

The data file is made for import with ABAP into S/4:
- no header line
- tabulator as separator
- ABAP date format
- one file per year

In [153]:
columns = ['Ext order id', 'Position', 'Day', 'VKORG', 'VTWEG', 'SPART', 'CNAME', 'Discount%', 'MATNR', 'Quantity', 'UoM']
#with zipfile.ZipFile("../data/GlobalBikeOrders.zip", "w", compression=zipfile.ZIP_STORED) as zf:
for year in orders["Day"].dt.year.unique():
        tsv_file = f'{year:04d}GlobalBikeOrders.tsv'
        zip_file = f'../data/{year:04d}GlobalBikeOrders.zip'
        compression_opts = dict(method='zip', archive_name=tsv_file) 
        (
            orders.loc[orders["Day"].dt.year == year]
            .to_csv(zip_file, columns=columns, index=False, header=False, sep='\t', date_format='%Y%m%d', compression=compression_opts)
        )
