In [532]:
import json
import numpy as np
import pandas as pd
import lxml

# Import SAP data previously mapped for ZA1

15Dec_D365_orders_ZA1.csv

In [533]:
df5 = pd.read_csv("./data/ZA/15Dec_D365_orders_ZA1.csv", low_memory=False)

In [534]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871832 entries, 0 to 871831
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ActivityType              871832 non-null  object
 1   Order Number              871832 non-null  int64 
 2   Material_Description      871832 non-null  object
 3   BaseDate                  871832 non-null  object
 4   BaseHour                  871832 non-null  int64 
 5   Cases                     871832 non-null  int64 
 6   Bill I                    867796 non-null  object
 7   Source Channel            871832 non-null  object
 8   Order Category            871832 non-null  object
 9   D365_ItemNo               871832 non-null  object
 10  Material No               871832 non-null  object
 11  MOD                       871832 non-null  int64 
 12  site_id                   871832 non-null  object
 13  WAREHOUSELOCATIONID       871832 non-null  object
 14  D365

# Import Products, Customers and Warehouses for the Legal Entity

In [535]:
LE = 'UG'

In [536]:
if LE == 'NA':
    LE_Products = pd.read_csv("./data/NA/Group1Products_NA1.csv", delimiter="\t")

if LE == 'MZ':
    LE_Products = pd.read_csv("./data/MZ/Group1Products_MZ1.csv", delimiter=",")

if LE == 'UG':
    LE_Products = pd.read_csv("./data/UG/Group1Products_UG1.csv", delimiter=",")
    

In [537]:
LE_Products

Unnamed: 0,D365_ItemNo_LE,Material_Description_LE
0,5141,SCH NOVIDA PINEAPPLE 500ML 12 S/W NP
1,10208,SCH NOVIDA P/APPLE ZERO 350ML 12 S/W NP
2,1271,COKE 500ML 12 S/W NP
3,2062,SPRITE 500ML 20 RB
4,5157,SCH NOVIDA PINEAPPLE BREEZ 300ML 24 RB
...,...,...
76,5465,SCH TONIC 300ML 24 RB
77,7477,RWENZORI WATER 500ML (24 BOX) NP
78,11165,SPRITE ZERO 330ML 12 S/W NP
79,10811,MM REFRESH MANGO 280ML 12 S/W


In [538]:
if LE == 'NA':
    LE_Customers = pd.read_csv("./data/NA/NA1_Customers.csv", delimiter="\t")
    LE_Customers.rename(columns={'Validated Customer Accounts': 'D365_Cust_No_LE'}, inplace=True)

if LE == 'MZ':
    LE_Customers = pd.read_csv("./data/MZ/PERF01_MZ1_Customers.csv", delimiter="\t")    
    LE_Customers.rename(columns={'CustomerAccount': 'D365_Cust_No_LE'}, inplace=True)
    
if LE == 'UG':
    LE_Customers = pd.read_csv("./data/UG/IPT3 UG1- Customers.csv", delimiter=",")    
    LE_Customers.rename(columns={'CustomerAccount': 'D365_Cust_No_LE'}, inplace=True)
    LE_Customers = LE_Customers[['D365_Cust_No_LE']].copy()
    # Name

In [539]:
LE_Customers

Unnamed: 0,D365_Cust_No_LE
0,12015776
1,12015870
2,12016063
3,12016215
4,12016243
...,...
93,12031221
94,12031313
95,12031464
96,12031500


In [540]:
if LE == 'NA':
    LE_Warehouses = pd.read_csv("./data/NA/NA1_Validated  IPT3 Warehouse Export  NA1 Warehouses.csv", delimiter=",")

if LE == 'MZ':
    LE_Warehouses = pd.read_csv("./data/MZ/IPT3 MZ1 PERF01 Warehouse.csv", delimiter=",")

## Get the top n warehouses and filter the dataframe for those

In [541]:
# Group by site_id and count unique Order Numbers
warehouse_order_counts = df5.groupby('site_id')['Order Number'].nunique().reset_index(name='OrderLinesCount')

# Sort by OrderLinesCount in descending order and get the top 9 warehouses
top_warehouses = warehouse_order_counts.sort_values(by='OrderLinesCount', ascending=False).head(len(LE_Warehouses))

# Display the result
print(top_warehouses)

   site_id  OrderLinesCount
6    ZA022             9899
13   ZA032             6010
0    ZA010             4586
2    ZA014             3869
5    ZA020             3099
9    ZA027             2950
1    ZA012             2664
21   ZA052             2536
3    ZA016              983


In [542]:
# Filter the original DataFrame to only include rows where site_id is in the list of top warehouses
top_warehouse_ids = top_warehouses['site_id'].tolist()
df5_W = df5[df5['site_id'].isin(top_warehouse_ids)]

In [543]:
top_warehouses.reset_index(inplace=True, drop=True)

## Get the top n customers and filter the dataframe for those

In [544]:
# Group by D365_Cust_No and count unique Order Numbers
customer_order_counts = df5_W.groupby('D365_Cust_No')['Order Number'].nunique().reset_index(name='OrderLinesCount')

# Sort by OrderLinesCount in descending order and get the top n customers
top_customers = customer_order_counts.sort_values(by='OrderLinesCount', ascending=False).head(len(LE_Customers))

# Display the result
print(top_customers)

      D365_Cust_No  OrderLinesCount
1530      10017885               67
126       10004521               57
135       10004547               56
1532      10017888               52
1528      10017883               38
...            ...              ...
1547      10018169               14
6108      10115883               14
2129      10024016               14
407       10005889               14
6123      10116036               14

[98 rows x 2 columns]


In [545]:
top_customer_ids = top_customers['D365_Cust_No'].tolist()
df5_W_C = df5_W[df5_W['D365_Cust_No'].isin(top_customer_ids)]

In [546]:
top_customers.reset_index(inplace=True, drop=True)

## Get the top n products and filter the dataframe for those

In [547]:
product_order_counts = df5_W_C.groupby('D365_ItemNo')['Order Number'].nunique().reset_index(name='OrderLinesCount')


top_products = product_order_counts.sort_values(by='OrderLinesCount', ascending=False).head(len(LE_Products))


print(top_products)

    D365_ItemNo  OrderLinesCount
49        10207              633
41        10194              447
229   CB0200017              435
56         1040              406
165        6540              373
..          ...              ...
83         1780               39
240   CN0200034               38
244   CV0200034               38
200        7534               38
225        7950               38

[81 rows x 2 columns]


In [548]:
top_product_ids = top_products['D365_ItemNo'].tolist()
df5_W_C_P = df5_W_C[df5_W_C['D365_ItemNo'].isin(top_product_ids)]

In [549]:
top_products.reset_index(inplace=True, drop=True)

## Now replace warehouses, customers and products for the data in the new legal entity
Extend the dataframes produced above, top_warehouses, top_customers and top_products by adding a column (for the same number of rows).  The new column contains the corresponding data for the new legal entity.

### Create Products lookup table

In [550]:
# Create a new column 'Index' in both dataframes with current index values
top_products['Index'] = top_products.index
LE_Products['Index'] = LE_Products.index

In [551]:
if len(LE_Products) != len(top_products):
    raise SystemExit("Mismatch in number of products")

In [552]:
LE_Products

Unnamed: 0,D365_ItemNo_LE,Material_Description_LE,Index
0,5141,SCH NOVIDA PINEAPPLE 500ML 12 S/W NP,0
1,10208,SCH NOVIDA P/APPLE ZERO 350ML 12 S/W NP,1
2,1271,COKE 500ML 12 S/W NP,2
3,2062,SPRITE 500ML 20 RB,3
4,5157,SCH NOVIDA PINEAPPLE BREEZ 300ML 24 RB,4
...,...,...,...
76,5465,SCH TONIC 300ML 24 RB,76
77,7477,RWENZORI WATER 500ML (24 BOX) NP,77
78,11165,SPRITE ZERO 330ML 12 S/W NP,78
79,10811,MM REFRESH MANGO 280ML 12 S/W,79


In [553]:
# Merge based on Index
top_products = pd.merge(
    top_products,
    LE_Products,
    how="left",
    on="Index",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
).copy()

In [554]:
top_products.drop(columns={'OrderLinesCount', 'Index'}, inplace=True, axis=1)

In [555]:
top_products

Unnamed: 0,D365_ItemNo,D365_ItemNo_LE,Material_Description_LE
0,10207,5141,SCH NOVIDA PINEAPPLE 500ML 12 S/W NP
1,10194,10208,SCH NOVIDA P/APPLE ZERO 350ML 12 S/W NP
2,CB0200017,1271,COKE 500ML 12 S/W NP
3,1040,2062,SPRITE 500ML 20 RB
4,6540,5157,SCH NOVIDA PINEAPPLE BREEZ 300ML 24 RB
...,...,...,...
76,1780,5465,SCH TONIC 300ML 24 RB
77,CN0200034,7477,RWENZORI WATER 500ML (24 BOX) NP
78,CV0200034,11165,SPRITE ZERO 330ML 12 S/W NP
79,7534,10811,MM REFRESH MANGO 280ML 12 S/W


### Create Customers lookup table

In [556]:
# Create a new column 'Index' in both dataframes with current index values
top_customers['Index'] = top_customers.index
LE_Customers['Index'] = LE_Customers.index

In [557]:
if len(LE_Customers) != len(top_customers):
    raise SystemExit("Mismatch in number of customers")

In [558]:
LE_Customers = LE_Customers.drop_duplicates().copy()

In [559]:
# Merge based on Index
top_customers = pd.merge(
    top_customers,
    LE_Customers,
    how="left",
    on="Index",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
).copy()

In [560]:
top_customers.drop(columns={'OrderLinesCount', 'Index'}, inplace=True, axis=1)

### Create Warehouses lookup table

In [561]:
LE_Warehouses = LE_Warehouses[['WAREHOUSEID']].copy()

KeyError: "None of [Index(['WAREHOUSEID'], dtype='object')] are in the [columns]"

In [None]:
LE_Warehouses.rename(columns={'WAREHOUSEID': 'site_id_LE'}, inplace=True)

In [None]:
# Create a new column 'Index' in both dataframes with current index values
top_warehouses['Index'] = top_warehouses.index
LE_Warehouses['Index'] = LE_Warehouses.index

In [None]:
if len(LE_Warehouses) != len(top_warehouses):
    raise SystemExit("Mismatch in number of warehouses")

In [None]:
# Merge based on Index
top_warehouses = pd.merge(
    top_warehouses,
    LE_Warehouses,
    how="left",
    on="Index",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
).copy()

In [None]:
top_warehouses.drop(columns={'OrderLinesCount', 'Index'}, inplace=True, axis=1)

In [None]:
top_warehouses

Unnamed: 0,site_id,site_id_LE
0,ZA022,NA010B
1,ZA032,NA011B
2,ZA010,NA011Q
3,ZA014,NA012B
4,ZA020,NA013B
5,ZA027,NA013Q
6,ZA012,NA014B
7,ZA052,NA015B
8,ZA016,NA016B


### Merge Warehouses lookup table with SAP data

In [None]:
df5_1 = pd.merge(
    df5_W_C_P,
    top_warehouses,
    how="left",
    on="site_id",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
).copy()

In [None]:
df5_W_C_P.info()

<class 'pandas.core.frame.DataFrame'>
Index: 117843 entries, 919 to 871548
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ActivityType              117843 non-null  object
 1   Order Number              117843 non-null  int64 
 2   Material_Description      117843 non-null  object
 3   BaseDate                  117843 non-null  object
 4   BaseHour                  117843 non-null  int64 
 5   Cases                     117843 non-null  int64 
 6   Bill I                    117603 non-null  object
 7   Source Channel            117843 non-null  object
 8   Order Category            117843 non-null  object
 9   D365_ItemNo               117843 non-null  object
 10  Material No               117843 non-null  object
 11  MOD                       117843 non-null  int64 
 12  site_id                   117843 non-null  object
 13  WAREHOUSELOCATIONID       117843 non-null  object
 14  D365_De

In [None]:
df5_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117843 entries, 0 to 117842
Data columns (total 19 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ActivityType              117843 non-null  object
 1   Order Number              117843 non-null  int64 
 2   Material_Description      117843 non-null  object
 3   BaseDate                  117843 non-null  object
 4   BaseHour                  117843 non-null  int64 
 5   Cases                     117843 non-null  int64 
 6   Bill I                    117603 non-null  object
 7   Source Channel            117843 non-null  object
 8   Order Category            117843 non-null  object
 9   D365_ItemNo               117843 non-null  object
 10  Material No               117843 non-null  object
 11  MOD                       117843 non-null  int64 
 12  site_id                   117843 non-null  object
 13  WAREHOUSELOCATIONID       117843 non-null  object
 14  D365

### Merge Products lookup table with SAP data

In [None]:
df5_2 = pd.merge(
    df5_1,
    top_products,
    how="left",
    on="D365_ItemNo",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
).copy()

In [None]:
df5_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117843 entries, 0 to 117842
Data columns (total 21 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ActivityType              117843 non-null  object
 1   Order Number              117843 non-null  int64 
 2   Material_Description      117843 non-null  object
 3   BaseDate                  117843 non-null  object
 4   BaseHour                  117843 non-null  int64 
 5   Cases                     117843 non-null  int64 
 6   Bill I                    117603 non-null  object
 7   Source Channel            117843 non-null  object
 8   Order Category            117843 non-null  object
 9   D365_ItemNo               117843 non-null  object
 10  Material No               117843 non-null  object
 11  MOD                       117843 non-null  int64 
 12  site_id                   117843 non-null  object
 13  WAREHOUSELOCATIONID       117843 non-null  object
 14  D365

### Merge Customers lookup table with SAP data

In [None]:
df5_3 = pd.merge(
    df5_2,
    top_customers,
    how="left",
    on="D365_Cust_No",
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,
).copy()

In [None]:
df5_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117843 entries, 0 to 117842
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ActivityType              117843 non-null  object
 1   Order Number              117843 non-null  int64 
 2   Material_Description      117843 non-null  object
 3   BaseDate                  117843 non-null  object
 4   BaseHour                  117843 non-null  int64 
 5   Cases                     117843 non-null  int64 
 6   Bill I                    117603 non-null  object
 7   Source Channel            117843 non-null  object
 8   Order Category            117843 non-null  object
 9   D365_ItemNo               117843 non-null  object
 10  Material No               117843 non-null  object
 11  MOD                       117843 non-null  int64 
 12  site_id                   117843 non-null  object
 13  WAREHOUSELOCATIONID       117843 non-null  object
 14  D365

In [None]:
df5_3.drop(columns={'site_id', 'D365_Cust_No', 'Material_Description', 'D365_ItemNo'}, inplace=True, axis=1)

In [None]:
df5_3.rename(columns={'site_id_LE': 'site_id', 'D365_Cust_No_LE': 'D365_Cust_No', 'Material_Description_LE': 'Material_Description', 'D365_ItemNo_LE': 'D365_ItemNo'}, inplace=True)

In [None]:
path = './data/'+ LE + '/15Dec_D365_orders_' + LE + '.feather'
df5_3.to_feather(path)