In [359]:
import json
import numpy as np
import pandas as pd
import lxml

# Import SAP data previously mapped for ZA1

15Dec_D365_orders_ZA1.csv

In [404]:
# Load the ZA1 order data (described in the heading above as SAP data that was
# previously mapped). low_memory=False makes pandas parse the file in one pass
# rather than in chunks, so each column's dtype is inferred from the whole column.
df5 = pd.read_csv("./data/ZA/15Dec_D365_orders_ZA1.csv", low_memory=False)

In [405]:
# Print column names, dtypes and non-null counts of the loaded orders.
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871832 entries, 0 to 871831
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ActivityType              871832 non-null  object
 1   Order Number              871832 non-null  int64 
 2   Material_Description      871832 non-null  object
 3   BaseDate                  871832 non-null  object
 4   BaseHour                  871832 non-null  int64 
 5   Cases                     871832 non-null  int64 
 6   Bill I                    867796 non-null  object
 7   Source Channel            871832 non-null  object
 8   Order Category            871832 non-null  object
 9   D365_ItemNo               871832 non-null  object
 10  Material No               871832 non-null  object
 11  MOD                       871832 non-null  int64 
 12  site_id                   871832 non-null  object
 13  WAREHOUSELOCATIONID       871832 non-null  object
 14  D365

# Import Products, Customers and Warehouses for the Legal Entity

In [437]:
# Target legal entity code. It selects which product / customer / warehouse
# files the loader cells read and is embedded in the output file path.
# The loader cells test for 'NA', 'MZ' and 'UG' (the warehouse loader only for
# 'NA' and 'MZ').
LE = 'MZ'

In [438]:
# Load the product list of the selected legal entity.
# The NA file is tab-separated; the MZ and UG files are comma-separated.
# The branches are mutually exclusive and end in an explicit error, so an
# unsupported LE fails here instead of leaving LE_Products undefined (or, in a
# long-lived kernel, silently reusing the frame loaded for a previous LE).
if LE == 'NA':
    LE_Products = pd.read_csv("./data/NA/Group1Products_NA1.csv", delimiter="\t")
elif LE == 'MZ':
    LE_Products = pd.read_csv("./data/MZ/Group1Products_MZ1.csv", delimiter=",")
elif LE == 'UG':
    LE_Products = pd.read_csv("./data/UG/Group1Products_UG1.csv", delimiter=",")
else:
    raise ValueError(f"Unsupported legal entity {LE!r}; expected 'NA', 'MZ' or 'UG'")
    

In [439]:
# Display the product list loaded for the selected legal entity.
LE_Products

Unnamed: 0,D365_ItemNo_LE,Material_Description_LE
0,10491,FANTA CASHEW 330ML 4X6C
1,2225,SPAR MORANGO 1L 6 S/W NP
2,4996,MONSTER ULTRA ZERO 06X04 500C
3,6032,SPAR C/SODA 330ML 4X6C
4,10805,CAPPY NEC EXOT 06X04 330ML EXP
...,...,...
107,10382,PREDATOR PURPLE 06X04 500ML CAN
108,2249,SPAR MORANGO 330ML 4X6C
109,3240,FANTA GRAPE 01X06 2000 SHR
110,3841,MAZOE BLACKBERRY 2L 6 PET


In [408]:
# Load the customer list of the selected legal entity and expose the customer
# account under the common column name 'D365_Cust_No_LE'.
# The NA and MZ files are tab-separated; the UG file is comma-separated.
# The branches are mutually exclusive and end in an explicit error, so an
# unsupported LE fails here instead of leaving LE_Customers undefined or stale.
if LE == 'NA':
    LE_Customers = pd.read_csv("./data/NA/NA1_Customers.csv", delimiter="\t")
    LE_Customers = LE_Customers.rename(columns={'Validated Customer Accounts': 'D365_Cust_No_LE'})
elif LE == 'MZ':
    LE_Customers = pd.read_csv("./data/MZ/PERF01_MZ1_Customers.csv", delimiter="\t")
    LE_Customers = LE_Customers.rename(columns={'CustomerAccount': 'D365_Cust_No_LE'})
elif LE == 'UG':
    LE_Customers = pd.read_csv("./data/UG/IPT3 UG1- Customers.csv", delimiter=",")
    LE_Customers = LE_Customers.rename(columns={'CustomerAccount': 'D365_Cust_No_LE'})
    # Only the account column is kept for UG.
    # NOTE(review): the original cell ended with a bare "# Name" comment here;
    # presumably a name column was considered and left out. Confirm.
    LE_Customers = LE_Customers[['D365_Cust_No_LE']].copy()
else:
    raise ValueError(f"Unsupported legal entity {LE!r}; expected 'NA', 'MZ' or 'UG'")

# De-duplicate on the account number now, before any positional 'Index' column
# is added: once such a unique column exists, whole-row de-duplication can no
# longer remove repeated accounts. Doing it here also makes len(LE_Customers),
# which decides how many top customers are selected, the number of distinct
# accounts. reset_index keeps the index gap-free for the positional pairing.
LE_Customers = LE_Customers.drop_duplicates(subset=['D365_Cust_No_LE']).reset_index(drop=True)

In [409]:
# Display the customer list loaded for the selected legal entity.
LE_Customers

Unnamed: 0,D365_Cust_No_LE
0,12015776
1,12015870
2,12016063
3,12016215
4,12016243
...,...
93,12031221
94,12031313
95,12031464
96,12031500


In [366]:
# Load the warehouse export of the selected legal entity.
# Only 'NA' and 'MZ' have a warehouse file configured. Any other value
# (including 'UG', which the product and customer loaders accept) raises here
# rather than leaving LE_Warehouses undefined or stale from an earlier run.
if LE == 'NA':
    LE_Warehouses = pd.read_csv("./data/NA/NA1_Validated  IPT3 Warehouse Export  NA1 Warehouses.csv", delimiter=",")
elif LE == 'MZ':
    LE_Warehouses = pd.read_csv("./data/MZ/IPT3 MZ1 PERF01 Warehouse.csv", delimiter=",")
else:
    raise ValueError(
        f"No warehouse export configured for legal entity {LE!r}; expected 'NA' or 'MZ'"
    )

## Get the top n warehouses and filter the dataframe for those

In [367]:
# Count the distinct order numbers handled by each source site.
# (The column is named 'OrderLinesCount', but nunique() counts orders.)
warehouse_order_counts = (
    df5.groupby('site_id')['Order Number']
    .nunique()
    .rename('OrderLinesCount')
    .reset_index()
)

# Rank the sites by that count and keep as many of them as the target legal
# entity has warehouses, so the two lists can later be paired one-to-one.
top_warehouses = (
    warehouse_order_counts
    .sort_values(by='OrderLinesCount', ascending=False)
    .head(len(LE_Warehouses))
)

# Show the selected sites.
print(top_warehouses)

   site_id  OrderLinesCount
6    ZA022             9899
13   ZA032             6010
0    ZA010             4586
2    ZA014             3869
5    ZA020             3099
9    ZA027             2950
1    ZA012             2664


In [368]:
# Restrict the order data to rows whose site is one of the selected top sites.
top_warehouse_ids = top_warehouses['site_id'].to_list()
df5_W = df5.loc[df5['site_id'].isin(top_warehouse_ids)]

In [369]:
# Renumber the ranked sites 0..n-1 so the index reflects the rank position.
top_warehouses = top_warehouses.reset_index(drop=True)

## Get the top n customers and filter the dataframe for those

In [370]:
# Count the distinct order numbers per customer within the site-filtered data.
# (The column is named 'OrderLinesCount', but nunique() counts orders.)
customer_order_counts = (
    df5_W.groupby('D365_Cust_No')['Order Number']
    .nunique()
    .rename('OrderLinesCount')
    .reset_index()
)

# Rank the customers by that count and keep as many of them as the target
# legal entity has customers.
top_customers = (
    customer_order_counts
    .sort_values(by='OrderLinesCount', ascending=False)
    .head(len(LE_Customers))
)

# Show the selected customers.
print(top_customers)

      D365_Cust_No  OrderLinesCount
1486      10017885               61
126       10004521               57
135       10004547               56
1488      10017888               52
2487      10027528               35
...            ...              ...
3747      10040508               11
4545      10048499               11
6385      10122363               11
5926      10113761               11
3364      10038149               11

[170 rows x 2 columns]


In [371]:
# Restrict the site-filtered data to rows of the selected top customers.
top_customer_ids = top_customers['D365_Cust_No'].to_list()
df5_W_C = df5_W.loc[df5_W['D365_Cust_No'].isin(top_customer_ids)]

In [372]:
# Renumber the ranked customers 0..n-1 so the index reflects the rank position.
top_customers = top_customers.reset_index(drop=True)

## Get the top n products and filter the dataframe for those

In [373]:
# Count the distinct order numbers per product within the data already
# restricted to the top sites and top customers.
# (The column is named 'OrderLinesCount', but nunique() counts orders.)
product_order_counts = (
    df5_W_C.groupby('D365_ItemNo')['Order Number']
    .nunique()
    .rename('OrderLinesCount')
    .reset_index()
)

# Rank the products by that count and keep as many of them as the target
# legal entity has products.
top_products = (
    product_order_counts
    .sort_values(by='OrderLinesCount', ascending=False)
    .head(len(LE_Products))
)

# Show the selected products.
print(top_products)

    D365_ItemNo  OrderLinesCount
69        10207              797
271   CB0200017              637
78         1040              559
61        10194              536
200        6540              504
..          ...              ...
31         0947               30
77        10382               29
203        6564               28
118        2518               28
58        10191               28

[112 rows x 2 columns]


In [374]:
# Restrict the site- and customer-filtered data to rows of the selected top products.
top_product_ids = top_products['D365_ItemNo'].to_list()
df5_W_C_P = df5_W_C.loc[df5_W_C['D365_ItemNo'].isin(top_product_ids)]

In [375]:
# Renumber the ranked products 0..n-1 so the index reflects the rank position.
top_products = top_products.reset_index(drop=True)

## Now replace warehouses, customers and products for the data in the new legal entity
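Extend the lookup dataframes produced above (`top_warehouses`, `top_customers` and `top_products`) by adding one column with the same number of rows. The new column holds the corresponding identifier from the new legal entity; rows are paired by position, i.e. the n-th ranked source entry is mapped to the n-th entry of the new legal entity's list.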

### Create Products lookup table

In [376]:
# Materialise each frame's current index as an explicit 'Index' column; it is
# the key used to pair the two frames row by row.
top_products = top_products.assign(Index=top_products.index)
LE_Products = LE_Products.assign(Index=LE_Products.index)

In [377]:
# The lookup pairs rows purely by position, so both frames must have the same
# number of rows; abort the run otherwise.
if len(LE_Products) != len(top_products):
    raise SystemExit("Mismatch in number of products")

In [378]:
# Display the LE product list, now including the 'Index' key column.
LE_Products

Unnamed: 0,D365_ItemNo_LE,Material_Description_LE,Index
0,10491,FANTA CASHEW 330ML 4X6C,0
1,2225,SPAR MORANGO 1L 6 S/W NP,1
2,4996,MONSTER ULTRA ZERO 06X04 500C,2
3,6032,SPAR C/SODA 330ML 4X6C,3
4,10805,CAPPY NEC EXOT 06X04 330ML EXP,4
...,...,...,...
107,10382,PREDATOR PURPLE 06X04 500ML CAN,107
108,2249,SPAR MORANGO 330ML 4X6C,108
109,3240,FANTA GRAPE 01X06 2000 SHR,109
110,3841,MAZOE BLACKBERRY 2L 6 PET,110


In [379]:
# Pair every ranked source product with the LE product that carries the same
# 'Index' value. A left join keeps all source products; sort=True orders the
# result by the join key.
top_products = top_products.merge(LE_Products, how="left", on="Index", sort=True)

In [380]:
# Drop the helper columns; only the source-to-LE product mapping is kept.
top_products = top_products.drop(columns=['OrderLinesCount', 'Index'])

In [381]:
# Display the finished product lookup table (source item number -> LE item).
top_products

Unnamed: 0,D365_ItemNo,D365_ItemNo_LE,Material_Description_LE
0,10207,10491,FANTA CASHEW 330ML 4X6C
1,CB0200017,2225,SPAR MORANGO 1L 6 S/W NP
2,1040,4996,MONSTER ULTRA ZERO 06X04 500C
3,10194,6032,SPAR C/SODA 330ML 4X6C
4,6540,10805,CAPPY NEC EXOT 06X04 330ML EXP
...,...,...,...
107,0947,10382,PREDATOR PURPLE 06X04 500ML CAN
108,10382,2249,SPAR MORANGO 330ML 4X6C
109,6564,3240,FANTA GRAPE 01X06 2000 SHR
110,2518,3841,MAZOE BLACKBERRY 2L 6 PET


### Create Customers lookup table

In [382]:
# Materialise each frame's current index as an explicit 'Index' column; it is
# the key used to pair the two frames row by row.
top_customers = top_customers.assign(Index=top_customers.index)
LE_Customers = LE_Customers.assign(Index=LE_Customers.index)

In [383]:
# The lookup pairs rows purely by position, so both frames must have the same
# number of rows; abort the run otherwise.
if len(LE_Customers) != len(top_customers):
    raise SystemExit("Mismatch in number of customers")

In [384]:
# Remove repeated customer accounts from the LE customer list.
# De-duplication is keyed on the account number: comparing whole rows would
# also compare the 'Index' column, whose values are all distinct, so no row
# would ever be considered a duplicate.
LE_Customers = LE_Customers.drop_duplicates(subset=['D365_Cust_No_LE']).reset_index(drop=True)

# Rebuild the positional key so it stays gap-free after rows were removed;
# gaps would leave some top customers without a partner in the Index merge.
LE_Customers['Index'] = LE_Customers.index

# Re-validate after de-duplication: the positional pairing needs equally long
# frames, and removing duplicates may have shortened the LE list.
if len(LE_Customers) != len(top_customers):
    raise SystemExit("Mismatch in number of customers")

In [385]:
# Pair every ranked source customer with the LE customer that carries the same
# 'Index' value. A left join keeps all source customers; sort=True orders the
# result by the join key.
top_customers = top_customers.merge(LE_Customers, how="left", on="Index", sort=True)

In [386]:
# Drop the helper columns; only the source-to-LE customer mapping is kept.
top_customers = top_customers.drop(columns=['OrderLinesCount', 'Index'])

### Create Warehouses lookup table

In [387]:
# Keep only the warehouse identifier column of the LE warehouse export.
LE_Warehouses = LE_Warehouses.loc[:, ['WAREHOUSEID']].copy()

In [388]:
# Name the identifier 'site_id_LE' so it can sit next to the source 'site_id'.
LE_Warehouses = LE_Warehouses.rename(columns={'WAREHOUSEID': 'site_id_LE'})

In [389]:
# Materialise each frame's current index as an explicit 'Index' column; it is
# the key used to pair the two frames row by row.
top_warehouses = top_warehouses.assign(Index=top_warehouses.index)
LE_Warehouses = LE_Warehouses.assign(Index=LE_Warehouses.index)

In [390]:
# The lookup pairs rows purely by position, so both frames must have the same
# number of rows; abort the run otherwise.
if len(LE_Warehouses) != len(top_warehouses):
    raise SystemExit("Mismatch in number of warehouses")

In [391]:
# Pair every ranked source site with the LE warehouse that carries the same
# 'Index' value. A left join keeps all source sites; sort=True orders the
# result by the join key.
top_warehouses = top_warehouses.merge(LE_Warehouses, how="left", on="Index", sort=True)

In [392]:
# Drop the helper columns; only the source-to-LE site mapping is kept.
top_warehouses = top_warehouses.drop(columns=['OrderLinesCount', 'Index'])

In [393]:
# Display the finished warehouse lookup table (source site_id -> LE site_id).
top_warehouses

Unnamed: 0,site_id,site_id_LE
0,ZA022,MZ010B
1,ZA032,MZ010Q
2,ZA010,MZ011B
3,ZA014,MZ011Q
4,ZA020,MZ012B
5,ZA027,MZ012Q
6,ZA012,MZ013B


### Merge Warehouses lookup table with SAP data

In [394]:
# Attach the LE warehouse ('site_id_LE') to every filtered order row by looking
# up its source 'site_id'. A left join keeps every order row; sort=True orders
# the result by 'site_id'.
df5_1 = df5_W_C_P.merge(top_warehouses, how="left", on="site_id", sort=True)

In [395]:
# Summarise the frame that went into the warehouse merge, for comparison with
# df5_1.info() (row counts before and after the join).
df5_W_C_P.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31442 entries, 3073 to 871548
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ActivityType              31442 non-null  object
 1   Order Number              31442 non-null  int64 
 2   Material_Description      31442 non-null  object
 3   BaseDate                  31442 non-null  object
 4   BaseHour                  31442 non-null  int64 
 5   Cases                     31442 non-null  int64 
 6   Bill I                    31393 non-null  object
 7   Source Channel            31442 non-null  object
 8   Order Category            31442 non-null  object
 9   D365_ItemNo               31442 non-null  object
 10  Material No               31442 non-null  object
 11  MOD                       31442 non-null  int64 
 12  site_id                   31442 non-null  object
 13  WAREHOUSELOCATIONID       31442 non-null  object
 14  D365_Del_Loc           

In [396]:
# Summarise the result of the warehouse merge (row count and added column).
df5_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31442 entries, 0 to 31441
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ActivityType              31442 non-null  object
 1   Order Number              31442 non-null  int64 
 2   Material_Description      31442 non-null  object
 3   BaseDate                  31442 non-null  object
 4   BaseHour                  31442 non-null  int64 
 5   Cases                     31442 non-null  int64 
 6   Bill I                    31393 non-null  object
 7   Source Channel            31442 non-null  object
 8   Order Category            31442 non-null  object
 9   D365_ItemNo               31442 non-null  object
 10  Material No               31442 non-null  object
 11  MOD                       31442 non-null  int64 
 12  site_id                   31442 non-null  object
 13  WAREHOUSELOCATIONID       31442 non-null  object
 14  D365_Del_Loc          

### Merge Products lookup table with SAP data

In [397]:
# Attach the LE product columns to every order row by looking up its source
# 'D365_ItemNo'. A left join keeps every order row; sort=True orders the result
# by 'D365_ItemNo'.
df5_2 = df5_1.merge(top_products, how="left", on="D365_ItemNo", sort=True)

In [398]:
# Summarise the result of the product merge (row count and added columns).
df5_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31442 entries, 0 to 31441
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ActivityType              31442 non-null  object
 1   Order Number              31442 non-null  int64 
 2   Material_Description      31442 non-null  object
 3   BaseDate                  31442 non-null  object
 4   BaseHour                  31442 non-null  int64 
 5   Cases                     31442 non-null  int64 
 6   Bill I                    31393 non-null  object
 7   Source Channel            31442 non-null  object
 8   Order Category            31442 non-null  object
 9   D365_ItemNo               31442 non-null  object
 10  Material No               31442 non-null  object
 11  MOD                       31442 non-null  int64 
 12  site_id                   31442 non-null  object
 13  WAREHOUSELOCATIONID       31442 non-null  object
 14  D365_Del_Loc          

### Merge Customers lookup table with SAP data

In [399]:
# Attach the LE customer account to every order row by looking up its source
# 'D365_Cust_No'. A left join keeps every order row; sort=True orders the
# result by 'D365_Cust_No'.
df5_3 = df5_2.merge(top_customers, how="left", on="D365_Cust_No", sort=True)

In [400]:
# Summarise the result of the customer merge (row count and added column).
df5_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31442 entries, 0 to 31441
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   ActivityType              31442 non-null  object
 1   Order Number              31442 non-null  int64 
 2   Material_Description      31442 non-null  object
 3   BaseDate                  31442 non-null  object
 4   BaseHour                  31442 non-null  int64 
 5   Cases                     31442 non-null  int64 
 6   Bill I                    31393 non-null  object
 7   Source Channel            31442 non-null  object
 8   Order Category            31442 non-null  object
 9   D365_ItemNo               31442 non-null  object
 10  Material No               31442 non-null  object
 11  MOD                       31442 non-null  int64 
 12  site_id                   31442 non-null  object
 13  WAREHOUSELOCATIONID       31442 non-null  object
 14  D365_Del_Loc          

In [401]:
# Remove the source-entity columns; their '_LE' counterparts take their place.
df5_3 = df5_3.drop(
    columns=['site_id', 'D365_Cust_No', 'Material_Description', 'D365_ItemNo']
)

In [402]:
# Give the '_LE' columns the original column names, so the output keeps the
# same column names as the source data.
df5_3 = df5_3.rename(
    columns={
        'site_id_LE': 'site_id',
        'D365_Cust_No_LE': 'D365_Cust_No',
        'Material_Description_LE': 'Material_Description',
        'D365_ItemNo_LE': 'D365_ItemNo',
    }
)

In [403]:
# Write the remapped orders to the legal entity's data folder in Feather format.
path = f"./data/{LE}/15Dec_D365_orders_{LE}.feather"
df5_3.to_feather(path)