# Standard SAP JET Cleaning Script

=========================================================================================================================

## Statement
     SAP ERP typically generates two files for JET analysis purposes:
         1. BKPF - Header file
         2. BSEG - Journal Line file
     Due to file size limits, the audit team will sometimes provide multiple files of each kind;
     in this example, for instance, there are 2 BKPF files and 2 BSEG files.

=========================================================================================================================

## Import Essential Packages
    These are common libraries and packages used across most SAP jobs

In [None]:
import pandas as pd
from pandas import DataFrame

import csv

import numpy as np

from pathlib import Path

from glob import glob

import datetime
from datetime import datetime

import pyodbc

In [None]:
df=pd.DataFrame()

In [None]:
class DataFrameID:
    """Attach a sequential ``DTT_ID`` row identifier to a DataFrame.

    The wrapper keeps a reference to the frame it is given (no copy), so
    ``AddID`` modifies that frame in place.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame that should receive the ``DTT_ID`` column.
    """

    def __init__(self, df):
        # Use the frame passed by the caller rather than a notebook global,
        # so the class works for any table (headers, lines, joined, ...).
        self.df = df

    def AddID(self):
        """Add (or overwrite) a ``DTT_ID`` column numbered 0..len(df)-1."""
        # Size the range from the wrapped frame itself so the lengths always match.
        self.df['DTT_ID'] = np.arange(len(self.df))

=========================================================================================================================

## Load Tables

#### load all header tables

aka. BKPF

The following code can handle any number of header files.

In practice, for any DS job, the first thing to do is check that the data loads properly. If there are errors while loading, more
pre-cleaning is needed (e.g. fixes for delimiters or unexpected newlines -- see fix_delimiter.py in Common Code).

In [None]:
input_data_path = r'C:\PlaceWhereFilesAre'

In [None]:
# Extract just the header files from the raw data
# An empty glob pattern matches nothing, so the search is built from input_data_path instead.
# NOTE(review): the "je_JEHeaders_*.txt" pattern is assumed by analogy with the JEDetails line files --
# update the folder/pattern (or simplify the glob) if the header files are named differently
headers_dir=glob(str(Path(input_data_path)/'je_JEHeaders_*.txt'))
headers_dir

In [None]:
# import header tables: one all-string dataframe per file listed in headers_dir
header_read_options = dict(
    sep=',',
    dtype=str,
    encoding='utf-8',
    header=0,
    index_col=None,
    engine='c',
    # quoting=csv.QUOTE_NONE,
    # error_bad_lines=False, engine='python'
)
headers = [pd.read_csv(header_file, **header_read_options) for header_file in headers_dir]

In [None]:
# stack the per-file header tables into one dataframe; ignore_index=True renumbers the rows 0..n-1
df_headers=pd.concat(headers, ignore_index=True)
df_headers.shape # rowcount

In [None]:
df_headers.head(200)

#### load all journal line tables

aka. BSEG

In [None]:
# Extract the line files from the raw data
# The pattern is built from input_data_path (a plain r'...' string would keep "{input_data_path}" as literal text)
# Update the folder/pattern to match the input files and simplify the glob if they don't have "JEDetails" file names
lines_dir=glob(str(Path(input_data_path)/'je_JEDetails_*.txt'))
lines_dir

In [None]:
# import line tables
def _read_line_file(path):
    """Read one journal line (BSEG) file into a dataframe with every column kept as text."""
    return pd.read_csv(
        path,
        sep='|',                 # change to the correct delimiter where needed
        dtype=str,
        encoding='utf-8',        # try 'windows-1252' if 'utf-8' returns an error
        header=0,                # can be commented out where needed
        index_col=None,
        engine='c',
        quoting=csv.QUOTE_NONE,  # change this if quotes are used as text qualifiers
        # error_bad_lines=False,  # used to check and return bad lines (on_bad_lines=... on pandas >= 1.3)
        # engine='python',        # used to return bad lines with clear line-by-line formality
        # low_memory=False        # if engine='c' is on, low_memory needs to be commented off
    )


lines = [_read_line_file(line_file) for line_file in lines_dir]

In [None]:
df_lines=pd.concat(lines, ignore_index=True) # transform list to dataframe
df_lines.shape # original rowcount

In [None]:
df_lines.head(2)

#### remove leading and trailing space

In [None]:
df_headers.dtypes

In [None]:
df_lines.dtypes

In [None]:
# trim leading/trailing whitespace from every object-dtype column, in place, for headers and lines
def _strip_text_columns(frame):
    """Strip whitespace in all object-dtype columns of ``frame`` and return those column labels."""
    text_cols = frame.select_dtypes(include=['object']).columns
    frame[text_cols] = frame[text_cols].apply(lambda column: column.str.strip())
    return text_cols


cols_headers = _strip_text_columns(df_headers)
cols_lines = _strip_text_columns(df_lines)

In [None]:
# drop every space character from the column labels of both tables
for _table in (df_headers, df_lines):
    _table.columns = _table.columns.str.replace(' ', '')

In [None]:
df_headers.dtypes

In [None]:
df_lines.dtypes

In [None]:
# sense check
df_lines[df_lines['SHKZG']=='H'].head(2)

#### Join header to journal line tables

Ideally, join the header and line tables right after loading the raw header and line tables.

The exception is when both tables have a large number of columns (e.g. 250+ columns) and a massive number of rows (e.g. 50 million rows):
in that case you might first subset the raw data based on the column mapping to reduce the file size, and then join.

In [None]:
# join tables and check rowcounts
join_keys = ['MANDT', 'BUKRS', 'BELNR', 'GJAHR']  # key columns present in both tables
df_joined = df_lines.merge(
    df_headers,
    how='left',  # left or inner join depending on the needs
    left_on=join_keys,
    right_on=join_keys,
    suffixes=('', '_y'),  # overlapping header columns get a '_y' suffix, line columns keep their name
)
df_joined.shape  # rowcount should equal the line table's original rowcount above unless there are filtering conditions

In [None]:
df_joined.head(2)

In [None]:
# check missing values and how many there are in each column
df_joined.isna().sum()

=========================================================================================================================

## Locate Essential Paths

Set the local file paths here; from this step forward, all files are consistently
saved under one of these paths.

This is the only working directory (path) that needs to be changed to your current master path.

In [None]:
# master working folder and the output folder beneath it
master_path = Path(r'C:\Users\...')  # must change the path depending on where you put your data
output_path = master_path.joinpath('3.Cleaned')  # adjust the folder name where needed

## Prepare Column Mapping

In [None]:
# This step does not affect the rest of the script, it's just to remind you to look at the column mapping
# NOTE(review): the path below is an empty string, so read_excel has no file to open as written --
# fill in the location of the standard SAP column mapping workbook before running this cell
col_mapping_file = pd.read_excel(r'')
col_mapping_file

In [None]:
# create a list of the needed source column names (renamed later via col_mapping)
col_header_name = ['BUKRS', # COMPANY_CODE
                   'BUZEI', # LINE_ITEM
#                   'DTT_ID', # only needed when BUZEI not unique
                   'Journal ID', # Journal ID consists of 4 columns
                   'Journal description', # Journal description consists of 2 columns with conditions
                   'HKONT', # GL_ACCOUNT_NUMBER
                   'Amount', # DMBTR * SHKZG
                   'BUDAT', # POSTING_DATE
                   'CPUDT', # ENTRY_DATE
                   'CUKY', # optional
                   'Foreign currency amount', # WRBTR * SHKZG
                   'BLART', # DOCUMENT_TYPE
                   'USNAM', # POSTING_USER
                   'TCODE' # TRANSACTION_CODE
                  ]

In [None]:
# source column name -> output column name; the values (in this order) become the output header
col_mapping = {'BUKRS': 'Entity',
               'BUZEI': 'Journal line number',
#               'DTT_ID': 'Journal line number manual',
               'Journal ID': 'Journal ID',
               'Journal description': 'Journal description',
               'HKONT': 'GL account',
               'Amount': 'Amount',
               'BUDAT': 'Date effective',
               'CPUDT': 'Date posted',
               'CUKY': 'Currency code',
               'Foreign currency amount': 'Foreign currency amount',
               'BLART': 'Document type',
               'USNAM': 'Posting user',
               'TCODE': 'Transaction code'
              }

In [None]:
col_header = list(col_mapping.values())
col_header

=========================================================================================================================

## Wrangle Data

This section wrangles and transforms the data into the required format.
Pre-validation is also included in this section.

#### deal with date format and pre validate date range

In [None]:
# check date effective range
budat_parsed = pd.to_datetime(df_joined['BUDAT'])  # parse once, reuse for both ends of the range
print(budat_parsed.min(), budat_parsed.max())

In [None]:
# if the format looks irregular or the date range does not make sense, convert the column explicitly
# pd.to_datetime is used because recent pandas versions reject the unit-less 'datetime64' dtype in .astype
df_joined['BUDAT']=pd.to_datetime(df_joined['BUDAT'])

# otherwise, pass a more detailed format (alter where needed) and add errors='coerce' where needed
# df_joined['BUDAT']=pd.to_datetime(df_joined['BUDAT'],format='%Y%m%d',errors='coerce')

In [None]:
# check date effective range again after conversion
print(df_joined['BUDAT'].min(), 
      df_joined['BUDAT'].max())

In [None]:
# also check date posted format
cpudt_parsed = pd.to_datetime(df_joined['CPUDT'])  # parse once, reuse for both ends of the range
print(cpudt_parsed.min(), cpudt_parsed.max())

In [None]:
# if the format looks irregular or the date range does not make sense, convert the column explicitly
# pd.to_datetime is used because recent pandas versions reject the unit-less 'datetime64' dtype in .astype
df_joined['CPUDT']=pd.to_datetime(df_joined['CPUDT'])

# otherwise, pass a more detailed format (alter where needed) and add errors='coerce' where needed
# df_joined['CPUDT']=pd.to_datetime(df_joined['CPUDT'],format='%Y%m%d',errors='coerce')

In [None]:
# check date posted format again after conversion
print(df_joined['CPUDT'].min(), 
      df_joined['CPUDT'].max())

#### deal with amount and foreign currency amount format

In [None]:
# check missing values in DMBTR column
df_joined[df_joined['DMBTR'].isnull()].shape

In [None]:
# convert DMBTR to float so it can be multiplied by the debit/credit sign
df_joined['DMBTR']=df_joined['DMBTR'].astype('float')

In [None]:
# sign from the SHKZG flag: 'H' rows become -1, every other value (including missing) becomes +1
is_h_flag = df_joined['SHKZG'].eq('H')
df_joined['indicator'] = is_h_flag.map({True: -1, False: 1})
# signed amount = sign * DMBTR
df_joined['Amount'] = df_joined['indicator'].mul(df_joined['DMBTR'])

In [None]:
# check netting to 0
round(df_joined['Amount'].sum(),2)

In [None]:
# same process for foreign currency amount

df_joined['WRBTR']=df_joined['WRBTR'].astype('float')

In [None]:
# signed foreign currency amount = sign * WRBTR
# more conditions may apply when calculating the foreign currency amount, so pay attention and modify accordingly
df_joined['Foreign currency amount'] = df_joined['indicator'].mul(df_joined['WRBTR'])

#### form correct journal ID column

In [None]:
# check if any missing values in journal ID major column
df_joined[df_joined['BELNR'].isnull()].shape

In [None]:
# build the journal ID as MANDT_BUKRS_BELNR_GJAHR (each part converted to text first)
id_parts = [df_joined[key_col].map(str) for key_col in ('MANDT', 'BUKRS', 'BELNR', 'GJAHR')]
df_joined['Journal ID'] = id_parts[0].str.cat(id_parts[1:], sep='_')
df_joined['Journal ID'].head(2)

#### form correct journal description column

In [None]:
# concatenate BKTXT and SGTXT (both columns of df_joined) to form the journal description
# NOTE: a missing value in either column makes the combined description missing as well,
# which is why the preview below filters on notnull()
df_joined['Journal description']=df_joined['BKTXT'] + ' ' + df_joined['SGTXT']
df_joined[df_joined['Journal description'].notnull()].head(2)

In [None]:
# strip every double-quote character out of the journal description (plain text match, no regex)
description = df_joined['Journal description']
df_joined['Journal description'] = description.str.replace('"', '', regex=False)

#### assign 'N/A' as value only when there is no BLART column

In [None]:
# df_joined['BLART']='N/A'

#### assign 'N/A' as value only when there is no USNAM column

In [None]:
# df_joined['USNAM']='N/A'

=========================================================================================================================

## Map columns to subset JET required data

In [None]:
# keep only the required source columns, rename them via col_mapping, then order them as col_header
df_subset = (
    df_joined.loc[:, col_header_name]
    .rename(columns=col_mapping)
    .loc[:, col_header]
)
df_subset.head(2)

In [None]:
# review of data types of each column after formatting
df_subset.dtypes

=========================================================================================================================

## Post Validation

#### check data range again

In [None]:
print(df_subset['Date effective'].min(), df_subset['Date effective'].max())
# within required analysis range

#### check netting to 0 again

In [None]:
round(df_subset['Amount'].sum(),2)

#### check netting to 0 by each Entity

In [None]:
# total Amount per Entity, displayed to 2 decimals
sum_by_entity = df_subset.groupby('Entity')[['Amount']].sum()
sum_by_entity.round(2)

#### check nets to 0 by each journal ID

In [None]:
# per Journal ID: number of lines and net Amount, rounded to 2 decimals
journal_groups = df_subset.groupby('Journal ID')
net_by_jID = journal_groups.agg({'Journal line number': 'count',
                                 'Amount': 'sum'}).round(2)
net_by_jID.head(2)

In [None]:
# list the journal IDs whose rounded net Amount is not zero
is_unbalanced = net_by_jID['Amount'].abs() > 0
unbalanced = net_by_jID.loc[is_unbalanced].rename(
    columns={'Journal line number': 'Count', 'Amount': 'Sum'}
)
unbalanced

#### check uniqueness

In [None]:
# check duplications on [Journal ID] and [Journal Line Number]
check_unique=df_subset.groupby(['Journal ID','Journal line number']).size()
check_unique.head(2)

In [None]:
# bring up duplications
check_unique[check_unique>1]#.to_frame().shape

    If the dataset is large, running all the aggregations together in one group-by can take a fairly long time, so they can be run separately instead.

In [None]:
# check uniqueness upon grouped journal ID
# unique_cols=df_subset.groupby('Journal ID').agg({'Document type': lambda w: w.nunique(),
#                                                  'Posting user': lambda x: x.nunique(),
#                                                  'Date effective': lambda y: y.nunique(),
#                                                  'Date posted': lambda z: z.nunique()})
# unique_cols

In [None]:
# count distinct document types per journal (built-in 'nunique' avoids calling a Python lambda per group)
unique_jID_doc=df_subset.groupby('Journal ID').agg({'Document type': 'nunique'})
# journals carrying more than one document type
unique_jID_doc[unique_jID_doc['Document type']>1]

In [None]:
# count distinct posting users per journal (built-in 'nunique' avoids calling a Python lambda per group)
unique_jID_user=df_subset.groupby('Journal ID').agg({'Posting user': 'nunique'})
# journals carrying more than one posting user
unique_jID_user[unique_jID_user['Posting user']>1]

In [None]:
# count distinct effective dates per journal (built-in 'nunique' avoids calling a Python lambda per group)
unique_jID_e_date=df_subset.groupby('Journal ID').agg({'Date effective': 'nunique'})
# journals carrying more than one effective date
unique_jID_e_date[unique_jID_e_date['Date effective']>1]

In [None]:
# count distinct posted dates per journal (built-in 'nunique' avoids calling a Python lambda per group)
unique_jID_p_date=df_subset.groupby('Journal ID').agg({'Date posted': 'nunique'})
# journals carrying more than one posted date
unique_jID_p_date[unique_jID_p_date['Date posted']>1]

#### check double quotes

For large datasets, if double quotes have already been addressed prior to loading, this step can be skipped, as it can take a very long time.

In [None]:
# flag, cell by cell, whether the text form of each value contains a double quote
check_quotes = (
    df_subset.astype(str)
    .apply(lambda column: column.str.contains('"', na=False))
    .to_numpy()
)
# show the first rows having a double quote in at least one column
df_subset.loc[check_quotes.any(axis=1)].head(2)

#### check if any Irregular GL account format

In [None]:
# overview all GL account to spot any irregular GL account format
df_subset['GL account'].unique()

=========================================================================================================================

## Generate EXP Tables

In [None]:
# re order columns if needed
df_final=df_subset.reindex(columns=col_header)

In [None]:
# final rowcount check
df_final.shape

#### movement by gl

In [None]:
# create sum_by_gl.xlsx
sum_by_gl=df_final.groupby('GL account').agg({'Journal line number': 'count',
                                              'Amount': 'sum'}
                                            ).rename(columns = {'Journal line number':'Count',
                                                                'Amount':'Sum'})
sum_by_gl['Sum']=round(sum_by_gl['Sum'],2)
sum_by_gl

In [None]:
# export the per-GL-account summary to an Excel workbook under output_path
sum_by_gl.to_excel(output_path/'sum_by_gl.xlsx')

#### document types

In [None]:
# per document type: distinct journals, line count and total Amount
# (built-in 'nunique' replaces a per-group Python lambda; the result is the same)
doc_type=df_final.groupby('Document type').agg({'Journal ID': 'nunique',
                                                'Journal line number': 'count',
                                                'Amount': 'sum'}
                                              ).rename(columns = {'Journal ID':'Number of entries',
                                                                  'Journal line number':'Number of lines',
                                                                  'Amount':'Total amount'}).reset_index()
# flag column initialised to 0 for every document type
doc_type['Is standard document type'] = 0
doc_type['Total amount']=doc_type['Total amount'].round(2)
doc_type

In [None]:
# export the document type summary as CSV: every field quoted, dataframe index not written
doc_type.to_csv(output_path/'document_type.csv',quoting=csv.QUOTE_ALL,index=False)

#### posting user

In [None]:
# per posting user: distinct journals, line count and total Amount
# (built-in 'nunique' replaces a per-group Python lambda; the result is the same)
posting_user=df_final.groupby('Posting user').agg({'Journal ID': 'nunique',
                                                   'Journal line number': 'count',
                                                   'Amount': 'sum'}
                                                 ).rename(columns = {'Journal ID':'Number of entries',
                                                                     'Journal line number':'Number of lines',
                                                                     'Amount':'Total amount'}).reset_index()
# flag columns initialised to 0 for every user
posting_user['Is system entry'] = 0
posting_user['User of interest'] = 0
posting_user['Total amount']=posting_user['Total amount'].round(2)
posting_user

In [None]:
# export the posting user summary as CSV: every field quoted, dataframe index not written
posting_user.to_csv(output_path/'posting_user.csv',quoting=csv.QUOTE_ALL,index=False)

#### journal lines

In [None]:
# final review of journal lines
df_final.head(2)

In [None]:
# export the final journal lines as CSV: every field quoted, dataframe index not written
df_final.to_csv(output_path/'journal_lines.csv',quoting=csv.QUOTE_ALL,index=False)

=========================================================================================================================

# End of Script