# Load packages

In [1]:
import pandas as pd
import polars as pl
import xlsx2csv
from pandas import DataFrame
import numpy as np
from pathlib import Path
from glob import glob
import pyodbc
import datetime
from datetime import datetime as dt
from typing import Union, List
import zipfile
from zipfile import ZipFile
import matplotlib.pyplot as plt
import csv

import dttlib
from importlib import reload
reload(dttlib)
from dttlib.data.reading import read_data
from dttlib.data.uploading import DataFrameUploader

import swifter
import pickle
import re
import os

import PyPDF2
import tabula
import camelot

=========================================================================================================================

# Code viewing settings

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Optional display tweaks, disabled by default; uncomment the ones needed.
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)
#pd.set_option('display.width', None)
#np.set_printoptions(threshold=np.inf)
#pd.set_option('display.float_format', lambda x: '%.5f' % x)

=========================================================================================================================

# Define working directory

In [3]:
# Project root; TODO: fill in the engagement folder before running.
master_path = Path(r'')

# Stage folders resolved relative to the project root.
raw_path = master_path.joinpath('2. Original Data')
input_path = master_path.joinpath('3. Pre-cleaned Data')
output_path = master_path.joinpath('4. Cleaned data')

=========================================================================================================================

# Read data

## Read delimited data using dttlib

In [None]:
# Load the raw extract through dttlib's read_data helper.
# TODO: fill in the source path and the file extension before running.
# NOTE(review): the keyword arguments look like pandas.read_csv options
# (first row as header, '|' separator, UTF-8, quote handling disabled) —
# confirm how read_data forwards them.
df=read_data(r'',
             file_extension='',
             header=0,
             delimiter='|',
             encoding='utf-8',
             quoting=csv.QUOTE_NONE,
#             error_bad_lines=False,
#             engine='python'
            )

# Optional: add a 1-based sequential row identifier as the first column.
#df.insert(0, 'DTT_ID', range(1, 1 + len(df)))


# Quick sanity checks: dimensions, column dtypes and the first rows.
print(df.shape,'\n')

print(df.dtypes,'\n')

df.head(3)


## Read delimited data

In [None]:
# Collect every *.actt file under <raw_path>/actt_1/lines (non-recursive glob).
list_dir=glob(fr'{raw_path}/actt_1/lines/*.actt')
# Display the matched paths so a missing or misnamed file is spotted early.
list_dir

In [None]:
# Read each matched file as text (dtype=str), taking the first row as header.
# Fallback options for malformed files:
#   delim_whitespace=True, error_bad_lines=False, engine='python', quoting=csv.QUOTE_NONE
frames = [
    pd.read_csv(
        path,
        encoding='utf-8',
        sep=',',
        dtype=str,
        header=0,
        index_col=None,
        engine='c',
    )
    for path in list_dir
]

# ignore_index=True renumbers the stacked rows; without it the per-file indexes
# repeat and later steps can fail with "cannot reindex from a duplicate axis".
df = pd.concat(frames, ignore_index=True)

print(df.shape, '\n')

print(df.dtypes, '\n')

df.head(3)

## Read fixed width

In [None]:
# get files
# TODO: put the source folder expression inside the braces; the empty `{}`
# placeholder is not valid f-string syntax as it stands.
# NOTE(review): `dir` shadows the Python builtin of the same name; the
# fixed-width read cell iterates over this variable, so rename both together.
dir=glob(fr'{}/*.txt')
dir

In [None]:
# Column boundaries for the fixed-width layout.
# TODO: fill in each (start, end) pair; the empty `(,)` placeholders do not parse.
colspecs = [(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,),(,)]


# Read every file in `dir` as text with no header row, tag each row with the
# base name of the file it came from, and stack the results with a fresh index.
# TODO: replace the `names` placeholders with the real column names.
df=pd.concat([pd.read_fwf(fp,dtype=str,index_col=None,header=None,colspecs=colspecs,names=['','',''],
#                          skiprows=28,skipfooter=20,
                         ).assign(DTT_FILENAME=os.path.basename(fp)) for fp in dir],ignore_index=True) 

# 1-based sequential row identifier placed as the first column.
df.insert(0, 'DTT_ID', range(1, 1 + len(df)))

# Optional: derive a 'Ledger' column from the sixth '_'-separated token of the file name.
#df.insert(1, 'Ledger', df['DTT_FILENAME'].apply(lambda x: x.split('_')[5]).str.replace('.txt','').str.strip())

# Optional: drop rows whose 'Account' is missing or holds one of the listed
# non-data tokens (NOTE(review): presumably report headers/footers — confirm).
#df=df[(~df['Account'].isnull()) &\
#        (~df['Account'].isin(['Currenc', 'Ledger', 'Account','-------','TOTA','T','\x0c']))]

# Quick sanity checks: dimensions, column dtypes and the first rows.
print(df.shape,'\n')

print(df.dtypes,'\n')

df.head(3)

## Read fixed width alternatives

In [None]:
# Let read_fwf infer the column boundaries itself (no colspecs given) and
# treat the first line as data rather than a header.
# TODO: fill in the file name under input_path.
df=pd.read_fwf(input_path/'',header=None)

print(df.shape,'\n') 

print(df.dtypes,'\n')

# Inspect the first 21 rows to judge how the columns were split.
df.head(21)

In [None]:
# Preview only (result is not stored): split the column labelled 1 on tab
# characters, one output column per piece.
df[1].str.split('\t', expand=True)

In [None]:
# Tab-split every column; the resulting pieces are labelled with the source
# column label as a prefix, then all pieces are placed side by side.
splits = []
for source_col in df.columns:
    pieces = df[source_col].str.split(pat='\t', expand=True)
    splits.append(pieces.add_prefix(source_col))

clean_df = pd.concat(splits, axis=1)

=========================================================================================================================

# Pre processing

## Trim and remove double quotes

In [None]:
%%time

def _unquote_and_trim(column):
    """Remove double-quote characters from a string column and strip surrounding whitespace."""
    return column.str.replace('"', '').str.strip()


# Apply the clean-up to every object-dtype column.
cols = df.select_dtypes(['object']).columns
df[cols] = df[cols].apply(_unquote_and_trim)

## Rename NaN columns

In [None]:
df=df.set_axis([,'DTT_FILENAME'],axis='columns',inplace=False)

## Union

In [None]:
# Stack the frames vertically and renumber the rows from 0.
# TODO: replace the three `df` placeholders with the frames to combine.
df_union=pd.concat([df,df,df], ignore_index=True)

print(df_union.shape,'\n')

print(df_union.dtypes,'\n')

df_union.head(2)

In [None]:
#df_union.to_csv(input_path/'.csv',sep='|',index=False) #quoting=csv.QUOTE_ALL,

## Join

In [None]:
# Left join: every row of the left frame is kept. Overlapping non-key column
# names keep their name on the left side and get a '_y' suffix on the right.
# TODO: replace the `df` placeholders and fill in the key columns.
df_join=pd.merge(df,df,how='left',suffixes=('', '_y'),
                 left_on=[''],
                 right_on=[''])

# Compare the row count with the left frame: a higher count means the right
# side has duplicate keys.
print(df_join.shape,'\n')

print(df_join.dtypes,'\n')

df_join.head(2)

In [None]:
#df_join.to_csv(input_path/'.csv',sep='|',index=False) #quoting=csv.QUOTE_ALL,

=========================================================================================================================

# Upload data to database using dttlib

In [None]:
# Push `df` to a database table through dttlib's DataFrameUploader.
# TODO: fill in server, database, table name and initials.
# NOTE(review): pk='DTT_ID' requires that column to exist in df; overwrite=True
# presumably replaces an existing table and nvarchar_size is passed as the
# string '500' — confirm both against the dttlib documentation.
uploader=DataFrameUploader(server='', database='')
uploader.upload(df,table_name='',initials='',pk='DTT_ID',overwrite=True,nvarchar_size='500')

=========================================================================================================================

# Pre-validate

## Check rowcounts

In [None]:
# Rows per source file, computed once and reused for the grand total.
rows_per_file = df['DTT_FILENAME'].value_counts()

print(rows_per_file, '\n')
print('Total row count:', sum(rows_per_file))

## Check missing values

In [None]:
# Number of missing (NaN/None) values per column.
df.isna().sum()

## Check analysis period

In [None]:

# Parse the date column once (expects YYYY-MM-DD text) and report its span.
# TODO: fill in the date column name.
parsed_dates = pd.to_datetime(df[''], format='%Y-%m-%d')

print('Date ranges from ', parsed_dates.min(), ' to ', parsed_dates.max())

## Check consistencies

In [None]:
# Distinct values of a column, listed in ascending sort order, to eyeball
# inconsistent codes or formats. TODO: fill in the column name (both places).
df.sort_values(by=[''],ascending = True)[''].unique()

## Check sum

In [None]:
# Keep only the rows whose value in the chosen column is NOT in the exclusion list.
# TODO: fill in the column name and the values to exclude.
df=df[~df[''].isin([''])]

print(df.shape)

df.head(2)

In [None]:
# Missing amounts become the text '0.00' (Amount is still a string column here).
mask = df['Amount'].isna()
df.loc[mask, 'Amount'] = '0.00'

In [None]:
# Move a trailing minus sign to the front of the text (e.g. '123.45-' -> '-123.45').
# TODO: fill in the amount column name (all four places).
# na=False: for missing values .str.endswith returns NaN, and a mask that
# contains NaN cannot be used for boolean indexing with .loc.
mask=df[''].str.endswith('-',na=False)
df.loc[mask,'']= '-' + df[''].str.rstrip('-')

In [None]:
# convert amount as float
# Strip thousands separators, cast to float and negate the rows flagged 'H';
# all other rows keep their sign. TODO: fill in the debit/credit indicator column.
df['Amount']=df['Amount'].str.replace(',','').astype('float') * df[''].apply(lambda x: -1 if x =='H' else 1)

# breakdowns
# After the sign flip above 'H' (credit) rows are the negative ones, so the
# debit total is the sum of positive amounts and the credit total the sum of
# negative amounts. TODO: fill in the amount column name.
print('Total S (Debit):',round(df[df['']>0][''].sum(),2),'\n')
print('Total H (Credit):',round(df[df['']<0][''].sum(),2))

# check sum
print(round(df['Amount'].sum(),2),'\n')

# Same check per source file.
print(round(df.groupby('DTT_FILENAME').agg({'Amount': 'sum'}),2).rename(columns={'Amount':'Sum'}))

## Generate movement by gl

In [None]:
# Net movement per account on the raw data, rounded to 2 decimals.
sum_by_xx_raw = (
    df.groupby('Account Number')
    .agg({'Amount': 'sum'})
    .rename(columns={'Amount': 'Sum'})
    .round(2)
    .reset_index()
)
sum_by_xx_raw

In [None]:
# export if needed
#sum_by_xx_raw.to_excel(output_path/'sum_by_gl_raw.xlsx',index=False)

=========================================================================================================================

# Clean

## Format column headers

In [None]:
# Header formatting options — keep the line(s) needed and delete the rest.
# Run as-is, the statements execute in sequence, so the headers end up lower-case.
df.columns=df.columns.str.title()
df.columns=df.columns.str.upper()
df.columns=df.columns.str.lower()
# TODO: fill in the text to find and its replacement.
df.columns=df.columns.str.replace('','')

## Clean date format

### Normal case

In [None]:
# alternatively
# Parse text such as '2020/07/08 13:45:00' into datetime64 values.
# TODO: fill in the column name (both places) and adjust the format if needed.
df['']=pd.to_datetime(df[''],format='%Y/%m/%d %H:%M:%S')

### Special case 1

In [None]:
# Count the rows whose date text contains '31/' (missing values count as no match).
df[df[''].str.contains('31/',na=False)].shape

In [None]:
# Preview a character slice of the column; set the slice bounds to inspect
# the part of the text that will be kept.
df[''].str[:]

In [None]:
# Replace day 31 with day 28 on the rows whose date text contains '31/'.
# NOTE(review): assumes the text starts with a two-digit day followed by '/'
# (DD/...) — confirm against the data.
# TODO: fill in the date column name (all three places).
mask1=df[''].str.contains('31/',na=False)
# Slice from position 2 so the '/' separator after the day is kept, and read
# from the same column that is being written.
df.loc[mask1,'']='28'+df[''].str[2:]

In [None]:
# Count the rows whose date text contains '-30' (missing values count as no match).
df[df[''].str.contains('-30',na=False)].shape

In [None]:
# On rows whose date text contains '-30', keep the first 8 characters and
# append '28'.
# NOTE(review): this presumably targets 'YYYY-MM-30' text ('YYYY-MM-' is 8
# characters) — confirm the format before running.
mask1=df[''].str.contains('-30',na=False)
df.loc[mask1,'']=df[''].str[:8]+'28'

In [None]:
# Parse BKPF-BUDAT (YYYYMMDD text) once and print its earliest and latest value.
parsed_budat = pd.to_datetime(df['BKPF-BUDAT'], format='%Y%m%d')
print(parsed_budat.min(), parsed_budat.max())

### Case like '08/07/20'

In [None]:
# Split the date text on '/' into three new columns.
# NOTE(review): the day/month/year order assumes DD/MM/YY input — confirm.
# TODO: fill in the source column name.
df[['day','month','year']]=df[''].str.split('/',expand=True)

In [None]:
# Two-character years get the prefix '20' (e.g. '20' -> '2020'); other values are left alone.
mask_year_length=df['year'].str.len()==2
df.loc[mask_year_length,'year'] = '20' + df['year']

In [None]:
# Reassemble the parts as 'YYYY-MM-DD' text, then convert that same column to
# datetime. TODO: fill in the target column name (all four places).
df['']=df['year'].map(str) + '-' + df['month'].map(str) + '-' + df['day'].map(str)
# The unit is spelled out because pandas 2.x rejects the unit-less
# 'datetime64' dtype in astype; the conversion reads the column built above
# rather than a hard-coded 'Date' column.
df['']=df[''].astype('datetime64[ns]')
df['']

### Case like '44013'

In [None]:
# Convert serial day numbers (e.g. '44013') to dates by adding them as days to
# the base date 1899-12-30. `dt` is already the datetime class
# (`from datetime import datetime as dt`), so it is called directly;
# `dt.datetime(...)` raises AttributeError.
# TODO: fill in the column name (all three places).
df['']=dt(1899, 12, 30) + pd.to_timedelta(df[''].astype(int), 'D')
df['']

### Julian date

In [None]:
# Parse text made of a literal leading '1', a two-digit year (%y) and a
# three-digit day of the year (%j).
# TODO: fill in the column name (all three places).
df['']=pd.to_datetime(df[''],format='1%y%j')
df['']

## Concatenate

In [None]:
# Build a composite key by joining four string columns with '_'.
# A missing value in any part makes the whole result missing for that row.
# TODO: fill in the target and the four source column names.
df['']=df[''] + '_' + df[''] + '_' + df[''] + '_' + df['']

## Mask

In [None]:
# Generic conditional overwrite: build a boolean mask, then assign a value to
# the matching rows. TODO: replace `.xxxx` with the real condition and fill in
# the column names and the replacement value.
mask=df[''].xxxx
df.loc[mask,'']=''

## Generate Unique Line Number

In [None]:
# Insert a 1-based sequential number as the first column.
# TODO: fill in the new column name (insert raises if the name already exists).
df.insert(0, '', range(1, 1 + len(df)))

=========================================================================================================================

# Map columns

In [None]:
# rename a few columns
# Raw column names to keep, in source order. TODO: fill in.
col_name_raw = ['','','','','','','','','','']

# raw name -> target name. TODO: fill in every pair; as written all keys are
# the same empty string, so the dict collapses to a single entry.
col_mapping = {'':'',
               '':'',
               '':'',
               '':'',
               '':'',
               '':'',
               '':'',
               '':'',
               '':'',
              }

# Target names in mapping order; used to order the columns of the subset.
col_name_new = list(col_mapping.values())

col_name_new

In [None]:
# Select the raw columns, rename them through col_mapping, then reorder them
# to the target order. Raw columns without a mapping entry keep their name and
# must therefore also appear in col_name_new, otherwise the final selection
# raises KeyError.
df_subset=df[col_name_raw].rename(columns=col_mapping)[col_name_new]

print(df_subset.shape,'\n')

print(df_subset.dtypes,'\n')

df_subset.head(2)

## Alternatively

In [None]:
# Load the column-mapping sheet kept under the project root.
col = pd.read_excel(
    master_path / 'JET Column Mappings - SAP.xlsx',
    sheet_name='Sheet2',
)

In [None]:
# source column name -> target column name, paired row by row from the sheet.
col_mapping = {
    raw_name: target_name
    for raw_name, target_name in zip(col['Column to use'], col['InsightBox column name'])
}

col_mapping

# Create exp tables

In [None]:
# subset dataframe for myA required columns
# Keep only the columns named as keys in col_mapping, then rename them to the
# mapped target names. A key that is not a column of df raises KeyError.
df_subset=df[col_mapping.keys()].rename(columns=col_mapping)

print(df_subset.shape,'\n')

print(df_subset.dtypes,'\n')

df_subset.head(2)

=========================================================================================================================

# Post validation after cleaning

## Validate date range

In [None]:
# Earliest and latest value of two date columns after cleaning, one line per
# column. TODO: fill in the two date column names.
print('Date ranges from ',df_subset[''].min(), ' to ', df_subset[''].max())
print('Date ranges from ',df_subset[''].min(), ' to ', df_subset[''].max())

## Validate sum

In [None]:
# Grand total of Amount after cleaning, rounded to 2 decimals.
round(df_subset['Amount'].sum(),2)

## Validate sum by xxx

In [None]:
# Line count and Amount total per group, rounded to 2 decimals.
# TODO: fill in the grouping column and the column to count.
sum_by_xxx=round(df_subset.groupby('').agg({'':'count','Amount': 'sum'}),2)

# Groups whose rounded total is not zero. The empty-string key in rename must
# match the counted column chosen above.
unbalanced=sum_by_xxx[sum_by_xxx['Amount'].abs()>0].rename(columns={'':'No._of_lines','Amount':'Sum'})

unbalanced

## Validate uniqueness between Journal ID and Line Number

In [None]:
# Count the rows per (Journal ID, Journal Line Number) pair; any pair that
# occurs more than once is a duplicate. The column is spelled
# 'Journal Line Number', matching the name the later aggregation cells use.
check_unique=df_subset.groupby(['Journal ID','Journal Line Number']).size()
check_unique[check_unique>1].to_frame()#.shape

## Validate if DocType, Users, DateEffect and DatePosted are unique on Journal ID

In [None]:
# Journals that carry more than one distinct Document Type.
unique_jID_doc = df_subset.groupby('Journal ID').agg({'Document Type': 'nunique'})
unique_jID_doc[unique_jID_doc['Document Type'] > 1]

In [None]:
# Journals that carry more than one distinct Posting User.
unique_jID_user = df_subset.groupby('Journal ID').agg({'Posting User': 'nunique'})
unique_jID_user[unique_jID_user['Posting User'] > 1]

In [None]:
# Journals that carry more than one distinct Date Effective.
unique_jID_e_date = df_subset.groupby('Journal ID').agg({'Date Effective': 'nunique'})
unique_jID_e_date[unique_jID_e_date['Date Effective'] > 1]

In [None]:
# Journals that carry more than one distinct Date Posted.
unique_jID_p_date = df_subset.groupby('Journal ID').agg({'Date Posted': 'nunique'})
unique_jID_p_date[unique_jID_p_date['Date Posted'] > 1]

## Final check on GL acc format

In [None]:
# Distinct values of the chosen column, listed in ascending sort order, for a
# visual format check. TODO: fill in the column name (both places).
df_subset.sort_values(by=[''], ascending = True)[''].unique()

=========================================================================================================================

# Generate movement by gl

In [None]:
# Line count and net movement per GL account (named aggregation gives the
# output columns their final names directly).
sum_by_gl = df_subset.groupby('GL Account').agg(
    Count=('Journal Line Number', 'count'),
    Sum=('Amount', 'sum'),
)

sum_by_gl['Sum'] = sum_by_gl['Sum'].round(2)

sum_by_gl

In [None]:
# export
#sum_by_gl.to_excel(output_path/'movement_by_gl.xlsx',index=False)

=========================================================================================================================

# Export cleaned data

## Generate final tables

In [None]:
# Drop the helper columns that should not be delivered.
# TODO: fill in the column names to drop.
df_final=df_subset.drop(columns=['',''])

print(df_final.shape,'\n') # , maintained

print(df_final.dtypes,'\n')

df_final.head(3)

In [None]:
# export
#df_final.to_csv(output_path/'journal_lines.csv',sep=',',quoting=csv.QUOTE_ALL,index=False)

## Generate doc types

In [None]:
# One row per document type: distinct journals, line count and total amount.
doc_type = (
    df_final.groupby('Document Type')
    .agg({
        'Journal ID': 'nunique',
        'Journal Line Number': 'count',
        'Amount': 'sum',
    })
    .rename(columns={
        'Journal ID': 'Number of journal IDs',
        'Journal Line Number': 'Number of journal lines',
        'Amount': 'Total amount',
    })
    .reset_index()
)

# Flag column, initialised to 0 for every row.
doc_type['Is standard document type'] = 0

doc_type['Total amount'] = doc_type['Total amount'].round(2)

doc_type

In [None]:
# export
#doc_type.to_csv(output_path/'document_types.csv',sep=',',quoting=csv.QUOTE_ALL,index=False)

## Generate posting users

In [None]:
# One row per posting user: distinct journals, line count and total amount.
posting_user = (
    df_final.groupby('Posting User')
    .agg({
        'Journal ID': 'nunique',
        'Journal Line Number': 'count',
        'Amount': 'sum',
    })
    .rename(columns={
        'Journal ID': 'Number of journal IDs',
        'Journal Line Number': 'Number of journal lines',
        'Amount': 'Total amount',
    })
    .reset_index()
)

# Flag columns, initialised to 0 for every row.
posting_user['Is system entry'] = 0

posting_user['User of interest'] = 0

posting_user['Total amount'] = posting_user['Total amount'].round(2)

posting_user

In [None]:
# export
#posting_user.to_csv(output_path/'posting_user.csv',sep=',',quoting=csv.QUOTE_ALL,index=False)

=========================================================================================================================

# Extractions

## Import xxx list

In [None]:
# Load the reference list from input_path with dttlib's read_data, reading all
# values as text. TODO: fill in the file extension.
# NOTE(review): the keyword arguments look like pandas.read_csv options —
# confirm how read_data forwards them.
df_xxx_list=read_data(input_path,file_extension='',encoding='utf-8',delimiter='|',dtype=str,header=0,index_col=None
#                      error_bad_lines=False,engine='python',quoting=csv.QUOTE_NONE
            )

print(df_xxx_list.shape,'\n') # 

df_xxx_list.head(2)

## Import xxx list if xlsx

In [None]:
# Collect the *.xlsx files of the reference list (non-recursive glob).
# TODO: put the source folder expression inside the braces; the empty `{}`
# placeholder is not valid f-string syntax as it stands.
xxx_list_dir=glob(fr'{}/*.xlsx')
xxx_list_dir

In [None]:
# Read every workbook as text, tag each row with the absolute path of its
# source file, and stack the results. ignore_index=True renumbers the stacked
# rows; without it each file's index restarts at 0 and the duplicate labels
# can break later steps ("cannot reindex from a duplicate axis").
df_xxx_list=pd.concat([pd.read_excel(fp,dtype=str,index_col=None,header=0)
              .assign(DTT_FILENAME=os.path.abspath(fp)) for fp in xxx_list_dir],ignore_index=True)
# can use os.path.basename to just get the name of file instead of full path

print(df_xxx_list.shape,'\n') 

print(df_xxx_list.dtypes,'\n') 

df_xxx_list.head()

In [None]:
#df.to_csv(input_path/'.csv',sep='|',index=False) #quoting=csv.QUOTE_ALL,

## Extract xxx only related journals

In [None]:
# Inner join: keep only the journal lines whose key also appears in the
# reference list. Overlapping non-key columns from the list get a '_y' suffix.
# TODO: fill in the key columns on both sides.
df_xxx_only=pd.merge(df_final,df_xxx_list,how='inner',suffixes=('', '_y'),
                     left_on=[''],right_on=['']
                    )

# Keep only the columns to deliver. TODO: fill in the column names.
df_xxx_only=df_xxx_only[['','','','']] # or df_xxx_only=df_xxx_only.drop(columns=['',''])

print(df_xxx_only.shape,'\n')

print(df_xxx_only.dtypes,'\n') 

df_xxx_only.head(3)

In [None]:
#df_xxx_only[(df_xxx_only['']>='') & (df_xxx_only['']<='')].to_csv(output_path/'.csv',quoting=csv.QUOTE_ALL,index=False)

In [None]:
#df_xxx_only[(df_xxx_only['']>='') & (df_xxx_only['']<='')].to_excel(output_path/'.xlsx',index=False)

## Extract journals touching xxx account

In [None]:
# Distinct Journal IDs among the matched lines. The frame is built from a bare
# array, so its single column is labelled with the integer 0, not 'Journal ID'.
df_xxx_unique_jIDs=pd.DataFrame(df_xxx_only['Journal ID'].unique())

print(df_xxx_unique_jIDs.shape,'\n')

print(df_xxx_unique_jIDs.dtypes,'\n')

df_xxx_unique_jIDs.head(3)

In [None]:
# Pull every line of the journals identified above by joining the distinct IDs
# back to the full journal table on 'Journal ID'.
# TODO: fill in left_on with the ID column of df_xxx_unique_jIDs.
df_xxx_all=pd.merge(df_xxx_unique_jIDs,df_final, how='inner',left_on=[''],right_on=['Journal ID'])

# Keep only the columns to deliver. TODO: fill in the column names.
df_xxx_all=df_xxx_all[['','','','']] # or df_xxx_all=df_xxx_all.drop(columns=['',''])

print(df_xxx_all.shape,'\n')

print(df_xxx_all.dtypes,'\n')

df_xxx_all.head(3)

In [None]:
#df_xxx_all[(df_xxx_all['']>='') & (df_xxx_all['']<='')].to_csv(output_path/'.csv',quoting=csv.QUOTE_ALL,index=False)

In [None]:
#df_xxx_all[(df_xxx_all['']>='') & (df_xxx_all['']<='')].to_excel(output_path/'.xlsx',index=False)

=========================================================================================================================

    The end