In [2]:
import os
import glob 
import pandas as pd
from tqdm import tqdm
import xlsxwriter
import re
import time

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Allowing the user to select the dataset so we can locate the respective folder:
dataset = input("Enter the dataset to review from the Data Auditor: ")
print("The user would like to use the dataset: ", dataset)

The user would like to use the dataset:  Characteristics


In [4]:
# Vehicles that we're interested in are being listed here:

# Core Fixed Income	                    12776	P73285
# Core Plus Fixed Income	            12777	P74285 
# Global Quality Value	                12783	P85285
# Strategic Fixed Income	            12811	P121285
# Strategic Fixed Income Opportunities	12812	P126285
# US Small Cap Core	                    12823	P147285

sheet_names = ['P73285', 'P74285', 'P85285', 'P121285', 'P126285', 'P147285']

In [5]:
# Setting file path. We'll be opening first the Performance folder:
absolute_path = "C:/Users/l.arguello/Downloads/Manulife_DataAuditor/"

file_path = absolute_path + dataset

In [6]:
# Using glob to get all the Excel file names in the selected folder, to loop through them:

csv_files = glob.glob(os.path.join(file_path, "[!~]*.xlsx")) 
  
file_names = []

# Loop over the list of Excel files: 
for f in tqdm(csv_files, desc="Loading…",ascii=False, ncols=75):

        time.sleep(0.03) 
        # Print the location and filename 
        print('File Name:', f.split("\\")[-1]) 
        # Add each Excel file name to file_names list 
        file_names.append(f.split("\\")[-1])      
 
print("Complete.")

Loading…:   0%|                                     | 0/15 [00:00<?, ?it/s]

Loading…:  40%|███████████▌                 | 6/15 [00:00<00:00, 27.85it/s]

File Name: Data_Audit_Report_Albourne_Moatspace_1_2024.xlsx
File Name: Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
File Name: Data_Audit_Report_Broadridge_1_2024.xlsx
File Name: Data_Audit_Report_Callan_Assoc_1_2024.xlsx
File Name: Data_Audit_Report_Camradata_1_2024.xlsx
File Name: Data_Audit_Report_eVestment_Alliance_1_2024.xlsx


Loading…:  80%|██████████████████████▍     | 12/15 [00:00<00:00, 27.43it/s]

File Name: Data_Audit_Report_Global_Fund_Search_1_2024.xlsx
File Name: Data_Audit_Report_Global_Manager_Research_1_2024.xlsx
File Name: Data_Audit_Report_Investment_Metrics_1_2024.xlsx
File Name: Data_Audit_Report_LCG_Assoc_1_2024.xlsx
File Name: Data_Audit_Report_Mercer_1_2024.xlsx
File Name: Data_Audit_Report_Morningstar_1_2024.xlsx


Loading…: 100%|████████████████████████████| 15/15 [00:00<00:00, 24.31it/s]

File Name: Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx
File Name: Data_Audit_Report_PSN_Informa_1_2024.xlsx
File Name: Data_Audit_Report_Wilshire_1_2024.xlsx
Complete.





In [7]:
# Loading a sample of the names list obtained from the selected folder:
file_names

['Data_Audit_Report_Albourne_Moatspace_1_2024.xlsx',
 'Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx',
 'Data_Audit_Report_Broadridge_1_2024.xlsx',
 'Data_Audit_Report_Callan_Assoc_1_2024.xlsx',
 'Data_Audit_Report_Camradata_1_2024.xlsx',
 'Data_Audit_Report_eVestment_Alliance_1_2024.xlsx',
 'Data_Audit_Report_Global_Fund_Search_1_2024.xlsx',
 'Data_Audit_Report_Global_Manager_Research_1_2024.xlsx',
 'Data_Audit_Report_Investment_Metrics_1_2024.xlsx',
 'Data_Audit_Report_LCG_Assoc_1_2024.xlsx',
 'Data_Audit_Report_Mercer_1_2024.xlsx',
 'Data_Audit_Report_Morningstar_1_2024.xlsx',
 'Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx',
 'Data_Audit_Report_PSN_Informa_1_2024.xlsx',
 'Data_Audit_Report_Wilshire_1_2024.xlsx']

In [101]:
excel_file_content = pd.read_excel(file_path+'/'+file_names[3])

In [70]:
excel_file_content.iloc[3]

Unnamed: 0                             Firm:
Unnamed: 1    Manulife Investment Management
Unnamed: 2                               NaN
Name: 3, dtype: object

In [118]:
final_dict = []
nodata_ = []

print('Checking file name: ', file_names[3])
# This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
excel_file_content = pd.read_excel(file_path+'/'+file_names[3]) 
# For loop to select the sheet name (vehicle):
for j in range(len(sheet_names)):
    # Will do a try and except since there are sheets that don't exist in the files, so the code doesn't crash:
    try:
        print('Checking sheet name: ', sheet_names[3])
        # Defining the Excel file to be openned and the sheet we need from the book:
        excel_file_orig = pd.read_excel(file_path+'/'+file_names[3], sheet_name=sheet_names[3])
    except:
    # If sheet is not found then let's try this so the code can continue:
        print('No sheet found for the vehicle {}'.format(sheet_names[3]))
        dict_ = {'Database': excel_file_content.iloc[4][1],            # Database name e.g. "Wilshire"
                 sheet_names[3]: "No audit data generated.",        # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
                } 
        nodata_.append(dict_) # Adding the respective database and vehicle name that does not exist to list
        output_df_ = pd.DataFrame(nodata_).groupby(['Database']).sum() # Grouping dataframe by database
        continue
    # Selecting the header names placed in row 7:
    excel_file_orig.rename(columns = excel_file_orig.iloc[7], inplace= True)
    # Selecting the rows with data and reseting the index:
    excel_file = excel_file_orig[7:][1:].set_index(['Date'], drop=True)    
    # We need information from 09/2022 onwards, so I'll be turning Date column into correct type and then filter by date:
    excel_file.index = pd.to_datetime(excel_file.index)
    # Selecting data in the dataframe by the correct date:
    excel_file = excel_file[~(excel_file.index < '09/2022')]
    # Setting up the correct format for the index/Date column
    excel_file.index = excel_file.index.strftime("%m/%Y")
    # Dropping rows and columns in which all the cells contain NaN values:
    excel_file = excel_file.dropna(how='all', axis=0).dropna(how='all', axis=1)
    # Removing commas and $ signs in the numerical values:
    #excel_file = excel_file.replace(, regex=True).replace('Million', '').replace('million', '')
    # Creating a for loop to assign dummy variables to the Data Gap Auditor report:


Checking file name:  Data_Audit_Report_Callan_Assoc_1_2024.xlsx
Checking sheet name:  P121285
Checking sheet name:  P121285
Checking sheet name:  P121285
Checking sheet name:  P121285
Checking sheet name:  P121285
Checking sheet name:  P121285


In [126]:
for n in range(0, excel_file.shape[1]):
    for m,p in enumerate(excel_file[excel_file.columns[(n)]]):
            # Avoiding code crashes using try/except:    
        try:
            if (int(float(p) >= 0)) or (int(float(p) <= 0)):
                            excel_file[excel_file.columns[n]][m] = '0'  # "Complete"
        except:
            if re.match(r'(-?[0-9\.]+) \s*/ (-?[0-9\.]+)', p):
                  print('aaaaaa')

aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa
aaaaaa


In [12]:
a = excel_file['AA/Aa Current']

In [23]:
b = a.replace(' %','', regex=True)
b

Date
12/2023        0 / <NO DATA>
12/2023                  NaN
09/2023    <NO APX> / 102.58
06/2023                    0
03/2023                    0
12/2022                    0
09/2022                    0
Name: AA/Aa Current, dtype: object

In [26]:
b.replace(r'<NO APX> \s*/ (-?[0-9\.]+)','', regex=True)

Date
12/2023    0 / <NO DATA>
12/2023              NaN
09/2023                 
06/2023                0
03/2023                0
12/2022                0
09/2022                0
Name: AA/Aa Current, dtype: object

In [19]:
final_dict = []
nodata_ = []

for i in range(len(file_names)):
    print('Checking file name: ', file_names[i])
    # This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 

    # For loop to select the sheet name (vehicle):
    for j in range(len(sheet_names)):
        # Will do a try and except since there are sheets that don't exist in the files, so the code doesn't crash:
        try:
            print('Checking sheet name: ', sheet_names[j])
            # Defining the Excel file to be openned and the sheet we need from the book:
            excel_file_orig = pd.read_excel(file_path+'/'+file_names[i], sheet_name=sheet_names[j])
        except:
        # If sheet is not found then let's try this so the code can continue:
            print('No sheet found for the vehicle {}'.format(sheet_names[j]))
            dict_ = {'Database': excel_file_content.iloc[4][1],            # Database name e.g. "Wilshire"
                     sheet_names[j]: "No audit data generated.",        # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
                    } 
            nodata_.append(dict_) # Adding the respective database and vehicle name that does not exist to list
            output_df_ = pd.DataFrame(nodata_).groupby(['Database']).sum() # Grouping dataframe by database
            continue

        # Selecting the header names placed in row 7:
        excel_file_orig.rename(columns = excel_file_orig.iloc[7], inplace= True)
        # Selecting the rows with data and reseting the index:
        excel_file = excel_file_orig[7:][1:].set_index(['Date'], drop=True)    
        # We need information from 09/2022 onwards, so I'll be turning Date column into correct type and then filter by date:
        excel_file.index = pd.to_datetime(excel_file.index)
        # Selecting data in the dataframe by the correct date:
        excel_file = excel_file[~(excel_file.index < '09/2022')]
        # Setting up the correct format for the index/Date column
        excel_file.index = excel_file.index.strftime("%m/%Y")
        # Dropping rows and columns in which all the cells contain NaN values:
        excel_file = excel_file.dropna(how='all', axis=0).dropna(how='all', axis=1)
        # Removing commas and $ signs in the numerical values:
        #excel_file = excel_file.replace(, regex=True).replace('Million', '').replace('million', '')

        # Creating a for loop to assign dummy variables to the Data Gap Auditor report:
        for n in range(0, excel_file.shape[1]):
            for m,p in enumerate(excel_file[excel_file.columns[(n)]]):
                    # Avoiding code crashes using try/except:    
                try:
                    if (int(float(p) >= 0)) or (int(float(p) <= 0)):
                                    excel_file[excel_file.columns[n]][m] = '0'  # "Complete"
                except:
                    # APX FILE if (' / <NO DATA>') in p:
                    # APX FILE     excel_file[excel_file.columns[n]][m] = '1'
                    # APX FILE else:
                    # APX FILE    excel_file[excel_file.columns[n]][m] = ''
                     
                    if ('<NO APX> / ') in p:
                         excel_file[excel_file.columns[n]][m] = '2'     # "Data not in the Vault" // Client could want APX to distribute this data for them
                    elif re.match(r'(-?[0-9\.]+) \s*/ (-?[0-9\.]+)', p):
                         excel_file[excel_file.columns[n]][m] = '3'     # "Data not matching" // APX needs to review this data until it matches/is Complete  
                    else:
                         excel_file[excel_file.columns[n]][m] = ''
                     
                        #if (r'(-?[0-9\.]+)\s*/ <NO DATA>') in p:
                        #    excel_file[excel_file.columns[n]][m] = '1' # "Data not in the database" // APX needs to distribute this data
                        #elif "<NO APX> / " in p:
                        #    excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '2') # "Data not in the Vault" // Client could want APX to distribute this data for them
                        #elif " / " in p:
                        #    excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '3') # "Data not matching" // APX needs to review this data until it matches/is Complete     
                        #else:
                        #    excel_file[excel_file.columns[n]][m] = ''  # If the cell does not contain any of this criteria above, then it's not relevant for our analysis/review
        #excel_file = excel_file.replace(',','', regex=True).replace(' %','', regex=True).replace('-','', regex=True)
        #excel_file = excel_file.replace({
         #                               r'(-?[0-9\.]+) \s*/ <NO DATA>': '1',                 # Data not in database
         #                               r'[a-zA-Z0-9_.-] \s*/ <NO DATA>': '1',
         #                               '<NO DATA>': '1', 
         #                               r'<NO APX> \s*/ (-?[0-9\.]+)': '2',                 # Data not in Vault
         #                               r'<NO APX> \s*/ [a-zA-Z0-9_.-]': '2',
         #                               '<NO APX>': '2',
         #                               r'(-?[0-9\.]+) \s*/ (-?[0-9\.]+)': '3',              # Data not matching
         #                               ' / ':'0',
         #                               r"[-%()\"#/@$;:<>{}`+=~|.!?,]":''}, regex=True)
        
        for n in range(0, excel_file.shape[1]):
            for m,p in enumerate(excel_file[excel_file.columns[(n)]]):
                
                 if (p != '1') and (p != '2') and (p != '3') and (p != '0'):
                      excel_file[excel_file.columns[n]][m] = ''
                 #if not ('2' in p):
                 #     excel_file[excel_file.columns[n]][m] = ''
                 #if not ('3' in p):
                 #     excel_file[excel_file.columns[n]][m] = ''
                 #if not ('0' in p):
                 #     excel_file[excel_file.columns[n]][m] = ''

            # Let's fill the NaN values for easier further processes:
        excel_file.fillna('', inplace=True)
            # Putting the dummy variables in a single column named 'Review':
        excel_file['Review'] = excel_file[excel_file.columns[0:]].apply(lambda x: ''.join(x.astype(str)), axis=1)

        for m,p in enumerate(excel_file['Review']):


                        #if all('0' in k for k in p):
                        #    excel_file['Review'][m] = excel_file['Review'][m].replace(p, '')

                    #if any('1' in k for k in p):
                    #    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 1')          # APX needs to distribute this data

                        #elif (all('1' in k for k in p)):
                        #    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 1')     # Data not in the Vault  
                        if any('2' in k for k in p):
                            excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 2')      # Client could want APX to distribute this data for them

                        elif any('3' in k for k in p):
                            excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 3')      # Data not matching
                  
                             

        excel_file.to_excel(r'C:\Users\l.arguello\Documents\Python Scripts\APX_automation_reports\output\data_auditor_review\{}_{}_sheet{}.xlsx'.format(file_names[i], dataset, sheet_names[j]))
        #periods_1 = []
        periods_2 = []       
        periods_3 = []
        description = []
        for m,p in enumerate(zip(excel_file['Review'],excel_file.index)):

        #if p[0] == 'Complete':
        #    periods_0.append(p[1])
             
        # APX FILE     if p[0] == 'Priority 1':
        # APX FILE         periods_1.append(p[1])
        # APX FILE     elif periods_1 == periods_1:
        # APX FILE         description.append("".format((list(set(periods_1)))).replace("'",'').replace('[','').replace(']',''))    
        # APX FILE periods_1 = list(set(periods_1))

            if p[0] == 'Priority 2':
                 periods_2.append(p[1])
            if p[0] == 'Priority 3':
                 periods_3.append(p[1])

            elif periods_2 == periods_2:
                 description.append("".format((list(set(periods_2)))).replace("'",'').replace('[','').replace(']',''))    
            elif periods_3 == periods_3:
                 description.append("".format((list(set(periods_3)))).replace("'",'').replace('[','').replace(']',''))  
        periods_2 = list(set(periods_2))
        periods_3 = list(set(periods_3))


            #elif p[0] == 'Priority 2':
            #    periods_2.append(p[1])
            #elif p[0] == 'Priority 3':
            #    periods_3.append(p[1])
        #0 "Complete"
        #1 "Data not in the Vault"
        #2 "Data not in the database"
        #3 "Data not matching"    
        # A description list is created to put in the final review without considering empty period lists:
        # if periods_0 := periods_0: description.append("✔ Complete for the periods: {}\n".format((list(set(periods_0)))).replace("'",'').replace('[','').replace(']',''))
        # if periods_1 := periods_1: description.append("● Priority 1: {}\n".format((list(set(periods_1)))).replace("'",'').replace('[','').replace(']',''))
        if periods_2 := periods_2: description.append("● Priority 2: {}\n".format(list(set((periods_2)))).replace("'",'').replace('[','').replace(']',''))
        if periods_3 := periods_3: description.append("● Priority 3: {}\n".format((list(set(periods_3)))).replace("'",'').replace('[','').replace(']',''))  
        description = list(set(description))

        excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
        # Building the dictionary to then transform it into a dataframe:
        dict = {'Database': excel_file_content.iloc[4][1],      # Database name e.g. "Wilshire"
                excel_file_orig.iloc[6][1]: description,        # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)
                }  
        # Creating a new dataframe that will sum up the findings in the Data Auditor        
        output_df = pd.DataFrame([dict])
        # Putting each description in a single line (this may duplicate the database name):
        output_df0 = output_df.explode(excel_file_orig.iloc[6][1])
        # Final dict
        final_dict.append(output_df0)

Checking file name:  Data_Audit_Report_Albourne_Moatspace_1_2024.xlsx
Checking sheet name:  P73285
No sheet found for the vehicle P73285
Checking sheet name:  P74285
No sheet found for the vehicle P74285
Checking sheet name:  P85285
No sheet found for the vehicle P85285
Checking sheet name:  P121285
No sheet found for the vehicle P121285
Checking sheet name:  P126285
No sheet found for the vehicle P126285
Checking sheet name:  P147285
No sheet found for the vehicle P147285
Checking file name:  Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
No sheet found for the vehicle P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name:  P126285
No sheet found for the vehicle P126285
Checking sheet name:  P147285
Checking file name:  Data_Audit_Report_Broadridge_1_2024.xlsx
Checking sheet name:  P73285
No sheet found for the vehicle P73285
Checking sheet name:  P74285
No sheet found for the vehicle P74

In [57]:
import re
string='23.34 / 34.45 %'
if re.match(r'[a-zA-Z-_.-]', string):
    print(string.replace(string, 'a1'))
else:
    print(string.replace(string, 'a0'))

a0


In [59]:
string.replace('%', '')

'23.34 / 34.45 '

In [20]:
a = pd.concat(final_dict)
a = a.reindex(sorted(a.columns), axis=1)
a.set_index("Database", drop=True, inplace=True)
a

Unnamed: 0_level_0,Core Fixed Income Composite (P73285),Core Plus Fixed Income Composite (P74285),Global Quality Value Composite (P85285),Strategic Fixed Income Composite (P121285),Strategic Fixed Income Opportunities Composite (P126285),U.S Small-Cap Core Composite (P147285)
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alpha Portfolio Advisors,,,,,,
Alpha Portfolio Advisors,● Priority 2: 12/2022\n,,,,,
Alpha Portfolio Advisors,,,,,,
Alpha Portfolio Advisors,,,● Priority 2: 12/2022\n,,,
Alpha Portfolio Advisors,,,,,,
Alpha Portfolio Advisors,,,,● Priority 2: 12/2022\n,,
Alpha Portfolio Advisors,,,,,,
Alpha Portfolio Advisors,,,,,,● Priority 2: 12/2022\n
Callan Assoc.,,,,,,
Callan Assoc.,"● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...",,,,,


In [21]:
output_df_

Unnamed: 0_level_0,P73285,P74285,P85285,P121285,P126285,P147285
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Albourne Moatspace,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Alpha Portfolio Advisors,0,No audit data generated.,0,0,No audit data generated.,0
Broadridge,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Camradata,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Fund Search,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Manager Research,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
LCG Assoc.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Mercer,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Morningstar,0,0,No audit data generated.,0,0,No audit data generated.
Preqin Hedge Fund Analyst,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.


In [22]:
for col1 in output_df_.columns:
    for col2 in a.columns:
        if col1 in col2:
            output_df_.columns = a.columns

In [23]:
output_df_

Unnamed: 0_level_0,Core Fixed Income Composite (P73285),Core Plus Fixed Income Composite (P74285),Global Quality Value Composite (P85285),Strategic Fixed Income Composite (P121285),Strategic Fixed Income Opportunities Composite (P126285),U.S Small-Cap Core Composite (P147285)
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Albourne Moatspace,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Alpha Portfolio Advisors,0,No audit data generated.,0,0,No audit data generated.,0
Broadridge,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Camradata,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Fund Search,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Manager Research,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
LCG Assoc.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Mercer,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Morningstar,0,0,No audit data generated.,0,0,No audit data generated.
Preqin Hedge Fund Analyst,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.


In [24]:
review_file = pd.merge(a, output_df_, on='Database', how='outer').groupby('Database').sum()

In [25]:
review_file

Unnamed: 0_level_0,Core Fixed Income Composite (P73285)_x,Core Plus Fixed Income Composite (P74285)_x,Global Quality Value Composite (P85285)_x,Strategic Fixed Income Composite (P121285)_x,Strategic Fixed Income Opportunities Composite (P126285)_x,U.S Small-Cap Core Composite (P147285)_x,Core Fixed Income Composite (P73285)_y,Core Plus Fixed Income Composite (P74285)_y,Global Quality Value Composite (P85285)_y,Strategic Fixed Income Composite (P121285)_y,Strategic Fixed Income Opportunities Composite (P126285)_y,U.S Small-Cap Core Composite (P147285)_y
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Albourne Moatspace,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Alpha Portfolio Advisors,● Priority 2: 12/2022\n,0,● Priority 2: 12/2022\n,● Priority 2: 12/2022\n,0,● Priority 2: 12/2022\n,0,No audit data generated.No audit data generate...,0,0,No audit data generated.No audit data generate...,0
Broadridge,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Callan Assoc.,"● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 09/2022, 09/2023\n● Pri...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 03/2023, 09/2022, 09/2023\n● Pri...",0,0,0,0,0,0
Camradata,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Fund Search,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Manager Research,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Investment Metrics,"● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 09/2022\n","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...",0,0,0,0,0,0
LCG Assoc.,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Mercer,0,0,0,0,0,0,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.


In [26]:
review_file.columns = review_file.columns.str.rstrip("_x")
review_file = review_file.drop([x for x in review_file if x.endswith('_y')], axis = 1)
review_file = review_file.replace(0, "No audit data generated.", regex=True)
review_file

Unnamed: 0_level_0,Core Fixed Income Composite (P73285),Core Plus Fixed Income Composite (P74285),Global Quality Value Composite (P85285),Strategic Fixed Income Composite (P121285),Strategic Fixed Income Opportunities Composite (P126285),U.S Small-Cap Core Composite (P147285)
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Albourne Moatspace,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Alpha Portfolio Advisors,● Priority 2: 12/2022\n,No audit data generated.,● Priority 2: 12/2022\n,● Priority 2: 12/2022\n,No audit data generated.,● Priority 2: 12/2022\n
Broadridge,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Callan Assoc.,"● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 09/2022, 09/2023\n● Pri...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 03/2023, 09/2022, 09/2023\n● Pri..."
Camradata,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Fund Search,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Global Manager Research,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Investment Metrics,"● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 09/2022\n","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20...","● Priority 2: 12/2022, 06/2023, 03/2023, 09/20..."
LCG Assoc.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.
Mercer,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.,No audit data generated.


Checking for the unique values in each of the columns, since they're not numeric, to start the data cleaning:

In [23]:
pd.options.display.max_rows = 999

In [29]:
for col in excel_file:
    print(excel_file[col].unique())

[nan '0 / <NO DATA>' '94.80 / <NO DATA>' '16.37 / <NO DATA>'
 '8.62 / <NO DATA>' '100 / <NO DATA>']
[nan '0 / <NO DATA>' '100 / <NO DATA>' '73.00 / <NO DATA>'
 '2.05 / <NO DATA>' '15.02 / <NO DATA>']
[nan '2.93 / <NO DATA>' '0 / <NO DATA>' '19.92 / <NO DATA>'
 '13.49 / <NO DATA>' '1.10 / <NO DATA>' '100 / <NO DATA>']
[nan '4081.00 / <NO DATA>' '0 / <NO DATA>' '100 / <NO DATA>'
 '11.47 / <NO DATA>' '14.73 / <NO DATA>' '2.39 / <NO DATA>']
['0 / <NO DATA>' nan '3260.00 / <NO DATA>' '2.15 / <NO DATA>'
 '22.71 / <NO DATA>' '4.54 / <NO DATA>' '100 / <NO DATA>']
['0 / <NO DATA>' nan '96.52 / <NO DATA>' '66.68 / <NO DATA>'
 '100 / <NO DATA>' '7.38 / <NO DATA>' '4.89 / <NO DATA>']
['0 / <NO DATA>' nan '95.19 / <NO DATA>' '10.76 / <NO DATA>'
 '8.76 / <NO DATA>' '100 / <NO DATA>']
['0 / <NO DATA>' nan '100 / <NO DATA>' '76.00 / <NO DATA>'
 '4.02 / <NO DATA>' '14.95 / <NO DATA>']
['0 / <NO DATA>' nan '3.48 / <NO DATA>' '18.20 / <NO DATA>'
 '4.76 / <NO DATA>' '0.83 / <NO DATA>' '100 / <NO DATA>']
[

In [87]:
pd.concat(final_dict) 

Unnamed: 0,Database,Core Fixed Income Composite (P73285),Global Quality Value Composite (P85285),Strategic Fixed Income Composite (P121285),U.S Small-Cap Core Composite (P147285),Core Plus Fixed Income Composite (P74285),Strategic Fixed Income Opportunities Composite (P126285)
0,Alpha Portfolio Advisors,● Priority 1: 12/2022\n,,,,,
0,Alpha Portfolio Advisors,,,,,,
0,Alpha Portfolio Advisors,,,,,,
0,Alpha Portfolio Advisors,,,,,,
0,Callan Assoc.,,,,,,
0,Callan Assoc.,"● Priority 1: 12/2023, 12/2022, 09/2022\n",,,,,
0,Callan Assoc.,,,,,,
0,Callan Assoc.,,,,,"● Priority 1: 12/2023, 12/2022, 09/2022\n",
0,Callan Assoc.,,,,,,
0,Callan Assoc.,,"● Priority 1: 12/2023, 03/2023, 06/2023, 12/20...",,,,


In [89]:
###############
# Transforming into a dataframe the last dictionary with the review description:
final_dict_ = pd.concat(final_dict) 
    # final_dict_ has a numerical index, whilst output_df_ has databases as its index, so we'll arrange that:
final_dict_.set_index("Database", drop=True, inplace=True)
final_dict = pd.DataFrame(final_dict)    

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (32,) + inhomogeneous part.

In [81]:
final_dict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,Database Core Fixed Income ...,Database Global Quality Val...,Database Strategic Fixed In...,Database U.S Small-Cap Core...,Database Core Fixed Income Compo...,Database Core Plus Fixed Income Compo...,Database Global Quality Val...,Database Strategic Fixed Incom...,Database Strategic Fixed Income Opport...,Database U.S Small-Cap Core Compos...,...,Database Core Fixed Income Composite (P7...,Database Core Plus Fixed Income Composit...,Database Strategic Fixed Income Composit...,Database Strategic Fixed Income Opportun...,Database Core Fixed Income...,Database Core Plus Fixed Income...,Database Global Quality Value Composite ...,Database Strategic Fixed Income Composit...,Database Strategic Fixed Income Opportun...,Database U.S Small-Cap Core ...


In [59]:
output_df

Unnamed: 0,Database,U.S Small-Cap Core Composite (P147285)
0,Wilshire,"[● Priority 1: 12/2023, 07/2023, 12/2022, 11/2..."


In [75]:
review_file = pd.merge(final_dict, output_df, on='Database', how='outer').groupby('Database').sum()
#elif output_df_ is not None:
        #final_dict_ = pd.DataFrame([final_dict_])
        #output_df_ = pd.DataFrame([output_df_])
#        review_file = pd.merge(final_dict_, output_df_, on='Database', how='outer').groupby('Database').sum()
    # Dropping unnecessary columns and replacing zero values with the description "No audit data generated":
review_file.columns = review_file.columns.str.rstrip("_x")
review_file = review_file.drop([x for x in review_file if x.endswith("_y")], axis = 1)
review_file = review_file.replace(0, "No audit data generated.", regex=True)
# Sorting column names and Database names:
review_file = review_file.reindex(sorted(review_file.columns), axis=1)
# Sorting index alphabetically (case insensitive):
review_file = review_file.reindex(index=(sorted(review_file.index, key=lambda s: s.lower())))
# Making sure index name doesn't get lost:
review_file.index.name = 'Database'
# Output path with respective name:
excel_output = r'C:\Users\l.arguello\Documents\Python Scripts\APX_automation_reports\output\data_auditor_review\DataAuditor_review_{}_APX.xlsx'.format(dataset)
# ********** excel_output = r'E:\Users\LauraMelissa\Downloads\apx\output\DataAuditor_review_{}_APX.xlsx'.format(dataset)
# Adding legend/keys table:
legend_dict = {'Priority 1': "",       
                }
legend_keys = pd.DataFrame([legend_dict])
legend_keys = legend_keys.set_axis(['Legend'], axis='index').transpose()

KeyError: 'Database'

In [48]:
# Creating a for loop to assign dummy variables to the Data Gap Auditor report:
for n in range(0, excel_file.shape[1]):

    for i,j in enumerate(excel_file[excel_file.columns[(n)]]):
        
        
        try:
            if float(j) >= 0 or float(j) <= 0:

                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '0') # "Complete"
                
        except:
            if "<NO APX> / " in j:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '1') # "Data not in the Vault"
            elif " / <NO DATA>" in j:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '2') # "Data not in the database"
            elif " / " in j:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '3') # "Data not matching"  
            else:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '')  # If the cell does not contain any of this criteria above, then it's not relevant for our analysis/reviewal


In [49]:
# Let's fill the NaN values for easier further processes:
excel_file.fillna('', inplace=True)

In [50]:
excel_file

Unnamed: 0_level_0,Taxable Accounts,Taxable Assets,Tax-Exempt Accounts,Tax-Exempt Assets,Institutional or High Net Worth Separate Accounts,Institutional or High Net Worth Separate Assets,Managed Account (WRAP) Accounts,Managed Account (WRAP) Assets,Multiple Strategy Portfolio Accounts,Multiple Strategy Portfolio Assets,...,Other Assets,Accounts Gained,Assets Gained,Accounts Lost,Assets Lost,Median Tax-Exempt Account ($Millions),Median Taxable Account ($Millions),Median Account ($Millions),Total Number of Clients,$MM in Composite
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
11/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
10/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
09/2023,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,,3.0,3.0,1.0,1.0,1.0,0.0,3.0
08/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
07/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
06/2023,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,,0.0,0.0,1.0,1.0,1.0,0.0,3.0
05/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
04/2023,2,2,2,2,,,,,,,...,,,,,,,,,,
03/2023,3,3,3,3,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,,,,,1.0,1.0,1.0,1.0,1.0


In [51]:
# Putting the dummy variables in a single column:
excel_file['Review'] = excel_file[excel_file.columns[0:]].apply(lambda x: ''.join(x.astype(str)), axis=1)
# Load a sample of how it looks like at the moment:
excel_file.head()

Unnamed: 0_level_0,Taxable Accounts,Taxable Assets,Tax-Exempt Accounts,Tax-Exempt Assets,Institutional or High Net Worth Separate Accounts,Institutional or High Net Worth Separate Assets,Managed Account (WRAP) Accounts,Managed Account (WRAP) Assets,Multiple Strategy Portfolio Accounts,Multiple Strategy Portfolio Assets,...,Accounts Gained,Assets Gained,Accounts Lost,Assets Lost,Median Tax-Exempt Account ($Millions),Median Taxable Account ($Millions),Median Account ($Millions),Total Number of Clients,$MM in Composite,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/2023,2,2,2,2,,,,,,,...,,,,,,,,,,2222
11/2023,2,2,2,2,,,,,,,...,,,,,,,,,,2222
10/2023,2,2,2,2,,,,,,,...,,,,,,,,,,2222
09/2023,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,3.0,3.0,1.0,1.0,1.0,0.0,3.0,3311103
08/2023,2,2,2,2,,,,,,,...,,,,,,,,,,2222


In [52]:
excel_file['Review']

Date
12/2023                           2222
11/2023                           2222
10/2023                           2222
09/2023      0000000000000000003311103
08/2023                           2222
07/2023                           2222
06/2023      0000000000000000000011103
05/2023                           2222
04/2023                           2222
03/2023        33331111111111111111111
02/2023                           2222
01/2023                           2222
12/2022    333311111111111111111111111
11/2022                           2222
10/2022                           2222
09/2022     33331111111111111111111111
Name: Review, dtype: object

In [53]:
# Creating a for loop to assign the correct description to each period:
for i, j in enumerate(excel_file['Review']):
        
        if all('0' in k for k in j):
            excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Complete')

        elif all('1' in k for k in j):
            excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the Vault')

        elif all('2' in k for k in j):
            excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the database')   

        elif all('3' in k for k in j):
            excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not matching') 

In [54]:
excel_file['Review']

Date
12/2023       Data not in the database
11/2023       Data not in the database
10/2023       Data not in the database
09/2023      0000000000000000003311103
08/2023       Data not in the database
07/2023       Data not in the database
06/2023      0000000000000000000011103
05/2023       Data not in the database
04/2023       Data not in the database
03/2023        33331111111111111111111
02/2023       Data not in the database
01/2023       Data not in the database
12/2022    333311111111111111111111111
11/2022       Data not in the database
10/2022       Data not in the database
09/2022     33331111111111111111111111
Name: Review, dtype: object

In [57]:
# Now we need to continue to put the other conditions:
for i,j in enumerate(excel_file['Review']):

    if (any('1' in k for k in j) and any('0' in k for k in j)):
        excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the Vault')
    
    elif (any('2' in k for k in j) and any('0' in k for k in j)):
        excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the database')

    elif (any('3' in k for k in j) and any('0' in k for k in j)):
        excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not matching')

    elif (any('3' in k for k in j) and any('1' in k for k in j)):
        excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the Vault and not matching')
    
    elif (any('2' in k for k in j) and any('1' in k for k in j)):
        excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the Vault and not in the database')
    
    elif (any('3' in k for k in j) and any('2' in k for k in j) and any('1' in k for k in j)):
        excel_file['Review'][i] = excel_file['Review'][i].replace(j, 'Data not in the Vault, not matching and not in the database')
    

In [58]:
excel_file['Review']

Date
12/2023                  Data not in the database
11/2023                  Data not in the database
10/2023                  Data not in the database
09/2023                     Data not in the Vault
08/2023                  Data not in the database
07/2023                  Data not in the database
06/2023                     Data not in the Vault
05/2023                  Data not in the database
04/2023                  Data not in the database
03/2023    Data not in the Vault and not matching
02/2023                  Data not in the database
01/2023                  Data not in the database
12/2022    Data not in the Vault and not matching
11/2022                  Data not in the database
10/2022                  Data not in the database
09/2022    Data not in the Vault and not matching
Name: Review, dtype: object

In [25]:
# Creating a list for each of the periods in the Review column:
periods_0 = []
periods_1 = []
periods_2 = []
periods_3 = []

for i,j in enumerate(zip(excel_file['Review'],excel_file.index)):

    if j[0] == 'Complete':
        periods_0.append(j[1])

    elif j[0] == 'Data not in the Vault':
        periods_1.append(j[1])
    
    elif j[0] == 'Data not in the database':
        periods_2.append(j[1])

    elif j[0] == 'Data not matching':
        periods_3.append(j[1])


#0 "Complete"
#1 "Data not in the Vault"
#2 "Data not in the database"
#3 "Data not matching"          

In [27]:
# Sorting the lists by ascending order:
periods_0.sort()
periods_1.sort()
periods_2.sort()
periods_3.sort()

In [28]:
excel_file

Unnamed: 0_level_0,Corporate - Assets - Total Table,Superannuation - Assets - Total Table,Public Fund (Gov) - Assets - Total Table,Union/Multi-Emp - Assets - Total Table,Found&Endow - Assets - Total Table,Health Care - Assets - Total Table,Insurance - Assets - Total Table,High Net Worth - Assets - Total Table,Wrap Accounts - Assets - Total Table,Sub-Advised - Assets - Total Table,...,Range - $10 Million - $100 Million,Total Assets in Range - $10 Million - $100 Million,Total Accounts in Range - $10 Million - $100 Million,Range - $100 Million - $500 Million,Total Assets in Range - $100 Million - $500 Million,Total Accounts in Range - $100 Million - $500 Million,Range - > $500 Million,Total Assets in Range - > $500 Million,Total Accounts in Range - > $500 Million,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12/2023,,,,,,,,,,,...,,,,,,,,,,Complete
09/2023,1.0,,1.0,,,,1.0,,,1.0,...,,1.0,1.0,,1.0,1.0,,1.0,1.0,Data not in the Vault
06/2023,1.0,,1.0,,,,1.0,,,1.0,...,,,,,,,,,,Data not in the Vault
03/2023,1.0,,1.0,,,,1.0,,1.0,1.0,...,,1.0,1.0,,1.0,1.0,,1.0,1.0,Data not in the Vault
12/2022,1.0,,1.0,,,,1.0,,1.0,1.0,...,,1.0,1.0,,1.0,1.0,,1.0,1.0,Data not in the Vault
09/2022,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,1.0,1.0,,1.0,1.0,,1.0,1.0,Data not in the Vault


#### **✨Output DataFrame:**

I'll be now creating a dictionary to compile the information and have the output as a dataframe by gathering the description found for the respective periods so that they can be inspected.

In [29]:
# Loading the first sheet "Table of Contents" to obtain information that can be input into the output dataframe:
excel_file_content = pd.read_excel(file_path+'/'+file_names[3])

In [30]:
# A description list is created to put in the final review without considering empty period lists:
description = []

if periods_1 := periods_1: description.append("✔ Data not in the Vault for the periods: {}".format((periods_1)).replace("'",'').replace('[','').replace(']',''))
if periods_2 := periods_2: description.append("✔ Data not in the database for the periods: {}".format((periods_2)).replace("'",'').replace('[','').replace(']',''))
if periods_3 := periods_3: description.append("✔ Data not matching for the periods: {}".format((periods_3)).replace("'",'').replace('[','').replace(']',''))

In [31]:
# Checking a sample of the final description:
description

['✔ Data not in the Vault for the periods: 03/2023, 06/2023, 09/2022, 09/2023, 12/2022']

In [32]:
# Building the dictionary to then transform it into a dataframe:

dict = {'Database': excel_file_content.iloc[4][1],      # Database name e.g. "Wilshire"
        excel_file_orig.iloc[6][1]: description,                  # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
        }                                               

In [33]:
# Creating a new dataframe that will sum up the findings in the Data Auditor:
output_df = pd.DataFrame([dict])

In [35]:
# Putting each description in a single line (this may duplicate the database name):
output_df0 = output_df.explode(excel_file_orig.iloc[6][1])

# Setting the column width to the max so the whole line can be read:
pd.set_option('display.max_colwidth', -1)

# Sample of the final review:
output_df0

Unnamed: 0,Database,Core Fixed Income Composite (P73285)
0,Callan Assoc.,"✔ Data not in the Vault for the periods: 03/2023, 06/2023, 09/2022, 09/2023, 12/2022"


### Checking code for individual files

**APX**

In [215]:
# This script will generate an ouput Excel file for APX user.

# This Python code will review the original Data Gap Auditor files generated from the Vault, 
# and will sum up what has been found in those files.
# The code will work to create 2 separate Excel files and will loop through each dataset 
# to be worked on: AUM, Performance, Holdings, and Characteristics.

# The code will then work on that dataset and will create an output Excel file for for each user (APX and Client)
# that will contain a table summing up all the findings. It will also include
# all databases and the products/vehicles stated in the sheet_names variable.

# All the original files are put into the main folder: Manulife_DataAuditor, within it, 
# the folder contains four (4) folders for each dataset separately and within each, 
# the Data_Audit_Report Excel files for each database downloaded from the Vault
# Here's an example of main folder organization for this code: Manulife_DataAuditor/Performance/Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx

# Importing Python libraries that will be used:
import os
import glob 
import pandas as pd
from tqdm import tqdm
import time

import warnings
warnings.filterwarnings("ignore")

start_time = time.time()


# Setting dataset types:
datasets = ['AUM', 'Performance', 'Holdings', 'Characteristics']


print('Reviewing for the user: APX')

#for dataset in datasets:
# Vehicles that we're interested in are being listed here:
# Core Fixed Income	                    12776	P73285
# Core Plus Fixed Income	            12777	P74285 
# Global Quality Value	                12783	P85285
# Strategic Fixed Income	            12811	P121285
# Strategic Fixed Income Opportunities	12812	P126285
# US Small Cap Core	                    12823	P147285
sheet_names = ['P73285', 'P74285', 'P85285', 'P121285', 'P126285', 'P147285']
print('Checking dataset ')
# Setting file path. We'll be opening first the Performance folder:
absolute_path = "C:/Users/l.arguello/Downloads/Manulife_DataAuditor/"
# Full file path:
file_path = absolute_path + 'AUM'
# Using glob to get all the Excel file names in the selected folder, to loop through them:
csv_files = glob.glob(os.path.join(file_path, "[!~]*.xlsx")) # [!~] to ignore temporary/opened files
# Empty list to store file names from folder:
file_names = []
# Loop over the list of Excel files: 
for f in tqdm(csv_files, desc="Loading…",ascii=False, ncols=75):        
        time.sleep(0.03) 
        # Print the location and filename 
        print('File Name:', f.split("\\")[-1]) 
        # Add each Excel file name to file_names list 
        file_names.append(f.split("\\")[-1])      
print("Complete.")

Reviewing for the user: APX
Checking dataset 


Loading…:   0%|                                     | 0/14 [00:00<?, ?it/s]

Loading…:  29%|████████▎                    | 4/14 [00:00<00:00, 29.43it/s]

File Name: Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
File Name: Data_Audit_Report_Broadridge_1_2024.xlsx
File Name: Data_Audit_Report_Callan_Assoc_1_2024.xlsx
File Name: Data_Audit_Report_Camradata_1_2024.xlsx
File Name: Data_Audit_Report_eVestment_Alliance_1_2024.xlsx
File Name: Data_Audit_Report_Global_Fund_Search_1_2024.xlsx


Loading…:  50%|██████████████▌              | 7/14 [00:00<00:00, 28.89it/s]

File Name: Data_Audit_Report_Global_Manager_Research_1_2024.xlsx
File Name: Data_Audit_Report_Investment_Metrics_1_2024.xlsx
File Name: Data_Audit_Report_LCG_Assoc_1_2024.xlsx


Loading…:  71%|████████████████████        | 10/14 [00:00<00:00, 27.37it/s]

File Name: Data_Audit_Report_Mercer_1_2024.xlsx
File Name: Data_Audit_Report_Morningstar_1_2024.xlsx
File Name: Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx


Loading…: 100%|████████████████████████████| 14/14 [00:00<00:00, 27.76it/s]

File Name: Data_Audit_Report_PSN_Informa_1_2024.xlsx
File Name: Data_Audit_Report_Wilshire_1_2024.xlsx
Complete.





In [95]:
file_names

['Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx',
 'Data_Audit_Report_Broadridge_1_2024.xlsx',
 'Data_Audit_Report_Callan_Assoc_1_2024.xlsx',
 'Data_Audit_Report_Camradata_1_2024.xlsx',
 'Data_Audit_Report_eVestment_Alliance_1_2024.xlsx',
 'Data_Audit_Report_Global_Fund_Search_1_2024.xlsx',
 'Data_Audit_Report_Global_Manager_Research_1_2024.xlsx',
 'Data_Audit_Report_Investment_Metrics_1_2024.xlsx',
 'Data_Audit_Report_LCG_Assoc_1_2024.xlsx',
 'Data_Audit_Report_Mercer_1_2024.xlsx',
 'Data_Audit_Report_Morningstar_1_2024.xlsx',
 'Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx',
 'Data_Audit_Report_PSN_Informa_1_2024.xlsx',
 'Data_Audit_Report_Wilshire_1_2024.xlsx']

In [96]:
sheet_names

['P73285', 'P74285', 'P85285', 'P121285', 'P126285', 'P147285']

In [216]:
excel_file

Unnamed: 0_level_0,Corporate - Assets - Total Table,Superannuation - Assets - Total Table,Public Fund (Gov) - Assets - Total Table,Union/Multi-Emp - Assets - Total Table,Found&Endow - Assets - Total Table,Health Care - Assets - Total Table,Insurance - Assets - Total Table,High Net Worth - Assets - Total Table,Wrap Accounts - Assets - Total Table,Sub-Advised - Assets - Total Table,Supranationals - Assets - Total Table,Sov Wealth Funds - Assets - Total Table,Other - Assets - Total Table,Calculated Total - Assets - Total Table,Corporate - Accounts - Total Table,Superannuation - Accounts - Total Table,Public Fund (Gov) - Accounts - Total Table,Union/Multi-Emp - Accounts - Total Table,Found&Endow - Accounts - Total Table,Health Care - Accounts - Total Table,Insurance - Accounts - Total Table,High Net Worth - Accounts - Total Table,Wrap Accounts - Accounts - Total Table,Sub-Advised - Accounts - Total Table,Supranationals - Accounts - Total Table,Sov Wealth Funds - Accounts - Total Table,Other - Accounts - Total Table,Calculated Total - Accounts - Total Table,Corporate - Assets - Taxable Table,Superannuation - Assets - Taxable Table,Public Fund (Gov) - Assets - Taxable Table,Union/Multi-Emp - Assets - Taxable Table,Found&Endow - Assets - Taxable Table,Health Care - Assets - Taxable Table,Insurance - Assets - Taxable Table,High Net Worth - Assets - Taxable Table,Wrap Accounts - Assets - Taxable Table,Sub-Advised - Assets - Taxable Table,Supranationals - Assets - Taxable Table,Sov Wealth Funds - Assets - Taxable Table,Other - Assets - Taxable Table,Calculated Total - Assets - Taxable Table,Corporate - Accounts - Taxable Table,Superannuation - Accounts - Taxable Table,Public Fund (Gov) - Accounts - Taxable Table,Union/Multi-Emp - Accounts - Taxable Table,Found&Endow - Accounts - Taxable Table,Health Care - Accounts - Taxable Table,Insurance - Accounts - Taxable Table,High Net Worth - Accounts - Taxable Table,Wrap Accounts - Accounts - Taxable Table,Sub-Advised - Accounts - Taxable Table,Supranationals - Accounts - Taxable Table,Sov Wealth Funds - Accounts - Taxable Table,Other - Accounts - Taxable Table,Calculated Total - Accounts - Taxable Table,Corporate - Assets - Tax Exempt Table,Superannuation - Assets - Tax Exempt Table,Public Fund (Gov) - Assets - Tax Exempt Table,Union/Multi-Emp - Assets - Tax Exempt Table,Found&Endow - Assets - Tax Exempt Table,Health Care - Assets - Tax Exempt Table,Insurance - Assets - Tax Exempt Table,High Net Worth - Assets - Tax Exempt Table,Wrap Accounts - Assets - Tax Exempt Table,Sub-Advised - Assets - Tax Exempt Table,Supranationals - Assets - Tax Exempt Table,Sov Wealth Funds - Assets - Tax Exempt Table,Other - Assets - Tax Exempt Table,Calculated Total - Assets - Tax Exempt Table,Corporate - Accounts - Tax Exempt Table,Superannuation - Accounts - Tax Exempt Table,Public Fund (Gov) - Accounts - Tax Exempt Table,Union/Multi-Emp - Accounts - Tax Exempt Table,Found&Endow - Accounts - Tax Exempt Table,Health Care - Accounts - Tax Exempt Table,Insurance - Accounts - Tax Exempt Table,High Net Worth - Accounts - Tax Exempt Table,Wrap Accounts - Accounts - Tax Exempt Table,Sub-Advised - Accounts - Tax Exempt Table,Supranationals - Accounts - Tax Exempt Table,Sov Wealth Funds - Accounts - Tax Exempt Table,Other - Accounts - Tax Exempt Table,Calculated Total - Accounts - Tax Exempt Table,Corporate - Assets - Institutional Table,Superannuation - Assets - Institutional Table,Public Fund (Gov) - Assets - Institutional Table,Union/Multi-Emp - Assets - Institutional Table,Found&Endow - Assets - Institutional Table,Health Care - Assets - Institutional Table,Insurance - Assets - Institutional Table,High Net Worth - Assets - Institutional Table,Wrap Accounts - Assets - Institutional Table,Sub-Advised - Assets - Institutional Table,Supranationals - Assets - Institutional Table,Sov Wealth Funds - Assets - Institutional Table,Other - Assets - Institutional Table,Calculated Total - Assets - Institutional Table,Corporate - Accounts - Institutional Table,Superannuation - Accounts - Institutional Table,Public Fund (Gov) - Accounts - Institutional Table,Union/Multi-Emp - Accounts - Institutional Table,Found&Endow - Accounts - Institutional Table,Health Care - Accounts - Institutional Table,Insurance - Accounts - Institutional Table,High Net Worth - Accounts - Institutional Table,Wrap Accounts - Accounts - Institutional Table,Sub-Advised - Accounts - Institutional Table,Supranationals - Accounts - Institutional Table,Sov Wealth Funds - Accounts - Institutional Table,Other - Accounts - Institutional Table,Calculated Total - Accounts - Institutional Table,Total Assets - Defined Contribution Table,Taxable Assets - Defined Contribution Table,Tax-Exempt Assets - Defined Contribution Table,Institutional Assets - Defined Contribution Table,Tax-Exempt Accounts - Defined Contribution Table,Institutional Accounts - Defined Contribution Table,United States - Assets Table,Canada - Assets Table,United Kingdom - Assets Table,Japan - Assets Table,Australia - Assets Table,Hong Kong - Assets Table,Singapore - Assets Table,Other Asia ex-Japan - Assets Table,Africa/Middle East - Assets Table,Latin America - Assets Table,Other - Assets Table,Europe ex-UK - Assets Table,Denmark - Assets Table,Eastern Europe - Assets Table,Finland - Assets Table,France - Assets Table,Germany - Assets Table,Italy - Assets Table,Netherlands - Assets Table,Norway - Assets Table,Spain - Assets Table,Sweden - Assets Table,Switzerland - Assets Table,Other Europe - Assets Table,United States - Accounts Table,Canada - Accounts Table,United Kingdom - Accounts Table,Japan - Accounts Table,Australia - Accounts Table,Hong Kong - Accounts Table,Singapore - Accounts Table,Other Asia ex-Japan - Accounts Table,Africa/Middle East - Accounts Table,Latin America - Accounts Table,Other - Accounts Table,Europe ex-UK - Accounts Table,Denmark - Accounts Table,Eastern Europe - Accounts Table,Finland - Accounts Table,France - Accounts Table,Germany - Accounts Table,Italy - Accounts Table,Netherlands - Accounts Table,Norway - Accounts Table,Spain - Accounts Table,Sweden - Accounts Table,Switzerland - Accounts Table,Other Europe - Accounts Table,Total - Assets Table,Separate Account - Assets Table,Pooled/Commingled - Assets Table,Mutual Fund Institutional - Assets Table,Mutual Fund Retail - Assets Table,Total - Accounts Table,Separate Account - Accounts Table,Pooled/Commingled - Accounts Table,Mutual Fund Institutional - Accounts Table,Mutual Fund Retail - Accounts Table,Accounts - Gained Table,Assets ($M) - Gained Table,% Gained - Gained Table,Accounts - Lost Table,Assets ($M) - Lost Table,% Lost - Lost Table,Rank - Largest Client 1,Account Type - Largest Client 1,Aggregate Account Size - Largest Client 1,Rank - Largest Client 2,Account Type - Largest Client 2,Aggregate Account Size - Largest Client 2,Rank - Largest Client 3,Account Type - Largest Client 3,Aggregate Account Size - Largest Client 3,Rank - Largest Client 4,Account Type - Largest Client 4,Aggregate Account Size - Largest Client 4,Rank - Largest Client 5,Account Type - Largest Client 5,Aggregate Account Size - Largest Client 5,Range - < $1 Million,Total Assets in Range - < $1 Million,Total Accounts in Range - < $1 Million,Range - $1 Million - $10 Million,Total Assets in Range - $1 Million - $10 Million,Total Accounts in Range - $1 Million - $10 Million,Range - $10 Million - $100 Million,Total Assets in Range - $10 Million - $100 Million,Total Accounts in Range - $10 Million - $100 Million,Range - $100 Million - $500 Million,Total Assets in Range - $100 Million - $500 Million,Total Accounts in Range - $100 Million - $500 Million,Range - > $500 Million,Total Assets in Range - > $500 Million,Total Accounts in Range - > $500 Million
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1
12/2023,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,< $ Million,,,$ Million - $ Million,,,$ Million - $ Million,,,$ Million - $ Million,,,> $ Million,,
09/2023,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,,"<NO APX> / $,",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,,,,<NO APX> / $,,,,,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,,"<NO APX> / $,",,,,,,,,,,,,,,,,,,,,,,,,,"<NO APX> / $,",,,,,<NO APX> / $,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Sub-Advised,,,Sub-Advised,,,Public / Public (Government),,,Sub-Advised,,,Corporate,,< $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,> $ Million,"<NO APX> / $,",
06/2023,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,,"<NO APX> / $,",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,,,,<NO APX> / $,,,,,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,,"<NO APX> / $,",,,,"/ $,",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Sub-Advised,,,Sub-Advised,,,Public / Public (Government),,,Corporate,,,Other,,< $ Million,,,$ Million - $ Million,,,$ Million - $ Million,,,$ Million - $ Million,,,> $ Million,,
03/2023,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,<NO APX> / $,"<NO APX> / $,",,,,"<NO APX> / $,",,,,,,,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,"<NO APX> / $,",,,,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,,,,<NO APX> / $,,,,<NO APX> / $,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,,"<NO APX> / $,",,,,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,,,<NO APX> / $,,,"<NO APX> / $,",,,,,<NO APX> / $,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<NO APX> / $,","<NO APX> / $,",<NO APX> / $,"<NO APX> / $,",,,,,,,,<NO APX> / $,%,,<NO APX> / $,%,,<NO APX> / Sub-Advised,"<NO APX> / $,",,<NO APX> / Sub-Advised,"<NO APX> / $,",,<NO APX> / Public (Government),<NO APX> / $,,<NO APX> / Corporate,<NO APX> / $,,<NO APX> / Wrap Account,<NO APX> / $,< $ Million,<NO APX> / $,,$ Million - $ Million,,,$ Million - $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,> $ Million,"<NO APX> / $,",
12/2022,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,<NO APX> / $,"<NO APX> / $,",,,,"<NO APX> / $,",,,,,,,,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,"<NO APX> / $,",,,,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,,,,<NO APX> / $,,,,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,,<NO APX> / $,,,,<NO APX> / $,,,"<NO APX> / $,",,,,"<NO APX> / $,",,,,,,,,,,,,,,,,,,,,,"<NO APX> / $,",,,,,<NO APX> / $,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"<NO APX> / $,","<NO APX> / $,",<NO APX> / $,,,,,,,,,<NO APX> / $,%,,<NO APX> / $,%,,<NO APX> / Sub-Advised,"<NO APX> / $,",,<NO APX> / Sub-Advised,"<NO APX> / $,",,<NO APX> / Public (Government),<NO APX> / $,,<NO APX> / Corporate,<NO APX> / $,,<NO APX> / Corporate,<NO APX> / $,< $ Million,<NO APX> / $,,$ Million - $ Million,,,$ Million - $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,> $ Million,"<NO APX> / $,",
09/2022,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,"<NO APX> / $,",<NO APX> / $,<NO APX> / $,<NO APX> / $,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,"<NO APX> / $,",<NO APX> / $,<NO APX> / $,<NO APX> / $,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,,,,,,,,,,,,,,,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,"<NO APX> / $,",<NO APX> / $,<NO APX> / $,<NO APX> / $,"<NO APX> / $,",,,,,,,,,,,,,,,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,,,"<NO APX> / $,",<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,<NO APX> / $,,,,,,,,,,,,,,,,,,,,,,,,,"<NO APX> / $,","<NO APX> / $,",<NO APX> / $,<NO APX> / $,<NO APX> / $,,,,,,,<NO APX> / $,%,,<NO APX> / $,%,,<NO APX> / Sub-Advised,"<NO APX> / $,",,<NO APX> / Sub-Advised,"<NO APX> / $,",,<NO APX> / Public (Government),<NO APX> / $,,<NO APX> / Insurance,<NO APX> / $,,<NO APX> / Sub-Advised,<NO APX> / $,< $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,$ Million - $ Million,<NO APX> / $,,> $ Million,"<NO APX> / $,",


In [217]:
final_dict = []
nodata_ = [] 

excel_file = []

# For loop to select the Excel file:
for i in range(len(file_names)):
    print('Checking file name: ', file_names[i])
    # This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
    # For loop to select the sheet name (vehicle):
    for j in range(len(sheet_names)):
            try:
                # For loop to select the sheet name (vehicle):
                    # Will do a try and except since there are sheets that don't exist in the files, so the code doesn't crash:
                print('Checking sheet name: ', sheet_names[j])
                        # Defining the Excel file to be openned and the sheet we need from the book:
                excel_file_orig = pd.read_excel(file_path+'/'+file_names[i], sheet_name=sheet_names[j])
                    # If sheet is not found then let's try this so the code can continue:
            except:
    
                #print('No sheet found for the vehicle {}'.format(sheet_names[j]))
                dict_ = {'Database': excel_file_content.iloc[4][1],            # Database name e.g. "Wilshire"
                excel_file_orig.iloc[6][1]: "No audit data generated.",        # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
                    } 
                nodata_.append(dict_) # Adding the respective database and vehicle name that does not exist to list
                output_df_ = pd.DataFrame(nodata_).groupby(['Database']).sum() # Grouping dataframe by database
                continue
                # Selecting the header names placed in row 7 (row in which we start to have some relevant information to gather):
            excel_file_orig.rename(columns = excel_file_orig.iloc[7], inplace= True)
                # Selecting the rows with data and resetting the index:  
            excel_file = excel_file_orig[7:][1:].set_index(['Date'], drop=True)
                # Checking data type of all columns in the file:

                # Date column does not have the correct type, the others are mixed due to special characters being in them such as /
                # We need information from 09/2022 onwards, so I'll be turning Date column into correct type and then filter by date:
            excel_file.index = pd.to_datetime(excel_file.index)
                # Selecting data in the dataframe by the correct date:
            excel_file = excel_file[~(excel_file.index < '09/2022')]
                # Setting up the correct format for the index/Date column
            excel_file.index = excel_file.index.strftime("%m/%Y")
                # Dropping rows and columns in which all the cells contain NaN values:
            excel_file = excel_file.dropna(how='all', axis=0).dropna(how='all', axis=1) 

            
            for n in range(0, excel_file.shape[1]):
                     
                    for m,p in enumerate(excel_file[excel_file.columns[(n)]]):
                                                    
                            try:
                                if (int(float(p) >= 0)) or (int(float(p) <= 0)):
                                    excel_file[excel_file.columns[n]][m] = ''
                            except:
                                excel_file = excel_file.replace({r'(-?[0-9\.]+)\s*/ <NO DATA>': '1',     # Data not in database
                                                 r'<NO APX> \s*/ (-?[0-9\.]+)': '',                        # Data not in Vault
                                                 r'(-?[0-9\.]+)\s*/ (-?[0-9\.]+)': '',                     # Data not matching
                                                 r'(-?[0-9\.]+)': ''},                                     # Complete
                                                  regex=True) 
     

            excel_file.fillna('', inplace=True)
            excel_file['Review'] = excel_file[excel_file.columns[0:]].apply(lambda x: ''.join(x.astype(str)), axis=1


            for m,p in enumerate(excel_file)['Review']:
                    if (any('1' in k for k in p)):
                    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 1')  
                    else:  
                    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'No annotation')

                        #df_apx_list.append(df_apx[i])
        # This will check for the data is not in the database/vault/matching:
        #df_apx = df_apx.replace({r'(-?[0-9\.]+)\s*/ <NO DATA>': '1',     # Data not in database
        #                     r'<NO APX> \s*/ (-?[0-9\.]+)': '',                        # Data not in Vault
        #                     r'(-?[0-9\.]+)\s*/ (-?[0-9\.]+)': '',                     # Data not matching
        #                     r'(-?[0-9\.]+)': ''},                                     # Complete
        #                      regex=True)       
    
    # Let's fill the NaN values for easier further processes:
    #df_apx[i].fillna('', inplace=True)
    #df_apx_list.append(df_apx[i])
    # Putting the dummy variables in a single column named 'Review':

Checking file name:  Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name:  P126285
Checking sheet name:  P147285
Checking file name:  Data_Audit_Report_Broadridge_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name:  P126285
Checking sheet name:  P147285
Checking file name:  Data_Audit_Report_Callan_Assoc_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name:  P126285
Checking sheet name:  P147285
Checking file name:  Data_Audit_Report_Camradata_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name:  P126285
Checking sheet name:  P147285
Checking file name:  Data_Aud

KeyboardInterrupt: 

In [224]:
for n in range(0, excel_file.shape[1]):
    for m,p in enumerate(excel_file[excel_file.columns[(n)]]):
        if (r'(-?[0-9\.]+)\s*/ <NO DATA>') in p:
            print('a')

TypeError: argument of type 'float' is not iterable

In [None]:
excel_file

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
09/2023,,,,,,,,,,,
06/2023,,,,,,,,,,,
03/2023,,,,,,,,,,,
12/2022,,,,,,,,,,,
09/2022,,,,,,,,,,,


In [201]:
df_apx_list[4]#.replace({r'(-?[0-9\.]+)\s*/ <NO DATA>': '4',     # Data not in database
              #              r'<NO APX> \s*/ (-?[0-9\.]+)': '',                        # Data not in Vault
              #              r'(-?[0-9\.]+)\s*/ (-?[0-9\.]+)': '',                     # Data not matching
              #              r'(-?[0-9\.]+)': '1'},                                     # Complete
              #               regex=True)       

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
09/2023,,,,,,,,,,
06/2023,,,,,,,,,,
03/2023,<NO APX> / 1828.68,<NO APX> / 1798.67,<NO APX> / 30.02,,,<NO APX> / 5,<NO APX> / 2,<NO APX> / 5.22,<NO APX> / 0,<NO APX> / 0
12/2022,<NO APX> / 1690.89,<NO APX> / 1667.55,<NO APX> / 23.33,,,<NO APX> / 3,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0
09/2022,<NO APX> / 1579.56,<NO APX> / 1557.44,<NO APX> / 22.12,<NO APX> / 0,<NO APX> / 0,<NO APX> / 3,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0


In [103]:
df_apx['Review'] = df_apx[df_apx.columns[0:]].apply(lambda x: ''.join(x.astype(str)), axis=1)
df_apx

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
09/2023,,,,,,,,,,,
06/2023,,,,,,,,,,,
03/2023,,,,,,,,,,,
12/2022,,,,,,,,,,,
09/2022,,,,,,,,,,,


In [105]:
df_apx

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
09/2023,,,,,,,,,,,No annotation
06/2023,,,,,,,,,,,No annotation
03/2023,,,,,,,,,,,No annotation
12/2022,,,,,,,,,,,No annotation
09/2022,,,,,,,,,,,No annotation


In [106]:
# For loop to select the Excel file:
for i in range(len(file_names)):
    print('Checking file name: ', file_names[i])
    # This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
    # For loop to select the sheet name (vehicle):
    for j in range(len(sheet_names)):
        # Creating a list for each of the periods in the Review column:
        periods_1 = []
        
        # A description list is created to put in the final review without considering empty period lists:
        description_apx = []
        
        # Gathering all the periods (month/year) for which each of these conditions stated above are present:
        for m,p in enumerate(zip(df_apx['Review'], df_apx.index)):                    
            if p[0] == 'Priority 1':
                periods_1.append(p[1])
            elif periods_1 == periods_1:
                description_apx.append("No annotation".format((list(set(periods_1)))).replace("'",'').replace('[','').replace(']',''))
        #0 "Complete"
        #1 "Data not in the database"
        #2 "Data not in the Vault"
        #3 "Data not matching"   
        
        
        if periods_1 := periods_1: description_apx.append("● Priority 1: {}\n".format((list(set(periods_1)))).replace("'",'').replace('[','').replace(']',''))

Checking file name:  Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
Checking file name:  Data_Audit_Report_Broadridge_1_2024.xlsx
Checking file name:  Data_Audit_Report_Callan_Assoc_1_2024.xlsx
Checking file name:  Data_Audit_Report_Camradata_1_2024.xlsx
Checking file name:  Data_Audit_Report_eVestment_Alliance_1_2024.xlsx
Checking file name:  Data_Audit_Report_Global_Fund_Search_1_2024.xlsx
Checking file name:  Data_Audit_Report_Global_Manager_Research_1_2024.xlsx
Checking file name:  Data_Audit_Report_Investment_Metrics_1_2024.xlsx
Checking file name:  Data_Audit_Report_LCG_Assoc_1_2024.xlsx
Checking file name:  Data_Audit_Report_Mercer_1_2024.xlsx
Checking file name:  Data_Audit_Report_Morningstar_1_2024.xlsx
Checking file name:  Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx
Checking file name:  Data_Audit_Report_PSN_Informa_1_2024.xlsx
Checking file name:  Data_Audit_Report_Wilshire_1_2024.xlsx


In [107]:
description_apx

['No annotation',
 'No annotation',
 'No annotation',
 'No annotation',
 'No annotation']

In [108]:
# For loop to select the Excel file:
for i in range(len(file_names)):
    print('Checking file name: ', file_names[i])
    # This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
    # For loop to select the sheet name (vehicle):
    for j in range(len(sheet_names)):
                                
                # Loading the first sheet "Table of Contents" to obtain information that can be input into the output dataframe:
                excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 

                # Building the dictionary that will contain the genetal information: database and description, to then transform it into a dataframe:
                dict = {'Database': excel_file_content.iloc[4][1],      # Database name e.g. "Wilshire"
                        excel_file_orig.iloc[6][1]: description_apx,        # Product/vehicle name with review description of findings e.g. "Core Fixed Income Composite (P73285)"
                        }  
                # Creating a new dataframe that will sum up the findings in the Data Auditor        
                output_df = pd.DataFrame([dict])

                # Putting each description in a single line (this may duplicate the database name):
                output_df0 = output_df.explode(excel_file_orig.iloc[6][1])

                # Final dict
                final_dict.append(output_df0)

                # Transforming into a dataframe the last dictionary with the review description:
                final_dict_ = pd.concat(final_dict) 

                # final_dict_ has a numerical index, whilst output_df_ has databases as its index, so we'll arrange that:
                final_dict_.set_index("Database",drop=True, inplace=True)

Checking file name:  Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
Checking file name:  Data_Audit_Report_Broadridge_1_2024.xlsx
Checking file name:  Data_Audit_Report_Callan_Assoc_1_2024.xlsx
Checking file name:  Data_Audit_Report_Camradata_1_2024.xlsx
Checking file name:  Data_Audit_Report_eVestment_Alliance_1_2024.xlsx
Checking file name:  Data_Audit_Report_Global_Fund_Search_1_2024.xlsx
Checking file name:  Data_Audit_Report_Global_Manager_Research_1_2024.xlsx
Checking file name:  Data_Audit_Report_Investment_Metrics_1_2024.xlsx
Checking file name:  Data_Audit_Report_LCG_Assoc_1_2024.xlsx
Checking file name:  Data_Audit_Report_Mercer_1_2024.xlsx
Checking file name:  Data_Audit_Report_Morningstar_1_2024.xlsx
Checking file name:  Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx
Checking file name:  Data_Audit_Report_PSN_Informa_1_2024.xlsx
Checking file name:  Data_Audit_Report_Wilshire_1_2024.xlsx


In [86]:
# For loop to select the Excel file:
for i in range(len(file_names)):
    print('Checking file name: ', file_names[i])
    # This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
    # For loop to select the sheet name (vehicle):
    for j in range(len(sheet_names)):
        print(output_df)

Checking file name:  Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation, No annotation, ...
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation, No annotation, ...
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation, No annotation, ...
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation, No annotation, ...
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation, No annotation, ...
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation, No annotation, ...
Checking file name:  Data_Audit_Report_Broadridge_1_2024.xlsx
   Database               Core Fixed Income Composite (P73285)
0  Wilshire  [No annotation, No annotation,

In [109]:
final_dict_

Unnamed: 0_level_0,U.S Small-Cap Core Composite (P147285)
Database,Unnamed: 1_level_1
Alpha Portfolio Advisors,No annotation
Alpha Portfolio Advisors,No annotation
Alpha Portfolio Advisors,No annotation
Alpha Portfolio Advisors,No annotation
Alpha Portfolio Advisors,No annotation
...,...
Wilshire,No annotation
Wilshire,No annotation
Wilshire,No annotation
Wilshire,No annotation


In [129]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [131]:
a = pd.DataFrame(output_df_)
a

Unnamed: 0_level_0,Core Fixed Income Composite (P73285),Strategic Fixed Income Composite (P121285),U.S Small-Cap Core Composite (P147285)
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alpha Portfolio Advisors,No audit data generated.,No audit data generated.,0
Broadridge,0,0,No audit data generated.No audit data generate...
Camradata,0,0,No audit data generated.
Global Fund Search,0,0,No audit data generated.No audit data generate...
Global Manager Research,0,0,No audit data generated.No audit data generate...
Mercer,0,0,No audit data generated.No audit data generate...
Preqin Hedge Fund Analyst,0,0,No audit data generated.No audit data generate...
eVestment Alliance,0,0,No audit data generated.No audit data generate...


In [122]:
a.groupby('Database')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E5BB6C1BA0>

In [88]:
# Joining these two tables together and grouping them by database:
review_file = pd.merge(final_dict_, output_df_, on='Database', how='outer').groupby('Database').sum()
review_file

Unnamed: 0_level_0,Core Fixed Income Composite (P73285)_x,Core Fixed Income Composite (P73285)_y
Database,Unnamed: 1_level_1,Unnamed: 2_level_1
Alpha Portfolio Advisors,No annotationNo annotationNo annotation● Prior...,0
Broadridge,No annotationNo annotationNo annotation● Prior...,0
Callan Assoc.,No annotationNo annotationNo annotation● Prior...,0
Camradata,No annotationNo annotationNo annotation● Prior...,0
Global Fund Search,No annotationNo annotationNo annotation● Prior...,0
Global Manager Research,No annotationNo annotationNo annotation● Prior...,0
Investment Metrics,No annotationNo annotationNo annotation● Prior...,No audit data generated.No audit data generate...
LCG Assoc.,No annotationNo annotationNo annotation● Prior...,0
Mercer,No annotationNo annotationNo annotation● Prior...,0
Morningstar,No annotationNo annotationNo annotation● Prior...,0


In [46]:
review_file

Unnamed: 0_level_0,U.S Small-Cap Core Composite (P147285)_x,Core Fixed Income Composite (P73285),Strategic Fixed Income Composite (P121285)
Database,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alpha Portfolio Advisors,No audit data generated.,No audit data generated.,No audit data generated.
Broadridge,No audit data generated.,No audit data generated.,No audit data generated.
Camradata,No audit data generated.,No audit data generated.,No audit data generated.
Global Fund Search,No audit data generated.,No audit data generated.,No audit data generated.
Global Manager Research,No audit data generated.,No audit data generated.,No audit data generated.
Mercer,No audit data generated.,No audit data generated.,No audit data generated.
Preqin Hedge Fund Analyst,No audit data generated.,No audit data generated.,No audit data generated.
Wilshire,No annotationNo annotationNo annotationNo anno...,No audit data generated.,No audit data generated.
eVestment Alliance,No audit data generated.,No audit data generated.,No audit data generated.


In [89]:
# Sorting column names and Database names:
review_file = review_file.reindex(sorted(review_file.columns), axis=1)
# Sorting index alphabetically (case insensitive):
review_file = review_file.reindex(index=(sorted(review_file.index, key=lambda s: s.lower())))
# Making sure index name doesn't get lost:
review_file.index.name = 'Database'
# Output path with respective name:
# excel_output = r'C:\Users\l.arguello\Documents\Python Scripts\APX_automation_reports\output\data_auditor_review\DataAuditor_review_{}_APX.xlsx'.format(dataset)

# Adding legend/keys table:
legend_dict = {'Priority 1': "" }
legend_keys = pd.DataFrame([legend_dict])
legend_keys = legend_keys.set_axis(['Legend'], axis='index').transpose()

In [90]:
review_file

Unnamed: 0_level_0,Core Fixed Income Composite (P73285)_x,Core Fixed Income Composite (P73285)_y
Database,Unnamed: 1_level_1,Unnamed: 2_level_1
Alpha Portfolio Advisors,No annotationNo annotationNo annotation● Prior...,0
Broadridge,No annotationNo annotationNo annotation● Prior...,0
Callan Assoc.,No annotationNo annotationNo annotation● Prior...,0
Camradata,No annotationNo annotationNo annotation● Prior...,0
eVestment Alliance,No annotationNo annotationNo annotation● Prior...,0
Global Fund Search,No annotationNo annotationNo annotation● Prior...,0
Global Manager Research,No annotationNo annotationNo annotation● Prior...,0
Investment Metrics,No annotationNo annotationNo annotation● Prior...,No audit data generated.No audit data generate...
LCG Assoc.,No annotationNo annotationNo annotation● Prior...,0
Mercer,No annotationNo annotationNo annotation● Prior...,0


In [None]:

# ___________________________________________________________ Data to add in chart ________________________________________________________________
            ## Using this for loop to gather data for the APX file review:
            #                    
            #for n in range(0, excel_file.shape[1]):
            #
            #    for m,p in enumerate(excel_file[excel_file.columns[(n)]]):
            #    
            #        # Avoiding code crashes using try/except:    
            #        try:
            #            if float(p) >= 0 or float(p) <= 0:
            #            
            #                excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '0') # "Complete"

            #        except:
            #            if " / <NO DATA>" in p:
            #                excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '1') # "Data not in the database"
            #            elif "<NO APX> / " in p:
            #                excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '2') # "Data not in the Vault" 
            #            elif " / " in p:
            #                excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '3') # "Data not matching"
            #            else:
            #                excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '')  # If the cell does not contaelevant for our analysis/reviewal
# _______________________________________________________________ Data to add in chart ________________________________________________________________

    with pd.ExcelWriter(excel_output, engine="xlsxwriter") as writer:
        writer.book.formats[0].set_text_wrap()  # Update global format with text_wrap
        legend_keys.to_excel(writer, startrow = 1, startcol = 1) # Export to Excel file
        review_file.to_excel(writer, startrow = 6, startcol = 1)     
    # ////////////////////////////////////////////////////////////////////////////////
    # //////////////////////////////////Extra Steps//////////////////////////////////
    # //////////////////////////////////////////////////////////////////////////////        
    ## Accessing the Pandas file and sheet to add plot:
        # Loading worksheet for some formatting:
        worksheet = writer.sheets['Sheet1']
        # Set border color for tables and set vertical alignment of text:
        file_format = writer.book.add_format()
        file_format.set_text_wrap(True)
        file_format.set_border_color('#A6A6A6')
        file_format.set_align('left')
        file_format.set_valign('vcenter')
        for col_num, value in enumerate(review_file.columns.values):    
            header_format = writer.book.add_format({'bold':True, 'fg_color': '#F2F2F2', 'border_color':'black'})
            worksheet.write(6, col_num+2, value, header_format) # Set header format in soft gray color
        worksheet.set_column('B:H', 19.86, file_format)  # Set size of column (19.86 pixels     
        # Formatting cells:
        # Create a format to use in a merged range
        merge_format1 = writer.book.add_format(
            {
                "bold": 1,
                "border": 1,
                "align": "center",
                "valign": "vcenter",
                "fg_color": "#FCE4D6",
            })
        merge_format2 = writer.book.add_format(
            {
                "border": 1,
                "align": "left",
                "valign": "vcenter",
            })
    
        worksheet.merge_range("C2:F2", "Legend", merge_format1)
        worksheet.merge_range("C3:F3", "Data not in the database // APX needs to distribute this data", merge_format2)       
        
        writer.close()
# This is just the time the process took to complete per dataset
timetaken = (time.time() - start_time)/60
print("Task completed in %.2f minutes" % timetaken)