In [1]:

# This Python code will review the original Data Gap Auditor files generated from the Vault, 
# and will sum up what has been found in those files.
# The code will work to create 2 separate Excel files and will loop through each dataset 
# to be worked on: AUM, Performance, Holdings, and Characteristics.

# The code will then work on that dataset and will create an output Excel file for for each user (APX and Client)
# that will contain a table summing up all the findings. It will also include
# all databases and the products/vehicles stated in the sheet_names variable.

# All the original files are put into the main folder: Manulife_DataAuditor, within it, 
# the folder contains four (4) folders for each dataset separately and within each, 
# the Data_Audit_Report Excel files for each database downloaded from the Vault
# Here's an example of main folder organization for this code: Manulife_DataAuditor/Performance/Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx

# Importing Python libraries that will be used:
import os
import glob 
import pandas as pd
from tqdm import tqdm
import time

import warnings
warnings.filterwarnings("ignore")

start_time = time.time()


In [2]:
# Setting the user for which we'll have an unique output:
users = ['APX', 'Client']

# Setting dataset types:
datasets = ['AUM', 'Performance', 'Holdings', 'Characteristics']


In [3]:
users[0]

'APX'

In [10]:
#for datasets in datasets:
# Vehicles that we're interested in are being listed here:
# Core Fixed Income	                    12776	P73285
# Core Plus Fixed Income	            12777	P74285 
# Global Quality Value	                12783	P85285
# Strategic Fixed Income	            12811	P121285
# Strategic Fixed Income Opportunities	12812	P126285
# US Small Cap Core	                    12823	P147285
sheet_names = ['P73285', 'P74285', 'P85285', 'P121285', 'P126285', 'P147285']
print('Checking dataset ', 'AUM')
# Setting file path. We'll be opening first the Performance folder:
absolute_path = "C:/Users/l.arguello/Downloads/Manulife_DataAuditor/"
# Full file path:
file_path = absolute_path + 'AUM'
# Using glob to get all the Excel file names in the selected folder, to loop through them:
csv_files = glob.glob(os.path.join(file_path, "[!~]*.xlsx")) # [!~] to ignore temporary/opened files
# Empty list to store file names from folder:
file_names = []
# Loop over the list of Excel files: 
for f in tqdm(csv_files, desc="Loading…",ascii=False, ncols=75):        
        time.sleep(0.03) 
        # Print the location and filename 
        print('File Name:', f.split("\\")[-1]) 
        # Add each Excel file name to file_names list 
        file_names.append(f.split("\\")[-1])      
print("Complete.")

Checking dataset  AUM


Loading…:  21%|██████▏                      | 3/14 [00:00<00:00, 27.15it/s]

File Name: Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
File Name: Data_Audit_Report_Broadridge_1_2024.xlsx
File Name: Data_Audit_Report_Callan_Assoc_1_2024.xlsx
File Name: Data_Audit_Report_Camradata_1_2024.xlsx
File Name: Data_Audit_Report_eVestment_Alliance_1_2024.xlsx


Loading…:  64%|██████████████████▋          | 9/14 [00:00<00:00, 25.75it/s]

File Name: Data_Audit_Report_Global_Fund_Search_1_2024.xlsx
File Name: Data_Audit_Report_Global_Manager_Research_1_2024.xlsx
File Name: Data_Audit_Report_Investment_Metrics_1_2024.xlsx
File Name: Data_Audit_Report_LCG_Assoc_1_2024.xlsx
File Name: Data_Audit_Report_Mercer_1_2024.xlsx
File Name:

Loading…: 100%|████████████████████████████| 14/14 [00:00<00:00, 26.99it/s]

 Data_Audit_Report_Morningstar_1_2024.xlsx
File Name: Data_Audit_Report_Preqin_Hedge_Fund_Analyst_1_2024.xlsx
File Name: Data_Audit_Report_PSN_Informa_1_2024.xlsx
File Name: Data_Audit_Report_Wilshire_1_2024.xlsx
Complete.





In [11]:
# Creating empty lists that will contain reviewed tables:
final_dict = []
nodata_ =[]        
# For loop to select the Excel file:
for i in range(len(file_names)):
    print('Checking file name: ', file_names[i])
    # This variable will contain the first sheet in the Data Audtor (table of contents) which will be needed to fill information in the tables:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
    # For loop to select the sheet name (vehicle):
    for j in range(len(sheet_names)):
        # Will do a try and except since there are sheets that don't exist in the files, so the code doesn't crash:
        try:
            print('Checking sheet name: ', sheet_names[j])
            # Defining the Excel file to be openned and the sheet we need from the book:
            excel_file_orig = pd.read_excel(file_path+'/'+file_names[i], sheet_name=sheet_names[j])
        # If sheet is not found then let's try this so the code can continue:
        except:
            print('No sheet found for the vehicle {}'.format(sheet_names[j]))
            dict_ = {'Database': excel_file_content.iloc[4][1],            # Database name e.g. "Wilshire"
            excel_file_orig.iloc[6][1]: "No audit data generated.",        # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
                } 
            nodata_.append(dict_) # Adding the respective database and vehicle name that does not exist to list
            output_df_ = pd.DataFrame(nodata_).groupby(['Database']).sum() # Grouping dataframe by database
            continue

Checking file name:  Data_Audit_Report_Alpha_Portfolio_Advisors_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
No sheet found for the vehicle P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name:  P126285
No sheet found for the vehicle P126285
Checking sheet name:  P147285
Checking file name:  Data_Audit_Report_Broadridge_1_2024.xlsx
Checking sheet name:  P73285
No sheet found for the vehicle P73285
Checking sheet name:  P74285
No sheet found for the vehicle P74285
Checking sheet name:  P85285
No sheet found for the vehicle P85285
Checking sheet name:  P121285
No sheet found for the vehicle P121285
Checking sheet name:  P126285
No sheet found for the vehicle P126285
Checking sheet name:  P147285
No sheet found for the vehicle P147285
Checking file name:  Data_Audit_Report_Callan_Assoc_1_2024.xlsx
Checking sheet name:  P73285
Checking sheet name:  P74285
Checking sheet name:  P85285
Checking sheet name:  P121285
Checking sheet name

In [131]:

# Selecting the header names placed in row 7 (row in which we start to have some relevant information to gather):
excel_file_orig.rename(columns = excel_file_orig.iloc[7], inplace= True)
# Selecting the rows with data and resetting the index:  
excel_file = excel_file_orig[7:][1:].set_index(['Date'], drop=True)
# Checking data type of all columns in the file:
excel_file.info()
# Date column does not have the correct type, the others are mixed due to special characters being in them such as /
# We need information from 09/2022 onwards, so I'll be turning Date column into correct type and then filter by date:
excel_file.index = pd.to_datetime(excel_file.index)
# Selecting data in the dataframe by the correct date:
excel_file = excel_file[~(excel_file.index < '09/2022')]
# Setting up the correct format for the index/Date column
excel_file.index = excel_file.index.strftime("%m/%Y")
# Dropping rows and columns in which all the cells contain NaN values:
excel_file = excel_file.dropna(how='all', axis=0).dropna(how='all', axis=1)

<class 'pandas.core.frame.DataFrame'>
Index: 495 entries, 12/2023 to 06/1900
Data columns (total 10 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   Total Product Assets ($MM)             43 non-null     object
 1   Separate Account Assets ($MM)          43 non-null     object
 2   Commingled Account Assets ($MM)        41 non-null     object
 3   Mutual Fund - Institutional ($MM)      39 non-null     object
 4   Mutual Fund - Retail ($MM)             37 non-null     object
 5   Product Number of Accounts             43 non-null     object
 6   Accounts Gained                        40 non-null     object
 7   Assets Gained From New Accounts        40 non-null     object
 8   Accounts Lost                          41 non-null     object
 9   Assets Lost from Termintated Accounts  41 non-null     object
dtypes: object(10)
memory usage: 42.5+ KB


In [132]:
excel_file

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
09/2023,1841.7200,17.7600,14.9400,1809.0200,,6,0,0.0000,0,0.0000
06/2023,1943.6800,17.8000,15.8300,1910.0500,,6,1,10.7200,0,0.0000
03/2023,<NO APX> / 1828.68,<NO APX> / 1798.67,<NO APX> / 30.02,,,<NO APX> / 5,<NO APX> / 2,<NO APX> / 5.22,<NO APX> / 0,<NO APX> / 0
12/2022,<NO APX> / 1690.89,<NO APX> / 1667.55,<NO APX> / 23.33,,,<NO APX> / 3,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0
09/2022,<NO APX> / 1579.56,<NO APX> / 1557.44,<NO APX> / 22.12,<NO APX> / 0,<NO APX> / 0,<NO APX> / 3,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0


In [111]:
apx_file = excel_file.copy()
client_file = excel_file.copy()

for user in users: 
    if user == 'APX':
        # Creating a for loop to assign dummy variables to the Data Gap Auditor report:
        for n in range(0, apx_file.shape[1]):
        
            for m,p in enumerate(apx_file[apx_file.columns[(n)]]):
                try:
                    if float(p) >= 0 or float(p) <= 0:
                        apx_file[apx_file.columns[n]][m] = apx_file[apx_file.columns[n]][m].replace(p, '') # "Complete"

                except:
                 if p==(" / <NO DATA>"):
                        apx_file[apx_file.columns[n]][m] = apx_file[apx_file.columns[n]][m].replace(p, '1') # "Data not in the database" // APX needs to distribute this data

                 else:
                        apx_file[apx_file.columns[n]][m] = apx_file[apx_file.columns[n]][m].replace(p, '')  # If the cell does not contain any of this criteria above, then it's not relevant for our analysis/reviewal
        # Let's fill the NaN values for easier further processes:
        apx_file.fillna('', inplace=True)

    if users[1]== 'Client':
        # Creating a for loop to assign dummy variables to the Data Gap Auditor report:
        for n in range(0, client_file.shape[1]):
  
            for m,p in enumerate(client_file[client_file.columns[(n)]]):  
                try:
                    if float(p) >= 0 or float(p) <= 0:
                        client_file[client_file.columns[n]][m] = client_file[client_file.columns[n]][m].replace(p, '') # "Complete"
                except:                          
                    if (p == ("<NO APX> / ")):
                        client_file[client_file.columns[n]][m] = client_file[client_file.columns[n]][m].replace(p, '2') # "Data not in the Vault" // Client could want APX to distribute this data for them
                    if (p.isdigit(), " / ", p.isdigit()):
                        client_file[client_file.columns[n]][m] = client_file[client_file.columns[n]][m].replace(p, '3') # "Data not matching" // APX needs to review this data until it matches/is Complete     
                    else:
                        client_file[client_file.columns[n]][m] = client_file[client_file.columns[n]][m].replace(p, '')  # If the cell does not contain any of this criteria above, then it's not relevant for our analysis/reviewal
        # Let's fill the NaN values for easier further processes:
        client_file.fillna('', inplace=True)

In [141]:
for n in range(0, excel_file.shape[1]):
        for m,p in enumerate(excel_file[excel_file.columns[(n)]]):  
                   print(p)
                   if "<NO APX> / " in p:
                       print('a')
                       excel_file[excel_file.columns[n]][m] = excel_file[excel_file.columns[n]][m].replace(p, '3') # "Data not matching" // APX needs to review this data until it matches/is Complete   
                       print(p)  

1841.7200
1943.6800
<NO APX> / 1828.68
a
<NO APX> / 1828.68
<NO APX> / 1690.89
a
<NO APX> / 1690.89
<NO APX> / 1579.56
a
<NO APX> / 1579.56
17.7600
17.8000
<NO APX> / 1798.67
a
<NO APX> / 1798.67
<NO APX> / 1667.55
a
<NO APX> / 1667.55
<NO APX> / 1557.44
a
<NO APX> / 1557.44
14.9400
15.8300
<NO APX> / 30.02
a
<NO APX> / 30.02
<NO APX> / 23.33
a
<NO APX> / 23.33
<NO APX> / 22.12
a
<NO APX> / 22.12
1809.0200
1910.0500
nan


TypeError: argument of type 'float' is not iterable

In [129]:
excel_file

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
09/2023,1841.7200,17.7600,14.9400,1809.0200,,6,0,0.0000,0,3
06/2023,1943.6800,17.8000,15.8300,1910.0500,,6,1,10.7200,0,3
03/2023,<NO APX> / 1828.68,<NO APX> / 1798.67,<NO APX> / 30.02,,,<NO APX> / 5,<NO APX> / 2,<NO APX> / 5.22,<NO APX> / 0,3
12/2022,<NO APX> / 1690.89,<NO APX> / 1667.55,<NO APX> / 23.33,,,<NO APX> / 3,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0,3
09/2022,<NO APX> / 1579.56,<NO APX> / 1557.44,<NO APX> / 22.12,<NO APX> / 0,<NO APX> / 0,<NO APX> / 3,<NO APX> / 0,<NO APX> / 0,<NO APX> / 0,3


In [112]:
apx_file

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
09/2023,,,,,,,,,,
06/2023,,,,,,,,,,
03/2023,,,,,,,,,,
12/2022,,,,,,,,,,
09/2022,,,,,,,,,,


In [113]:
client_file

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
09/2023,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
06/2023,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
03/2023,,,,3.0,3.0,,,,,
12/2022,,,,3.0,3.0,,,,,
09/2022,,,,,,,,,,


In [103]:



    # Putting the dummy variables in a single column named 'Review':
    excel_file['Review'] = excel_file[excel_file.columns[0:]].apply(lambda x: ''.join(x.astype(str)), axis=1)

    if user == 'APX':                    
        # Creating a for loop to assign the correct description for each number stated above ^:
        for m,p in enumerate(excel_file['Review']):
                if (all('1' in k for k in p)):
                    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 1')  # Data not in the Vault // Client could want APX to distribute this data for them
                # Now we need to continue to put the other conditions:
                elif (any('1' in k for k in p)):
                    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 1')  #'Data not in the Vault')
                else:  
                    excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'No annotation')
        # Creating a list for each of the periods in the Review column:
        periods_1 = []
        # Gathering all the periods (month/year) for which each of these conditions stated above are present:
        for m,p in enumerate(zip(excel_file['Review'],excel_file.index)):                    
            if p[0] == 'Priority 1':
                periods_1.append(p[1])   
        # A description list is created to put in the final review without considering empty period lists:
        description = []
        if periods_1 := periods_1: description.append("● Priority 1: {}\n".format((list(set(periods_1)))).replace("'",'').replace('[','').replace(']',''))

    if user == 'Client':
        # Creating a for loop to assign the correct description for each number stated above ^:
        for m,p in enumerate(excel_file['Review']):
            if (all('2' in k for k in p)):
                excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 2')  # Data not in the database // APX needs to distribute this data
            elif (all('3' in k for k in p)):
                excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 3')  # Data matching
            elif (any('2' in k for k in p)):
                excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 2')    #'Data not in the database')
            elif (any('3' in k for k in p)):
              excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'Priority 3')    #'Data not matching') 
            else:
                excel_file['Review'][m] = excel_file['Review'][m].replace(p, 'No annotation')
        # Creating a list for each of the periods in the Review column:
        periods_2 = []       
        periods_3 = []
        # Gathering all the periods (month/year) for which each of these conditions stated above are present:
        for m,p in enumerate(zip(excel_file['Review'],excel_file.index)):                    
            if p[0] == 'Priority 2':
                periods_2.append(p[1])
            elif p[0] == 'Priority 3':
                periods_3.append(p[1])
        #0 "Complete"
        #1 "Data not in the database"
        #2 "Data not in the Vault"
        #3 "Data not matching"    
        # A description list is created to put in the final review without considering empty period lists:
        description = []
        if periods_2 := periods_2: description.append("● Priority 2: {}\n".format(list(set((periods_2)))).replace("'",'').replace('[','').replace(']',''))
        if periods_3 := periods_3: description.append("● Priority 3: {}\n".format((list(set(periods_3)))).replace("'",'').replace('[','').replace(']',''))  
    # Loading the first sheet "Table of Contents" to obtain information that can be input into the output dataframe:
    excel_file_content = pd.read_excel(file_path+'/'+file_names[i]) 
    # Building the dictionary to then transform it into a dataframe:
    dict = {'Database': excel_file_content.iloc[4][1],      # Database name e.g. "Wilshire"
            excel_file_orig.iloc[6][1]: description,        # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
            }  
    # Creating a new dataframe that will sum up the findings in the Data Auditor        
    output_df = pd.DataFrame([dict])
    # Putting each description in a single line (this may duplicate the database name):

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
09/2023,,,,,,,,,,,Priority 3
06/2023,,,,,,,,,,,Priority 3
03/2023,,,,,,,,,,,Priority 3
12/2022,,,,,,,,,,,Priority 3
09/2022,,,,,,,,,,,Priority 3


In [96]:
excel_file

Unnamed: 0_level_0,Total Product Assets ($MM),Separate Account Assets ($MM),Commingled Account Assets ($MM),Mutual Fund - Institutional ($MM),Mutual Fund - Retail ($MM),Product Number of Accounts,Accounts Gained,Assets Gained From New Accounts,Accounts Lost,Assets Lost from Termintated Accounts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
09/2023,No,No,No,No,No,No,No,No,No,No
06/2023,No,No,No,No,No,No,No,No,No,No
03/2023,3,3,3,No,No,3,3,3,3,3
12/2022,3,3,3,No,No,3,3,3,3,3
09/2022,3,3,3,3,3,3,3,3,3,3


In [101]:
output_df

Unnamed: 0,Database,U.S Small-Cap Core Composite (P147285)
0,Wilshire,"[● Priority 3: 12/2022, 06/2023, 03/2023, 09/2..."
