In [1]:
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas deprecation or
# chained-assignment warnings the later cells may raise — consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Folder that contains the audit workbook; the Performance folder is opened first.
# Kept as a plain string because later cells append the file name with `+`.
# NOTE(review): absolute, user-specific location — presumably needs editing on any other machine.
absolute_path = "C:/Users/l.arguello/Downloads/Manulife_DataAuditor/Performance"

In [3]:
# Reference table: product name, number, sheet name (last column feeds `sheet_names`):
# Core Fixed Income	                    12776	P73285
# Core Plus Fixed Income	            12777	P74285
# Global Quality Value	                12783	P85285
# Strategic Fixed Income	            12811	P121285
# Strategic Fixed Income Opportunities	12812	P126285
# US Small Cap Core	                    12823	P147285

# Sheet names in the same order as the table above:
sheet_names = ['P73285', 'P74285', 'P85285', 'P121285', 'P126285', 'P147285']

In [4]:
# Build the workbook location, then load only the sheet we need from the book:
audit_workbook = absolute_path + "/Data_Audit_Report_Wilshire_1_2024.xlsx"
first_sheet = sheet_names[0]
excel_file_orig = pd.read_excel(audit_workbook, sheet_name=first_sheet)
# Show a small sample of the raw sheet:
excel_file_orig.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,LEGEND,,,
1,,,,Green Indicates matching data between APX and ...,,
2,,,,Yellow Indicates nonmatching data between APX ...,,"*For a data mismatch, information is formated ..."


In [5]:
# Row 7 of the raw sheet carries the header names; use it to relabel the columns:
header_row = excel_file_orig.iloc[7]
excel_file_orig.rename(columns=header_row, inplace=True)
# Everything below the header row is data; make the 'Date' column the index:
excel_file = excel_file_orig.iloc[8:].set_index('Date', drop=True)

In [6]:
# Checking the data type of every column in the file:
excel_file.info()
# The Date index is not a datetime type yet; the value columns are `object` because some cells contain characters such as "/".

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 12/2023 to 01/2014
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Monthly Net    119 non-null    object 
 1   Monthly Gross  119 non-null    object 
 2   nan            0 non-null      object 
 3   nan            0 non-null      float64
 4   nan            0 non-null      object 
dtypes: float64(1), object(4)
memory usage: 5.6+ KB


In [7]:
# We need information from 09/2022 onwards. The index holds "MM/YYYY" text, so parse it
# with an explicit format instead of letting pandas guess how to read each value:
excel_file.index = pd.to_datetime(excel_file.index, format="%m/%Y")
# Keep every row that is not strictly older than September 2022. The cutoff is a real
# Timestamp, so the comparison no longer depends on how pandas parses the string '09/2022':
first_period = pd.Timestamp(year=2022, month=9, day=1)
excel_file = excel_file[~(excel_file.index < first_period)]

In [8]:
# Formatting the index/Date column back to "MM/YYYY" text (strftime yields strings, not datetimes):
excel_file.index = excel_file.index.strftime("%m/%Y")

In [9]:
# Displaying the dataframe with the information from 09/2022 onwards:
excel_file

Unnamed: 0_level_0,Monthly Net,Monthly Gross,NaN,NaN,NaN
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12/2023,,,,,
11/2023,4.85,4.87,,,
10/2023,-1.92,-1.90,,,
09/2023,-2.66,-2.64,,,
08/2023,-0.65,-0.63,,,
07/2023,0.00 / 0.02,0.03 / 0.04,,,
06/2023,-0.24 / -0.21,-0.22 / -0.19,,,
05/2023,-1.01 / -1,-0.99 / -0.97,,,
04/2023,0.55 / 0.54,0.57,,,
03/2023,2.03 / 2,2.05 / 2.02,,,


In [10]:
# Discard the rows that are entirely NaN first, then the columns that are entirely NaN:
without_empty_rows = excel_file.dropna(axis='index', how='all')
excel_file = without_empty_rows.dropna(axis='columns', how='all')

In [11]:
# Replacing every cell of the Data Auditor report with a status code.
def _audit_code(cell):
    """Return the status code for a single report cell.

    '0' -> "Complete"                 (the cell is a plain number)
    '1' -> "Data not in the Vault"    (the cell contains "<NO APX> / ")
    '2' -> "Data not in the database" (the cell contains " / <NO DATA>")
    '3' -> "Data not matching"        (any other cell containing " / ")

    Anything else (NaN, unrecognised text or objects) is returned unchanged.
    """
    try:
        number = float(cell)
    except (TypeError, ValueError):
        # Not numeric: fall through to the text patterns below.
        number = None

    if number is not None:
        # NaN fails both comparisons, so missing cells are left untouched.
        return '0' if (number >= 0 or number <= 0) else cell

    if not isinstance(cell, str):
        return cell
    if "<NO APX> / " in cell:
        return '1'
    if " / <NO DATA>" in cell:
        return '2'
    if " / " in cell:
        return '3'
    return cell


# Build the coded columns in one pass. The previous `excel_file[col][i] = ...` form wrote
# through a chained, positional lookup, which pandas does not guarantee to reach the frame.
excel_file = excel_file.apply(lambda column: column.map(_audit_code))

In [12]:
# Swap the remaining NaN cells for empty strings so the later steps only deal with text:
excel_file = excel_file.fillna('')

In [13]:
# Inspecting the frame after the cells were coded and the remaining NaN values filled:
excel_file

Unnamed: 0_level_0,Monthly Net,Monthly Gross
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
11/2023,0,0
10/2023,0,0
09/2023,0,0
08/2023,0,0
07/2023,3,3
06/2023,3,3
05/2023,3,3
04/2023,3,0
03/2023,3,3
02/2023,3,3


In [14]:
# Concatenate the status codes of every column into a single string per row:
codes_as_text = excel_file.astype(str)
excel_file['Review'] = codes_as_text.apply(''.join, axis=1)
# Sample of the frame with the new column:
excel_file.head()

Unnamed: 0_level_0,Monthly Net,Monthly Gross,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11/2023,0,0,0
10/2023,0,0,0
09/2023,0,0,0
08/2023,0,0,0
07/2023,3,3,33


In [15]:
# Assigning the description to each period whose columns all share the same status code.
_UNIFORM_LABELS = {
    '0': 'Complete',
    '1': 'Data not in the Vault',
    '2': 'Data not in the database',
    '3': 'Data not matching',
}


def _label_uniform_codes(codes):
    """Return the description when every character of `codes` is the same status code.

    Codes are tried in the order 0, 1, 2, 3; a string mixing several codes is
    returned unchanged so that it can be handled by a later step.
    """
    for code, label in _UNIFORM_LABELS.items():
        if all(character == code for character in codes):
            return label
    return codes


# Series.map builds the relabelled column in one go. The previous
# `excel_file['Review'][i] = ...` form relied on chained, positional assignment,
# which pandas does not guarantee to write back into the frame.
excel_file['Review'] = excel_file['Review'].map(_label_uniform_codes)

In [16]:
# Review column after labelling the rows whose codes are all identical:
excel_file['Review']

Date
11/2023             Complete
10/2023             Complete
09/2023             Complete
08/2023             Complete
07/2023    Data not matching
06/2023    Data not matching
05/2023    Data not matching
04/2023                   30
03/2023    Data not matching
02/2023    Data not matching
01/2023    Data not matching
12/2022             Complete
11/2022                   20
10/2022                   21
09/2022                   20
Name: Review, dtype: object

In [17]:
# Now the rows that mix several status codes get their description.
def _label_mixed_codes(review):
    """Return the description for a Review value that combines different status codes.

    Values matching none of the combinations below (including the descriptions
    already assigned) are returned unchanged.
    """
    def has(code):
        return code in review

    # The three-way combination is tested first; placed after the two-way checks
    # it could never be reached, because any of them would already have matched.
    if has('3') and has('2') and has('1'):
        return 'Data not in the Vault, not matching and not in the database'
    if has('1') and has('0'):
        # Code '1' stands for "Data not in the Vault"; this branch previously
        # returned the label of code '3' ("Data not matching").
        return 'Data not in the Vault'
    if has('2') and has('0'):
        return 'Data not in the database'
    if has('3') and has('0'):
        return 'Data not matching'
    if has('3') and has('1'):
        return 'Data not in the Vault and not matching'
    if has('2') and has('1'):
        return 'Data not in the Vault and not in the database'
    # NOTE(review): a row combining only '3' and '2' has no description and keeps its raw codes.
    return review


# Series.map replaces the chained `excel_file['Review'][i] = ...` assignment,
# which pandas does not guarantee to write back into the frame.
excel_file['Review'] = excel_file['Review'].map(_label_mixed_codes)
    

In [18]:
# Frame after the rows with mixed codes were labelled as well:
excel_file

Unnamed: 0_level_0,Monthly Net,Monthly Gross,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11/2023,0,0,Complete
10/2023,0,0,Complete
09/2023,0,0,Complete
08/2023,0,0,Complete
07/2023,3,3,Data not matching
06/2023,3,3,Data not matching
05/2023,3,3,Data not matching
04/2023,3,0,Data not matching
03/2023,3,3,Data not matching
02/2023,3,3,Data not matching


In [19]:
# One list of periods per basic description found in the Review column:
#   periods_0 -> "Complete"
#   periods_1 -> "Data not in the Vault"
#   periods_2 -> "Data not in the database"
#   periods_3 -> "Data not matching"
periods_0, periods_1, periods_2, periods_3 = [], [], [], []

_bucket_by_label = {
    'Complete': periods_0,
    'Data not in the Vault': periods_1,
    'Data not in the database': periods_2,
    'Data not matching': periods_3,
}

# Walk the rows in order; rows carrying any other description are not collected.
for review_label, period in zip(excel_file['Review'], excel_file.index):
    bucket = _bucket_by_label.get(review_label)
    if bucket is not None:
        bucket.append(period)

In [20]:
# Sorting the lists in ascending chronological order.
def _chronological(period):
    """Sort key for an "MM/YYYY" period: year first, then month.

    A plain string sort compares the month digits first, so "01/2023" would be
    placed before "12/2022".
    """
    month, year = period.split('/')
    return int(year), int(month)


for periods in (periods_0, periods_1, periods_2, periods_3):
    periods.sort(key=_chronological)

In [21]:
# The frame once more before building the output (sorting the period lists does not change it):
excel_file

Unnamed: 0_level_0,Monthly Net,Monthly Gross,Review
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11/2023,0,0,Complete
10/2023,0,0,Complete
09/2023,0,0,Complete
08/2023,0,0,Complete
07/2023,3,3,Data not matching
06/2023,3,3,Data not matching
05/2023,3,3,Data not matching
04/2023,3,0,Data not matching
03/2023,3,3,Data not matching
02/2023,3,3,Data not matching


#### **✨Output DataFrame:**

I'll now create a dictionary that compiles the findings and turn it into the output dataframe, which lists each description together with the periods it applies to so that they can be inspected.

In [22]:
# Read the workbook's first sheet ("Table of Contents"); its cells supply information for the output dataframe:
contents_workbook = absolute_path + "/Data_Audit_Report_Wilshire_1_2024.xlsx"
excel_file_content = pd.read_excel(contents_workbook, sheet_name=0)

In [23]:
# One description line per non-empty period list; empty lists are left out of the final review:
description = []

_findings = (
    ("✔ Data not in the Vault for the periods: {}", periods_1),
    ("✔ Data not in the database for the periods: {}", periods_2),
    ("✔ Data not matching for the periods: {}", periods_3),
)

for template, periods in _findings:
    if periods:
        # Comma-separated periods, without the quotes and brackets of the list repr.
        description.append(template.format(", ".join(map(str, periods))))

In [24]:
# Displaying the final description list:
description

['✔ Data not in the database for the periods: 09/2022, 11/2022',
 '✔ Data not matching for the periods: 01/2023, 02/2023, 03/2023, 04/2023, 05/2023, 06/2023, 07/2023']

In [25]:
# Building the dictionary to then transform it into a dataframe.
# Both cells are addressed purely by position with `.iloc[row, column]`; the previous
# `.iloc[row][1]` form depended on a Series treating the integer 1 as a position.
# NOTE(review): the name `dict` shadows the builtin; it is kept because the next cell reads it.
dict = {
    'Database': excel_file_content.iloc[4, 1],   # Database name e.g. "Wilshire"
    excel_file_orig.iloc[6, 1]: description,     # Product/vehicle name with description of findings e.g. "Core Fixed Income Composite (P73285)"
}


In [26]:
# Creating a new one-row dataframe that sums up the findings in the Data Auditor.
# `dict` here is the dictionary built in the previous cell, not the builtin type:
output_df = pd.DataFrame([dict])

In [27]:
# Putting each description on its own row (this repeats the database name on every row).
# The product column is looked up by position with `.iloc[row, column]`:
product_column = excel_file_orig.iloc[6, 1]
output_df = output_df.explode(product_column)

# Removing the column-width limit so the whole line can be read.
# `None` is the supported "no limit" value; -1 is deprecated and rejected by recent pandas versions:
pd.set_option('display.max_colwidth', None)

# Sample of the final review:
output_df

Unnamed: 0,Database,Core Fixed Income Composite (P73285)
0,Wilshire,"✔ Data not in the database for the periods: 09/2022, 11/2022"
0,Wilshire,"✔ Data not matching for the periods: 01/2023, 02/2023, 03/2023, 04/2023, 05/2023, 06/2023, 07/2023"
