In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Setting file path. We'll be opening first the Performance folder:
absolute_path = "C:/Users/l.arguello/Downloads/Manulife_DataAuditor/Performance"

In [3]:
# Core Fixed Income	                    12776	P73285
# Core Plus Fixed Income	            12777	P74285 
# Global Quality Value	                12783	P85285
# Strategic Fixed Income	            12811	P121285
# Strategic Fixed Income Opportunities	12812	P126285
# US Small Cap Core	                    12823	P147285

sheet_names = ['P73285', 'P74285', 'P85285', 'P121285', 'P126285', 'P147285']

In [31]:
# Defining the Excel file to be openned and the sheet we need from the book:
excel_file = pd.read_excel(absolute_path + "/Data_Audit_Report_Morningstar_1_2024.xlsx", sheet_name=sheet_names[0])
# Printing file sample:
excel_file.head(3)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,LEGEND,,,
1,,,,Green Indicates matching data between APX and ...,,
2,,,,Yellow Indicates nonmatching data between APX ...,,"*For a data mismatch, information is formated ..."


In [32]:
# Selecting the header names placed in row 7:
excel_file.columns = excel_file[7:].iloc[0]
# Selecting the rows with data and reseting the index:
excel_file = excel_file[7:][1:].reset_index(drop=True)

In [33]:
# Checking data type of all columns in the file:
excel_file.info()
# Date column does not have the correct type, the others are mixed due to characters being in them such as /

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Date              336 non-null    object
 1   Gross Return (%)  335 non-null    object
 2   Net Return (%)    335 non-null    object
 3   Assets            324 non-null    object
 4   Accounts          285 non-null    object
 5   nan               0 non-null      object
dtypes: object(6)
memory usage: 15.9+ KB


In [34]:
excel_file.head()

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN
0,12/2023,,,,,
1,11/2023,4.87298,4.84676,4684.755,,
2,10/2023,-1.8967,-1.92122,4453.165,,
3,09/2023,-2.63552,-2.65986,4551.717,,
4,08/2023,-0.62979,-0.65464,4663.914 / 4026.569,,


In [35]:
# We need information from 09/2022 onwards, so I'll be turning Date column into correct type and then filter by date:
excel_file['Date'] = pd.to_datetime(excel_file['Date'], format = '%m/%Y')
# Selecting data in the dataframe by the correct date:
excel_file = excel_file[~(excel_file['Date'] < '09/2022')]

In [36]:
# Printing the dataframe with information since 09/2022:
excel_file

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN
0,2023-12-01,,,,,
1,2023-11-01,4.87298,4.84676,4684.755,,
2,2023-10-01,-1.89670,-1.92122,4453.165,,
3,2023-09-01,-2.63552,-2.65986,4551.717,,
4,2023-08-01,-0.62979,-0.65464,4663.914 / 4026.569,,
5,2023-07-01,0.02650 / 0.04447,0.00149 / 0.01946,4580.570 / 3938.822,,
6,2023-06-01,-0.21631 / -0.18826,-0.24126 / -0.21321,4037.725 / 3395.588,,
7,2023-05-01,-0.98764 / -0.97212,-1.01239 / -0.99688,3880.486 / 3236.112,,
8,2023-04-01,0.57062,0.54548,3778.702 / 3127.212,,
9,2023-03-01,2.05121 / 2.02233,2.02570 / 1.99683,3606.232 / 2958.551,,


In [37]:
for n in range(1, excel_file.shape[1]):

    for i,j in enumerate(excel_file[excel_file.columns[n]]):
        
        
        try:
            if float(j) >= 0 or float(j) <= 0:

                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '0') # "Complete"
        except:
            if "<NO APX> / " in j:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '1') # "Data not in the Vault"
            elif " / <NO DATA>" in j:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '2') # "Data not in the database"
            elif " / " in j:
                excel_file[excel_file.columns[n]][i] = excel_file[excel_file.columns[n]][i].replace(j, '3') # "Data not matching"

In [38]:
# Let's fill the NaN values for easier further processes:
excel_file.fillna('-1', inplace=True)

In [39]:
excel_file

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN
0,2023-12-01,-1,-1,-1,-1,-1
1,2023-11-01,0,0,0,-1,-1
2,2023-10-01,0,0,0,-1,-1
3,2023-09-01,0,0,0,-1,-1
4,2023-08-01,0,0,3,-1,-1
5,2023-07-01,3,3,3,-1,-1
6,2023-06-01,3,3,3,-1,-1
7,2023-05-01,3,3,3,-1,-1
8,2023-04-01,0,0,3,-1,-1
9,2023-03-01,3,3,3,-1,-1


In [40]:
excel_file['test'] = excel_file[excel_file.columns[1:]].apply(lambda x: ', '.join(x.astype(str)), axis=1)
# Load a sample of how it looks like at the moment:
excel_file.head()

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN,test
0,2023-12-01,-1,-1,-1,-1,-1,"-1, -1, -1, -1, -1"
1,2023-11-01,0,0,0,-1,-1,"0, 0, 0, -1, -1"
2,2023-10-01,0,0,0,-1,-1,"0, 0, 0, -1, -1"
3,2023-09-01,0,0,0,-1,-1,"0, 0, 0, -1, -1"
4,2023-08-01,0,0,3,-1,-1,"0, 0, 3, -1, -1"


In [41]:
for i,j in enumerate(excel_file['test']):


    if (('1' and '2' and '3') or ('0' and '1'and '2' and '3')) in j:
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault, not in the database, and not matching')

    elif (('1' and '2') or ('0' and '1' and '2')) in j:
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault and not in the database')

    elif (('1' and '3') or ('0' and '1' and '3')) in j:
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault and no matching')

    elif (('2' and '3') or ('0' and '2' and '3')) in j:
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the database and not matching')

    elif ('0' or ('0' and '-1')) in j:
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Complete') 

    elif ('0' and '3') in j:
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not matching')


In [46]:
for i,j in enumerate(excel_file['test']):
    
    if (('3' in j) and ('1' in j) and ('0' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault and not matching')
    
    if (('2' in j) and ('1' in j) and ('0' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault and not in the database')
    
    if (('3' in j) and ('2' in j) and ('1' in j) and ('0' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault, not matching and not in the database')
    
    if (('3' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not matching')

    if (('2' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the database')

    if (('1' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Data not in the Vault')

    if (('0' in j)):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'Complete')

    if (('-1' in j).all()):
        excel_file['test'][i] = excel_file['test'][i].replace(j, 'NV and NDB')

AttributeError: 'bool' object has no attribute 'all'

In [44]:
excel_file['test']

0                                 Data not in the Vault
1                                              Complete
2                                              Complete
3                                              Complete
4     Data not in the Vault, not in the database, an...
5     Data not in the Vault, not in the database, an...
6     Data not in the Vault, not in the database, an...
7     Data not in the Vault, not in the database, an...
8     Data not in the Vault, not in the database, an...
9     Data not in the Vault, not in the database, an...
10    Data not in the Vault, not in the database, an...
11    Data not in the Vault, not in the database, an...
12                                             Complete
13                                             Complete
14                                             Complete
15                                             Complete
Name: test, dtype: object

In [18]:
excel_file['Held?'] = excel_file.apply(
    lambda row: 'Yes' if (i=='0' and i=='-1') for row['test'] else 'No', axis='columns')
        #excel_file['test'][i]=excel_file['test'][i].replace(j, "complete")

SyntaxError: expected 'else' after 'if' expression (2487741947.py, line 2)

In [15]:
excel_file['test']

0     -1, -1, -1, -1, -1
1        0, 0, 0, -1, -1
2        0, 0, 0, -1, -1
3        0, 0, 0, -1, -1
4        0, 0, 3, -1, -1
5        3, 3, 3, -1, -1
6        3, 3, 3, -1, -1
7        3, 3, 3, -1, -1
8        0, 0, 3, -1, -1
9        3, 3, 3, -1, -1
10       3, 3, 3, -1, -1
11       3, 3, 3, -1, -1
12       0, 0, 0, -1, -1
13       0, 0, 0, -1, -1
14       0, 0, 0, -1, -1
15       0, 0, 0, -1, -1
Name: test, dtype: object

In [173]:
for i,j in enumerate(excel_file['test']):

    if (('3') and ('1')) in str(j):
        if ('-1') in str(j):
            excel_file['test'][i] = excel_file['test'][i].replace(str(j), 'Data is not matching, not in the database and complete')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel_file['test'][i] = excel_file['test'][i].replace(str(j), 'Data is not matching, not in the database and complete')


In [174]:
excel_file

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN,test
0,2023-12-01,-1,-1,-1,-1,-1,"Data is not matching, not in the database and ..."
1,2023-11-01,0,0,0,-1,-1,"Data is not matching, not in the database and ..."
2,2023-10-01,0,0,0,-1,-1,"Data is not matching, not in the database and ..."
3,2023-09-01,0,0,0,-1,-1,"Data is not matching, not in the database and ..."
4,2023-08-01,0,0,3,-1,-1,"Data is not matching, not in the database and ..."
5,2023-07-01,3,3,3,-1,-1,"Data is not matching, not in the database and ..."
6,2023-06-01,3,3,3,-1,-1,"Data is not matching, not in the database and ..."
7,2023-05-01,3,3,3,-1,-1,"Data is not matching, not in the database and ..."
8,2023-04-01,0,0,3,-1,-1,"Data is not matching, not in the database and ..."
9,2023-03-01,3,3,3,-1,-1,"Data is not matching, not in the database and ..."


In [83]:
excel_file['Held'] = excel_file.apply(
    lambda row: 'no vault' if ('3' or'0') in row['test'] else 'No', axis='columns')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  excel_file['Held'] = excel_file.apply(


In [84]:
excel_file['Held']

0           No
1           No
2           No
3           No
4     no vault
5     no vault
6     no vault
7     no vault
8     no vault
9     no vault
10    no vault
11    no vault
12          No
13          No
14          No
15          No
Name: Held, dtype: object

In [36]:
excel_file['value']

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
Name: value, dtype: float64

In [115]:
excel_file

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN,test
0,2023-12-01,-1,-1,-1,-1,-1,-1-1-1-1-1
1,2023-11-01,0,0,0,-1,-1,Complete
2,2023-10-01,0,0,0,-1,-1,Complete
3,2023-09-01,0,0,0,-1,-1,Complete
4,2023-08-01,0,0,3,-1,-1,"Data not in the Vault, not in the database, an..."
5,2023-07-01,3,3,3,-1,-1,"Data not in the Vault, not in the database, an..."
6,2023-06-01,3,3,3,-1,-1,"Data not in the Vault, not in the database, an..."
7,2023-05-01,3,3,3,-1,-1,"Data not in the Vault, not in the database, an..."
8,2023-04-01,0,0,3,-1,-1,"Data not in the Vault, not in the database, an..."
9,2023-03-01,3,3,3,-1,-1,"Data not in the Vault, not in the database, an..."


0    2023-12-01
1    2023-11-01
2    2023-10-01
3    2023-09-01
4    2023-08-01
5    2023-07-01
6    2023-06-01
7    2023-05-01
8    2023-04-01
9    2023-03-01
10   2023-02-01
11   2023-01-01
12   2022-12-01
13   2022-11-01
14   2022-10-01
15   2022-09-01
Name: Date, dtype: datetime64[ns]
0                     NaN
1                 4.87298
2                -1.89670
3                -2.63552
4                -0.62979
5       0.02650 / 0.04447
6     -0.21631 / -0.18826
7     -0.98764 / -0.97212
8                 0.57062
9       2.05121 / 2.02233
10     -2.19887 / -2.1758
11      3.62683 / 3.68414
12               -0.37934
13                3.71525
14               -1.62929
15               -4.46798
Name: Gross Return (%), dtype: object
0                     NaN
1                 4.84676
2                -1.92122
3                -2.65986
4                -0.65464
5       0.00149 / 0.01946
6     -0.24126 / -0.21321
7     -1.01239 / -0.99688
8                 0.54548
9       2.02570 / 1.99

In [59]:
excel_file["Assets"][1].int.replace(float(excel_file["Assets"][1]
                                      ), "Complete", regex=True)

AttributeError: 'str' object has no attribute 'int'

In [48]:
excel_file["Assets"]

0                     NaN
1                4684.755
2                4453.165
3                4551.717
4     4663.914 / 4026.569
5     4580.570 / 3938.822
6     4037.725 / 3395.588
7     3880.486 / 3236.112
8     3778.702 / 3127.212
9     3606.232 / 2958.551
10     3425.921 / 2791.82
11     3112.535 / 2463.61
12               2929.579
13               2932.680
14               2814.158
15               2880.654
Name: Assets, dtype: object

In [134]:
excel_file.index[:1:]

Int64Index([0], dtype='int64')

In [118]:
#for index in range(0,5):
#    for excel_file.loc[index]:

In [119]:
# Getting number of columns:
len(excel_file.columns)

6

In [120]:
a = excel_file[excel_file[excel_file.columns[3]].str.contains(" / ", na=False)]

In [121]:
a

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN
4,08/2023,-0.62979,-0.65464,4663.914 / 4026.569,,
5,07/2023,0.02650 / 0.04447,0.00149 / 0.01946,4580.570 / 3938.822,,
6,06/2023,-0.21631 / -0.18826,-0.24126 / -0.21321,4037.725 / 3395.588,,
7,05/2023,-0.98764 / -0.97212,-1.01239 / -0.99688,3880.486 / 3236.112,,
8,04/2023,0.57062,0.54548,3778.702 / 3127.212,,
9,03/2023,2.05121 / 2.02233,2.02570 / 1.99683,3606.232 / 2958.551,,
10,02/2023,-2.19887 / -2.1758,-2.22332 / -2.20025,3425.921 / 2791.82,,
11,01/2023,3.62683 / 3.68414,3.60092 / 3.65822,3112.535 / 2463.61,,
17,07/2022,2.47886,2.45324,3107.091 / 3107.177,,
25,11/2021,0.22889,0.20384,3869.332 / 2267.893,,


In [122]:
b = excel_file[excel_file[excel_file.columns[4]].str.contains(" / ", na=False)]

In [123]:
b

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN
24,12/2021,-0.19196,-0.21691,3853.344,<NO APX> / 4,
27,09/2021,-0.78686,-0.81166,3823.652,<NO APX> / 3,
30,06/2021,0.91921,0.89398,3704.711,<NO APX> / 3,
33,03/2021,-1.22772,-1.25241,3624.653,<NO APX> / 3,
36,12/2020,0.43161,0.40650,3447.252,<NO APX> / 3,
...,...,...,...,...,...,...
320,04/1997,1.39000,1.37000,28.878,<NO APX> / 1,
321,03/1997,-0.83000,-0.85000,28.822,<NO APX> / 1,
322,02/1997,0.19000,0.16000,29.885,<NO APX> / 1,
323,01/1997,0.61000,0.59000,29.957,<NO APX> / 1,


In [124]:
df = pd.merge(a, b, left_index=True, right_index=True)

In [140]:
for i in excel_file.index:
      if i.str.contains(" / ", na=False):
          print(excel_file.iloc[j])


AttributeError: 'int' object has no attribute 'str'

In [143]:
for (index, colname) in enumerate(excel_file):
    print(index, excel_file[colname])
    


0 0      12/2023
1      11/2023
2      10/2023
3      09/2023
4      08/2023
        ...   
331    05/1996
332    04/1996
333    03/1996
334    02/1996
335    01/1996
Name: Date, Length: 336, dtype: object
1 0           NaN
1       4.87298
2      -1.89670
3      -2.63552
4      -0.62979
         ...   
331    -0.18000
332    -0.40999
333    -0.80001
334    -1.18000
335     0.83000
Name: Gross Return (%), Length: 336, dtype: object
2 0           NaN
1       4.84676
2      -1.92122
3      -2.65986
4      -0.65464
         ...   
331    -0.21000
332    -0.43000
333    -0.83000
334    -1.20000
335     0.81000
Name: Net Return (%), Length: 336, dtype: object
3 0                      NaN
1                 4684.755
2                 4453.165
3                 4551.717
4      4663.914 / 4026.569
              ...         
331                    NaN
332                    NaN
333                    NaN
334                    NaN
335                    NaN
Name: Assets, Length: 336, dtype: objec

In [158]:
for i in excel_file.columns[1:]:
    for j in excel_file[i]:
        if type(j) == str:
            print('ifkdlgjf')

ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
ifkdlgjf
i

In [153]:
for i in excel_file.columns[1:]:
    for j in excel_file[i]:
        print(j)

nan
4.87298
-1.89670
-2.63552
-0.62979
0.02650 / 0.04447
-0.21631 / -0.18826
-0.98764 / -0.97212
0.57062
2.05121 / 2.02233
-2.19887 / -2.1758
3.62683 / 3.68414
-0.37934
3.71525
-1.62929
-4.46798
-2.44458
2.47886
-1.79780
0.10295
-3.69324
-2.91862
-1.23839
-1.98606
-0.19196
0.22889
-0.15595
-0.78686
-0.15399
1.15354
0.91921
0.31739
0.95538
-1.22772
-1.33499
-0.40070
0.43161
1.41243
-0.39773
0.07748
-0.50078
1.73340
1.36608
0.94656
2.40972
-2.36099
1.62094
2.16829
-0.06380
-0.09156
0.25687
-0.49614
2.64775
0.28933
1.19028
1.81343
0.05541
1.99056
0.12708
1.11826
1.60787
0.53892
-0.77699
-0.54041
0.63821
0.12107
0.04743
0.65101
-0.60771
0.56130
-0.82019
-0.98884
0.47029
-0.04099
0.09613
-0.39494
0.91964
0.40384
0.11379
0.77823
0.84732
-0.08190
0.87240
0.48770
0.17150
-2.17440
-0.56760
0.08270
-0.00460
0.93660
1.70640
0.09810
0.85660
1.15070
0.37700
0.96800
-0.49430
-0.08350
0.10390
0.58080
-0.27940
0.77470
-1.02730
-0.16440
-0.26240
0.56900
-0.82900
2.17870
0.06720
0.67960
0.77520
-0.45150

In [179]:
mask = np.column_stack([excel_file[col].str.contains(r" / ", na=False) for col in excel_file])
excel_file.loc[mask.any(axis=1)]

7,Date,Gross Return (%),Net Return (%),Assets,Accounts,NaN
4,08/2023,-0.62979,-0.65464,4663.914 / 4026.569,,
5,07/2023,0.02650 / 0.04447,0.00149 / 0.01946,4580.570 / 3938.822,,
6,06/2023,-0.21631 / -0.18826,-0.24126 / -0.21321,4037.725 / 3395.588,,
7,05/2023,-0.98764 / -0.97212,-1.01239 / -0.99688,3880.486 / 3236.112,,
8,04/2023,0.57062,0.54548,3778.702 / 3127.212,,
...,...,...,...,...,...,...
320,04/1997,1.39000,1.37000,28.878,<NO APX> / 1,
321,03/1997,-0.83000,-0.85000,28.822,<NO APX> / 1,
322,02/1997,0.19000,0.16000,29.885,<NO APX> / 1,
323,01/1997,0.61000,0.59000,29.957,<NO APX> / 1,
