# Extract data from all files in the test set



In [1]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import os
import json
import pickle
import pdfplumber


from wealthnet_helpers import *

In [8]:
# Header cells keyed by table id. Passed to `extract_all_tables`; presumably
# used to recognise tables that carry no title row (e.g. a table continued
# from the previous page) -- TODO confirm against wealthnet_helpers.
no_title_tables = dict(
    career_history=[
        'Company', 'Type', 'Position Held', 'YearStarted', 'YearEnded',
        'Remarks',
    ],
    philanthropy=['giving_profile'],
    political_interests=['Political Profile'],
    known_associates=['Name', 'Company', 'Relationships', 'Remarks'],
    service_providers=['Name', 'Company', 'Position', 'Remarks'],
    family_details=['Relation', 'Name', 'Age', 'Remarks'],
)

# One row per titled table: its display title, its id, and the two integer
# layout parameters (`data_start`, `column_names`) handed to
# `extract_all_tables`. NOTE(review): these look like row offsets of the first
# data row and of the header row inside the raw table -- confirm in helpers.
table2df_params = pd.DataFrame(
    [
        ('Family Details', 'family_details', 3, 2),
        ('Biography', 'biography', 1, 0),
        ('Career History', 'career_history', 2, 1),
        ('Wealth Analysis', 'wealth_analysis', 2, 1),
        ('Clubs & Boards', 'clubs_and_boards', 2, 1),
        ('Known Associates', 'known_associates', 2, 1),
        ('Service Providers', 'service_providers', 2, 1),
        ('Philanthropy', 'philanthropy', 1, 0),
    ],
    columns=['table_name', 'table_id', 'data_start', 'column_names'],
)

In [9]:
def process_city_files(city_path, new_folder_path):
    """Parse every dossier PDF in one city folder and pickle the extracted tables.

    For each entry in ``city_path`` the file name is checked against the
    ``Wealth-X <name> Dossier[ (n)].pdf`` pattern, the PDF is opened with
    pdfplumber, its tables and summary are extracted and cleaned with the
    notebook's helper functions, and the resulting dictionary is pickled to
    ``<new_folder_path>/<city name>/<file stem>.pkl``.

    A failure on one file never aborts the run: the exception is recorded in
    the returned status table and processing continues with the next file.

    Args:
        city_path: Directory holding the dossier PDFs of a single city. A
            trailing path separator is tolerated.
        new_folder_path: Root output directory; a sub-folder named after the
            city is created inside it if missing.

    Returns:
        pandas.DataFrame with one row per directory entry and the columns
        ``file_name``, ``file_location`` and ``status`` (``'success'`` or
        ``'failed'``), plus ``error`` (the exception message) when at least
        one file failed. An empty directory yields an empty DataFrame.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return '' and send the output to new_folder_path itself.
    city_name = os.path.basename(os.path.normpath(city_path))
    city_output_folder = os.path.join(new_folder_path, city_name)
    os.makedirs(city_output_folder, exist_ok=True)

    dossier_name_pattern = re.compile(
        r'Wealth-X (.+?)(?:\sDossier(?:\s\(\d+\))?\.pdf)'
    )

    # One status record per directory entry
    results = []

    for file_path in os.listdir(city_path):
        # Computed outside the try block so the failure record below can
        # always reference them.
        pdf_file_path = os.path.join(city_path, file_path)
        pickle_file_name = os.path.splitext(file_path)[0] + '.pkl'
        pickle_file_path = os.path.join(city_output_folder, pickle_file_name)
        print(file_path)

        try:
            # Files that do not follow the dossier naming scheme are reported
            # as failures, with an explicit message instead of an
            # AttributeError raised on a missing regex match.
            if dossier_name_pattern.search(file_path) is None:
                raise ValueError(
                    f"File name does not match the expected "
                    f"'Wealth-X <name> Dossier.pdf' pattern: {file_path}"
                )

            # The context manager closes the PDF even when extraction raises.
            with pdfplumber.open(pdf_file_path) as pdf:
                tables_dict = extract_all_tables(pdf, table2df_params, no_title_tables)
                tables_dict['summary'] = extract_summary_data(pdf)
                tables_dict = clean_uhnw_dict_elements(tables_dict)
                tables_dict = combine_dataframes_with_suffix(tables_dict)

                with open(pickle_file_path, 'wb') as file:
                    pickle.dump(tables_dict, file)

        except Exception as e:
            # Deliberately broad: any per-file problem is logged in the
            # status table rather than stopping the whole batch.
            results.append({
                'file_name': file_path,
                'file_location': pdf_file_path,
                'status': 'failed',
                'error': str(e)  # Capture the error message
            })
        else:
            results.append({
                'file_name': file_path,
                'file_location': pdf_file_path,
                'status': 'success'
            })

    return pd.DataFrame(results)


In [4]:
# Walk through the extraction pipeline for a single dossier, with verbose
# output, so each stage can be inspected interactively.
city_path = './data/cleaned_cities/London'
file_path = 'Wealth-X Tatparanandam Ananda Krishnan Dossier.pdf'

pdf = pdfplumber.open(f"{city_path}/{file_path}")

# Pull the person's name out of the dossier file name
name_match = re.search(r'Wealth-X (.+?)(?:\sDossier(?:\s\(\d+\))?\.pdf)', file_path)
name = name_match.group(1)
print(name)

# Same sequence of helper calls as process_city_files, minus the pickling
tables_dict = extract_all_tables(pdf, table2df_params, no_title_tables, verbose=True)
tables_dict['summary'] = extract_summary_data(pdf)
tables_dict = combine_dataframes_with_suffix(
    clean_uhnw_dict_elements(tables_dict),
    verbose=True,
)


Tatparanandam Ananda Krishnan
page number 1 number of tables: 0
page number 2 number of tables: 1
Career History
career_history_2
page number 3 number of tables: 1
Company
career_history_3
page number 4 number of tables: 7
Company
career_history_4
Wealth Analysis
wealth_analysis_2
List of Assets




page number 5 number of tables: 1
Interesting Facts
page number 6 number of tables: 3
Significant Litigation
Interests, Passions, Hobbies
Clubs & Boards
clubs_and_boards_2
page number 7 number of tables: 4
Philanthropy
philanthropy_2
Education History
Political Interests
Known Associates
known_associates_2
page number 8 number of tables: 1
Name
known_associates_3
page number 9 number of tables: 4
Name
known_associates_4
Service Providers
service_providers_2
Family Details
family_details_2
Contact Information
defaultdict(<class 'list'>, {'career_history': ['career_history_2', 'career_history_3', 'career_history_4'], 'wealth_analysis': ['wealth_analysis_2'], 'clubs_and_boards': ['clubs_and_bo

In [5]:
# Display which keys the extraction dictionary ended up with
tables_dict.keys()

dict_keys(['career_history', 'clubs_and_boards', 'philanthropy', 'known_associates', 'service_providers', 'family_details', 'summary'])

In [15]:
# `copy` is not among the notebook's top-level imports, so import it here
# rather than relying on it leaking in through a wildcard import.
import copy

# Work on a deep copy so the experiment below leaves `tables_dict` untouched.
data_dict = copy.deepcopy(tables_dict)

# Table ids to clean; a key qualifies if it is exactly one of these ids or
# the id followed by a numeric suffix (e.g. 'known_associates_2').
elements_to_clean = ['known_associates', 'family_details']
elements_pattern = (
    r"(?:" + "|".join(re.escape(elem) for elem in elements_to_clean) + r")(?:_\d+)?"
)

for element in data_dict:
    print(element)
    # fullmatch: a key that merely starts with one of the ids must not be
    # cleaned (re.match would accept it, since it only anchors the start).
    if re.fullmatch(elements_pattern, element):
        # Only existing keys are reassigned, so the dict size is unchanged
        # and mutating during iteration is safe.
        data_dict[element] = clean_unhw_dataframe(data_dict[element])


biograhy_2
career_history_2
wealth_analysis_2
known_associates_2
family_details_2


KeyError: 'Name'

In [12]:
tables_dict

{'biograhy_2':                                           Biography
 0  Honorary Consul-General of Singapore in Nigeria.,
 'career_history_2':                                    Company        Type Position Held  \
 0  Government of the Republic of Singapore  Government        Consul   
 
   YearStarted YearEnded                                            Remarks  
 0                        Government of the Republic of Singapore; serve...  ,
 'wealth_analysis_2': Empty DataFrame
 Columns: [Estimated Net Worth, Likely UHNW, , Liquid Estimate, ]
 Index: [],
 'known_associates_2':                       Name                                Company  \
 0        pooja suri khetan                               Game On!   
 1  neal manilal chandaria                          Comcraft Group   
 3     oladele fajemirokun                    Henry Stephens Group   
 5           gaurav dalmia   Landmark Property Development Company   
 
                  Relationships            uhnw  
 0            

['Lagos',
 'Johannesburg',
 'Hong Kong',
 'Rio de Janeiro',
 'San Francisco Bay Area',
 'Mexico City',
 'Frankfurt',
 'Sydney']

In [17]:
base_path = './data/cleaned_cities'
new_folder_path = './data/cities_json'

# Explicit subset of city folders to process in this run; swap in
# os.listdir(base_path) to process every folder under base_path.
cities_to_process = [
    'Lagos',
    'Johannesburg',
    'Hong Kong',
    'Rio de Janeiro',
    'San Francisco Bay Area',
    'Mexico City',
    'Frankfurt',
    'Sydney',
]

# Collect one status table per city instead of overwriting a single variable,
# so the final `results` covers every processed city, not just the last one.
city_results = []
for city in cities_to_process:
    print(city)
    city_path = os.path.join(base_path, city)
    # Silently skip names that are not directories under base_path
    if os.path.isdir(city_path):
        city_results.append(process_city_files(city_path, new_folder_path))

# `results` is always defined, even when no city folder was found
# (pd.concat raises on an empty list, hence the fallback).
results = (
    pd.concat(city_results, ignore_index=True)
    if city_results
    else pd.DataFrame()
)

Lagos
L114 Wealth-X Ernest Adegunle Oladeinde SHONEKAN Dossier.pdf
L239 Wealth-X John Warrimeme ABEBE Dossier.pdf
L054 Wealth-X Omotayo Bamidele ALAKIJA Dossier.pdf
L234 Wealth-X Offiong Ekanem EJINDU Dossier.pdf
L237 Wealth-X Ukandi Godwin DAMACHI Dossier.pdf
L079 Wealth-X Oladipo Adedeji JADESIMI Dossier.pdf
L231 Wealth-X Achuzie  EZENAGU Dossier.pdf
L010 Wealth-X Folorunsho  ALAKIJA Dossier.pdf
L032 Wealth-X Ayoola Obafunke OTUDEKO Dossier.pdf
L270 Wealth-X Adodapo  ABIODUN Dossier.pdf
L211 Wealth-X Ehimare Eromosele IDIAHI Dossier.pdf
L001 Wealth-X Aliko Alhaji Mohammed DANGOTE Dossier.pdf
L141 Wealth-X AVM  BELLO Dossier.pdf
L182 Wealth-X Richard Lee KRAMER Dossier.pdf
L179 Wealth-X Gabriel Osawaru IGBINEDION Dossier.pdf
L103 Wealth-X Augustine Ojunekwu AVURU Dossier.pdf
L068 Wealth-X Kessington Adebukunola ADEBUTU Dossier.pdf
L230 Wealth-X Hanish Dhanji CHANDARIA Dossier.pdf
L212 Wealth-X Richard John Tokunbo AKERELE Dossier.pdf
L074 Wealth-X Abubakar Bukola SARAKI Dossier.pdf
L1

In [26]:
# Save the `results` status table to a CSV file under ./data
results.to_csv('./data/parsing_results.csv')