In [12]:
import openai
import config  # Import your config.py file
import pandas as pd
import numpy as np
import pickle
import os
import ast
import seaborn as sns
import matplotlib.pyplot as plt
import tiktoken
import time
import re
# Set up the OpenAI API key from the config.py file
openai.api_key = config.api_key 

In [2]:
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai.Embedding.create(input = [text], model=model)['data'][0]['embedding']

def load_pickle_file(file_path):
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

def get_model_response(prompt, engine="gpt-3.5-turbo"):
    
    messages = [
        {"role": "system", "content": "This is the system message"},
        {"role": "user", "content":prompt}
    ]

    response = openai.ChatCompletion.create(
        model=engine,
        messages=messages,
        max_tokens=2000,
        temperature = 0.2,
        top_p = 0.9,
    )

    return response['choices'][0]['message']['content'].strip()

def identify_gics_classes(df, class_list):

    temp_df = df.copy()
    temp_df['classes'] = ""
    for i in range(0, temp_df.shape[0]):

        biography = temp_df.at[i,'Biography']

        prompt = f"""

                    Read the list of Global Industry Classification Standard  (GICS) industry classes shown surrounded by 4 colon's below
                    ::::
                    {class_list}
                    ::::

                    Now read the the biography below surrounded by 3 colons, return a python list of GICS classes that most appropriately match the text
                    :::
                    {biography}
                    :::
                    the return string should be in the form shown below and should contain at least 1 entry
                    [entry_1, entry_2,.. entry_n]
                    """

        temp_df.loc[i, 'classes'] = get_model_response(prompt, engine="gpt-3.5-turbo")

    return temp_df
##
## has timer to prevent exceeding limit
##
def identify_gics_classes(df, class_list, rate_limit=60000):
    temp_df = df.copy()
    temp_df['classes'] = ""
    tokens_accumulated = 0
    start_time = time.time()

    for i in range(0, temp_df.shape[0]):

        biography = temp_df.at[i,'Biography']

        prompt = f"""
                    Read the list of Global Industry Classification Standard  (GICS) industry classes shown surrounded by 4 colon's below
                    ::::
                    {class_list}
                    ::::

                    Now read the the biography below surrounded by 3 colons, return a python list of GICS classes that most appropriately match the text
                    :::
                    {biography}
                    :::
                    the return string should be in the form shown below and should contain at least 1 entry
                    [entry_1, entry_2,.. entry_n]
                    """

        classes, tokens = get_model_response(prompt, engine="gpt-3.5-turbo")
        temp_df.loc[i, 'classes'] = classes
        tokens_accumulated += tokens

        elapsed_time = time.time() - start_time

        if tokens_accumulated / elapsed_time > rate_limit/60:  # tokens per second
            sleep_time = tokens_accumulated/rate_limit - elapsed_time
            time.sleep(sleep_time)  # sleep if rate limit is reached

    return temp_df

    
path_folder = './data/cities_json/London'

In [3]:
all_files = os.listdir(path_folder)

In [4]:
data_root = './data/cities_json'
bios_path = './data/all_bios_embeddings.csv'
missing_bios_path = './data/missing_bios.csv'
if os.path.exists(bios_path):
    # Load the existing CSV if it exists
    all_bios_df = pd.read_csv(bios_path)
    #this conversion takes about 10 secs. a better storage method may be preferable aka, index = file name columns are numeric embeddings
    all_bios_df['embedding'] = all_bios_df['embedding'].apply(ast.literal_eval)
    missing_bio_df = pd.read_csv(missing_bios_path)
else:
    # Run the code to generate DataFrames a and b
    all_bios_df = []
    for path in os.listdir(data_root ):

        all_files = os.listdir(data_root +'/' + path)

        for file in all_files:

            temp_dict = load_pickle_file(data_root +'/' + path +'/'+file)

            if 'biography' not in temp_dict:
                temp_dict['biography'] = pd.DataFrame({'Biography':[""]})
            
            temp_df = temp_dict['biography']
            temp_df['city'] = path
            temp_df['file'] = file
            temp_df['file'] = temp_df['file'].replace('.pkl', "")

            all_bios_df.append(temp_df)


    all_bios_df = pd.concat(all_bios_df, ignore_index=True)
    all_bios_df = all_bios_df[['file', 'city','Biography']]

    all_bios_df['no_bio']=(all_bios_df['Biography'].apply(len)<50)
    missing_bio_df = all_bios_df.loc[all_bios_df['no_bio'] ].copy().drop('no_bio', axis = 1)

    all_bios_df = all_bios_df.loc[~all_bios_df['no_bio'] ].drop('no_bio', axis = 1)

    #this takes about 12 minutes and costs money, only run when necessary!!!
    all_bios_df['embedding'] = all_bios_df['Biography'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))

    missing_bio_df.to_csv(missing_bios_path, index = False)
    all_bios_df.to_csv(bios_path, index = False)


In [97]:
all_bios_df.groupby(['city']).size()

city
Frankfurt                 103
Hong Kong                 384
Johannesburg              124
Lagos                     246
London                    394
Mexico City               104
New York                  389
Rio de Janeiro             52
San Francisco Bay Area    390
Sydney                    316
dtype: int64

In [5]:
gics_embeddings_path = './data/gics_embeddings.csv'
if os.path.exists(gics_embeddings_path):
    # Load the existing CSV if it exists
    gics_embeddings_df = pd.read_csv(gics_embeddings_path)
    gics_embeddings_df = gics_embeddings_df['embedding'].apply(ast.literal_eval)

else:
    gics_embeddings_df = pd.read_csv('./data/gics-map-2018.csv')
    gics_embeddings_df['embedding_desc'] = gics_embeddings_df['SubIndustryDescription'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    gics_embeddings_df['embedding'] = gics_embeddings_df['SubIndustry'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))
    gics_embeddings_df['token_count'] = gics_embeddings_df['SubIndustryDescription'].apply(lambda x: len(encoding.encode(x)))
    gics_embeddings_df.to_csv(gics_embeddings_path, index=False)



In [6]:

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")



# Perform auto classification of the gics classes

In [11]:
#prevents the calls to gpt being re-run unnecessarily
if 'classes' not in all_bios_df.columns:
    all_bios_df = identify_gics_classes(all_bios_df, str(gics_embeddings_df['Industry'].unique().tolist()))
    all_bios_df.to_csv(bios_path, index = False)


RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-PXtP9xMXWP8KqzH24Bz03Cgi on tokens per min. Limit: 90000 / min. Current: 87745 / min. Contact us through our help center at help.openai.com if you continue to have issues.

# Create a binary multi-label dataframe

In [12]:
binary_data = {}

# Iterate through each class in the original list and construct the binary data
for class_name in gics_embeddings_df['Industry'].unique().tolist():
    binary_data[class_name] = all_bios_df['classes'].apply(lambda x: 1 if class_name in x else 0)

binary_df = pd.DataFrame(binary_data)


KeyError: 'classes'

# Classify ethnicities

In [66]:
data_root = './data/cities_json'
names_path = './data/names_ethnicity.csv'
pattern = r"Wealth-X(.*?)Dossier"

if os.path.exists(names_path):
    # Load the existing CSV if it exists
    names_ethnicity_df = pd.read_csv(names_path)

else:
    # Run the code to generate DataFrames
    names_ethnicity_df = []
    
    for path in os.listdir(data_root):
        full_path = f"{data_root}/{path}"
        all_files = os.listdir(full_path)

        temp_df = pd.DataFrame(all_files, columns=['file'])
        temp_df['city'] = path
        temp_df['file'] = temp_df['file'].str.replace('.pkl', "", regex=False)
        temp_df['name'] = temp_df['file'].str.extract(pattern)
        temp_df['name'] = temp_df['name'].str.lower() 

        names_ethnicity_df.append(temp_df)

    names_ethnicity_df = pd.concat(names_ethnicity_df, ignore_index=True)
    names_ethnicity_df = names_ethnicity_df[['file','city', 'name' ]]
    names_ethnicity_df['name_city'] = [* zip(names_ethnicity_df['name'], names_ethnicity_df['city'])]
    names_ethnicity_df['name_city'] = names_ethnicity_df['name_city'].astype(str)

In [60]:
names_ethnicity_df

Unnamed: 0,file,city,name,name_city
0,L265 Wealth-X Kola Edward EDGAL Dossier,Lagos,kola edward edgal,"( kola edward edgal , Lagos)"
1,L195 Wealth-X Victor Gbolade OSIBODU Dossier,Lagos,victor gbolade osibodu,"( victor gbolade osibodu , Lagos)"
2,L256 Wealth-X Harrison Eyitayo ILORI Dossier,Lagos,harrison eyitayo ilori,"( harrison eyitayo ilori , Lagos)"
3,L145 Wealth-X Deji ALLI Dossier,Lagos,deji alli,"( deji alli , Lagos)"
4,L135 Wealth-X Ishwardas MAHTANI Dossier,Lagos,ishwardas mahtani,"( ishwardas mahtani , Lagos)"
...,...,...,...,...
2863,N341 Wealth-X Robert Jeffrey SPEYER Dossier,New York,robert jeffrey speyer,"( robert jeffrey speyer , New York)"
2864,N276 Wealth-X Madonna Louise Veronica CICCONE ...,New York,madonna louise veronica ciccone,"( madonna louise veronica ciccone , New York)"
2865,N069 Wealth-X Stephen Alan WYNN Dossier,New York,stephen alan wynn,"( stephen alan wynn , New York)"
2866,N133 Wealth-X Fawaz Abdulaziz Fahad ALHOKAIR D...,New York,fawaz abdulaziz fahad alhokair,"( fawaz abdulaziz fahad alhokair , New York)"


In [112]:
countries_dict = {
    1: "Indian Subcontinent",
    2: "Russia",
    3: "China",
    4: "Anglophone (United States, United Kingdom, Australia, Canada, etc.)",
    5: "Germany",
    6: "Brazil",
    7: "Middle East and North Africa (MENA)",
    8: "Sub-Saharan Africa",
    9: "Latin America (excluding Brazil)",
    10: "East Asia (excluding China)",
    11: "South East Asia",
    12: "Eastern Europe (excluding Russia)",
    13: "Western Europe (excluding Germany and the Anglophone countries)"
}

def get_model_response2(prompt, engine="gpt-3.5-turbo"):
    messages = [
        {"role": "system", "content": "This is the system message"},
        {"role": "user", "content":prompt}
    ]

    response = openai.ChatCompletion.create(
        model=engine,
        messages=messages,
        max_tokens=2000,
        temperature = 0.2,
        top_p = 0.9,
    )
    
    #tokens_used = response['usage']['total_tokens']

    return response['choices'][0]['message']['content'].strip()

def identify_ethncity_classes(df, class_list, chunk_size = 100 , rate_limit=60000):
    temp_df = df.copy()
    temp_df['classes'] = ""
    tokens_accumulated = 0
    start_time = time.time()

    # Empty list to store processed chunks
    processed_chunks = []

    for i in range(0, len(df), chunk_size):
        chunk = temp_df.iloc[i:i + chunk_size]  # Get the current chunk

        pairs = chunk.loc[:,'name_city']
        pairs_string = '\n'.join([f'{p[0]} from {p[1]}' for p in pairs])

        prompt = f"""
                    Read the list of country of origin industry classes shown surrounded by 4 colon's below
                    ::::
                    {class_list}
                    ::::

                    Each element of the list below contains a name and city of residence pair, the most appropriate class
                    for each element of the list needs to be provided
                    :::
                    {pairs_string }
                    :::
                    the return string should be in the form shown below
                    [entry_1, entry_2,.. entry_n]
                    """

        classes, tokens = get_model_response(prompt, engine="gpt-3.5-turbo")
        chunk.loc[:, 'classes'] = classes
        tokens_accumulated += tokens

        elapsed_time = time.time() - start_time

        if tokens_accumulated / elapsed_time > rate_limit/60:  # tokens per second
            sleep_time = tokens_accumulated/rate_limit - elapsed_time
            time.sleep(sleep_time)  # sleep if rate limit is reached

        processed_chunks.append(chunk)

    return pd.concat(processed_chunks)



In [198]:
prompt = f"""
            You are an expert in the regional origin of names ::: Read the python dictionary of country of origin ethnicity classes shown surrounded by 4 colon's below. the dictionary is a class id: description pair
            ::::{class_list}::::

            Each element of the list below contains a tuple of (person name, city of residence pair), consider their name and city of residence then
              choose most appropriate ethnicity class id, 
            from the previous dictionary for each element of the list needs to be provided
            :::{pairs}:::
            the return string should be in the form shown below
            [entry_1, entry_2,.. entry_n]
            """

prompt = f"""
            You are an expert in the regional origin of names :::  Read the python dictionary of country of origin ethnicity classes shown surrounded by 4 colon's below. the dictionary is a class id: description pair
            ::::{class_list}::::

            Each element of the list below contains a persons name  choose most appropriate ethnicity class id, 
            from the previous dictionary for each element of the list needs to be provided
            :::{pairs}:::
            the return string should be in the form shown below
            [entry_1, entry_2,.. entry_n]
            """

prompt = """You are an expert in the regional origin of names ::: what is the origin of the following names, with particular consideration of the last name [' kola edward edgal ',
 ' victor gbolade osibodu ',
 ' harrison eyitayo ilori ',
 ' deji  alli ',
 ' ishwardas  mahtani ',
 ' segun  fagboyegun ',
 ' rajan vashdev vaswani ',
 ' gabriel g. boulos ',
 ' olalekan akinsoga akinyanmi ',
 ' ajoritsedere josephine awosika ',
 ' oladipo  odujinrin ']

please select the origin from one of the below categories

{1: 'Indian Subcontinent',
 2: 'Russia',
 3: 'China',
 4: 'Anglophone (United States, United Kingdom, Australia, Canada, etc.)',
 5: 'Germany',
 6: 'Brazil',
 7: 'Middle East and North Africa (MENA)',
 8: 'Sub-Saharan Africa',
 9: 'Latin America (excluding Brazil)',
 10: 'East Asia (excluding China)',
 11: 'South East Asia',
 12: 'Eastern Europe (excluding Russia)',
 13: 'Western Europe (excluding Germany and the Anglophone countries)'}

You response should be in the form [cat_number1, cat_number2, ...,cat_numbern]
for example a two element list = ['david cameron', 'sadiq khan']
    returns a two element response = [4,1]"""


prompt = f"""You are an expert in the regional origin of names ::: what is the origin of the following names, with particular consideration of the last name {pairs}

    please select the origin from one of the below categories

    {class_list}

    You response should be in the form [cat_number1, cat_number2, ...,cat_numbern]
    for example a two element list = ['david cameron', 'sadiq khan', sim sang-jung,'goodluck jonathan', 'angela merkle']
        returns a two element response = [4,1, 10, 8, 4]"""

prompt = f""" You are an expert in the regional origin of names ::: read the dictionary of names classes {class_list}
    now look at the question answer format below
    question :['david cameron', 'sadiq khan', sim sang-jung,'goodluck jonathan', 'angela merkle']
    answer:[4,1, 10, 8, 4]
    
    respond appropriately
    question:{pairs}
    
    """

name_origin_json  =     {
  "david cameron": 4,
  "sadiq khan": 1,
  "sim sang-jung": 10,
  "goodluck jonathan": 8,
  "angela merkle": 4
}
prompt = f""" You are an expert in the regional origin of names ::: read the dictionary of names classes {class_list}
    
    now see the example name-origin json
    {name_origin_json}

    
    take the list below and return it in the same format as the example
    {pairs}
    
    """

f"""You are an expert in the regional origin of names. read the dictionary of ethnic origin classes {class_list}
    
    now see the example name-origin json, the names have been paired with the most appropriate ethnic/country origin
    {name_origin_json}

    what is the origin of the following names, with particular consideration of the last name, and the fact that all the people live in the city of {city}
    take the list below, wcomplete the origin paying particular consideration of the last name,  and return it in the same format as the example
    {pairs}
    
    """

system_message = f"""You are an expert in the regional origin of names. Read the dictionary of ethnic origin classes (eth_dict) surrounded by triple colons 
:::{countries_dict}:::
    
    now see the example name-origin json, the names have been paired with the most appropriate ethnic/country origin from the eth_dict
    {name_origin_json}

    The user will supply you with a json of peoples names (names_dict) and the city that the people live in. using this information you must reason through the most,
    likely ethnic/regional origin of the name from the list, paying particular attention the last name. 
    Construct your response to have the same format as the example json, however only return the list of extracted values.
    as such your response will have the form
    [value1, value2,...valuen]
    check that the number of entries is equal to the number of names supplied, if the numbers are not equal find the error and correct, then return the list
    
    """


In [228]:
names_ethnicity_df.loc[names_ethnicity_df['city']=='London',:].head(10)#.loc[0:9, :].copy()

Unnamed: 0,file,city,name,name_city
1685,Wealth-X Hamad bin Khalifa bin Hamad bin Abdul...,London,hamad bin khalifa bin hamad bin abdullah bin ...,(' hamad bin khalifa bin hamad bin abdullah bi...
1686,Wealth-X John Lionel Beckwith Dossier,London,john lionel beckwith,"(' john lionel beckwith ', 'London')"
1687,Wealth-X Kristo Kaarmann Dossier,London,kristo kaarmann,"(' kristo kaarmann ', 'London')"
1688,Wealth-X Maritsa Lazari Dossier,London,maritsa lazari,"(' maritsa lazari ', 'London')"
1689,Wealth-X Andreas Serenus Hoffmann Dossier,London,andreas serenus hoffmann,"(' andreas serenus hoffmann ', 'London')"
1690,Wealth-X Carl-Henric Svanberg Dossier,London,carl-henric svanberg,"(' carl-henric svanberg ', 'London')"
1691,Wealth-X Fred Done Dossier,London,fred done,"(' fred done ', 'London')"
1692,Wealth-X Mark Andrew Pears Dossier,London,mark andrew pears,"(' mark andrew pears ', 'London')"
1693,Wealth-X Magdi Abdul Latif Jameel Dossier,London,magdi abdul latif jameel,"(' magdi abdul latif jameel ', 'London')"
1694,Wealth-X Troels Holch Povlsen Dossier,London,troels holch povlsen,"(' troels holch povlsen ', 'London')"


In [4]:
import time
from collections import deque

class RateLimiter:
    def __init__(self, max_tokens_per_minute):
        self.max_tokens_per_minute = max_tokens_per_minute
        self.tokens_deque = deque(maxlen=60) # Holds the tokens generated for the past minute.
        self.timestamps_deque = deque(maxlen=60) # Holds the timestamps of when tokens were generated.

    def add_tokens(self, tokens):
        current_time = time.time()

        # Removing tokens older than 1 minute
        while self.timestamps_deque and current_time - self.timestamps_deque[0] > 60:
            self.timestamps_deque.popleft()
            self.tokens_deque.popleft()

        # If the number of tokens is more than the maximum limit,
        # pause execution until it comes back down below the threshold
        if sum(self.tokens_deque) + tokens > self.max_tokens_per_minute:
            sleep_time = 60 - (current_time - self.timestamps_deque[0])
            time.sleep(sleep_time)

            # After sleeping, add the tokens and timestamps to the deque
            self.tokens_deque.append(tokens)
            self.timestamps_deque.append(current_time + sleep_time)
        else:
            # If the number of tokens is less than the maximum limit,
            # add the tokens and timestamps to the deque
            self.tokens_deque.append(tokens)
            self.timestamps_deque.append(current_time)

    def check_tokens(self, tokens):
        # Function to check if adding new tokens would exceed limit, without actually adding them
        current_time = time.time()
        while self.timestamps_deque and current_time - self.timestamps_deque[0] > 60:
            self.timestamps_deque.popleft()
            self.tokens_deque.popleft()

        return sum(self.tokens_deque) + tokens <= self.max_tokens_per_minute


In [5]:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

system_message_tokens = len(encoding.encode(system_message))
prompt_length = len(encoding.encode(prompt))
system_message_tokens +prompt_length 

NameError: name 'system_message' is not defined

In [308]:
system_message = f"""You are an expert in the regional origin of names. Read the dictionary of ethnic origin classes (eth_dict) surrounded by triple colons 
:::{countries_dict}:::
    
    now see the example name-origin json, the names have been paired with the most appropriate ethnic/country origin from the eth_dict
    {name_origin_json}

    The user will supply you with a json of peoples names (names_dict) and the city that the people live in. using this information you must reason through the most,
    likely ethnic/regional origin of the name from the list, paying particular attention the last name. 
    Construct your response to have the same format as the example json
    
    """

def get_model_response_ethn(prompt, system_message,engine="gpt-3.5-turbo"):
    
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content":prompt}
    ]

    response = openai.ChatCompletion.create(
        model=engine,
        messages=messages,
        max_tokens=2000,
        temperature = 0.2,
        top_p = 0.9,
    )

    return response['choices'][0]['message']['content'].strip()


def get_model_response_ethn(prompt, system_message, rate_limiter, engine="gpt-3.5-turbo"):
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ]
    attempts = 0
    while attempts < 5:
        try:
            prompt_length = len(prompt)  # assuming encoding.encode(prompt) is equivalent to len(prompt)
            tokens = len(system_message) + prompt_length
            
            # Add tokens to rate limiter and sleep if necessary
            rate_limiter.add_tokens(tokens)
                
            response = openai.ChatCompletion.create(
                model=engine,
                messages=messages,
                max_tokens=2000,
                temperature=0.2,
                top_p=0.9,
            )
            return response['choices'][0]['message']['content'].strip()
            
        except openai.error.RateLimitError as e:
            print(f"RateLimitError encountered: {e}, waiting for a minute...")
            time.sleep(60)  # Wait for a minute before retrying
            continue  # Continue with the next iteration of the loop, thereby retrying the request
            
        except openai.error.APIError as e:
            print(f"APIError encountered: {e}, retrying in 5 seconds...")
            time.sleep(5)

        except openai.error.TimeoutError as e:
            print(f"TimeoutError encountered: {e}, retrying in 10 seconds...")
            time.sleep(10)
            
        attempts += 1

    print("Failed to get model response after multiple attempts.")
    return None

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

system_message_tokens = len(encoding.encode(system_message))

chunk_size = 50
rate_limit=80000
city = 'Lagos'
total_number = 0
# Empty list to store processed chunks
processed_chunks = []
rate_limiter = RateLimiter(max_tokens_per_minute=rate_limit) 
for city in names_ethnicity_df['city'].unique():

    temp_df = names_ethnicity_df.loc[names_ethnicity_df['city']==city,:].copy().reset_index()
    tokens_accumulated = 0
    start_time = time.time()


    for i in range(0, len(temp_df), chunk_size):
        chunk = temp_df.iloc[i:i + chunk_size].copy().reset_index()  # Get the current chunk

        pairs = chunk.loc[:,'name'].tolist()

        prompt = f""" names_dict :::{pairs}:::
        city :::{city}:::
        """
        #prompt_length = len(encoding.encode(prompt))

        #rate_limiter.add_tokens(system_message_tokens +prompt_length )

        classes = get_model_response_ethn(prompt, system_message, rate_limiter, engine="gpt-3.5-turbo")
        #chunk[ 'classes'] = ast.literal_eval(classes)
        #chunk.loc[:,'classes'] = ast.literal_eval(classes)


        processed_chunks.append(classes)
        total_number = total_number + chunk_size
        print(f"total processed:{total_number}")
#complete_ethnicity = pd.concat(processed_chunks, ignore_index=True)

total processed:50
total processed:100
total processed:150
total processed:200
total processed:250
total processed:300
total processed:350
total processed:400
total processed:450
total processed:500
total processed:550
total processed:600
total processed:650
total processed:700
total processed:750
total processed:800
total processed:850
total processed:900
total processed:950
total processed:1000
total processed:1050
total processed:1100
total processed:1150
total processed:1200
total processed:1250
total processed:1300
total processed:1350
total processed:1400
total processed:1450
total processed:1500
total processed:1550
total processed:1600
total processed:1650
total processed:1700
total processed:1750
total processed:1800
total processed:1850
total processed:1900
total processed:1950
total processed:2000
total processed:2050
total processed:2100
total processed:2150
total processed:2200
total processed:2250
total processed:2300
total processed:2350
total processed:2400
total proces

In [315]:


with open('./ethnicity_dict_list.pkl', 'wb') as file:
    pickle.dump(processed_chunks, file)

In [6]:
with open('./ethnicity_dict_list.pkl', 'rb') as file:
    processed_chunks = pickle.load(file)

In [7]:
ast.literal_eval(processed_chunks[0])

{'kola edward edgal': 8,
 'victor gbolade osibodu': 8,
 'harrison eyitayo ilori': 8,
 'deji alli': 8,
 'ishwardas mahtani': 1,
 'segun fagboyegun': 8,
 'rajan vashdev vaswani': 1,
 'gabriel g. boulos': 6,
 'olalekan akinsoga akinyanmi': 8,
 'ajoritsedere josephine awosika': 8,
 'oladipo odujinrin': 8,
 'karim elie gabriel boulos': 6,
 'gian angelo perrucci': 13,
 'naresh asnani': 1,
 'hakeem abdul olajuwon': 8,
 'si nureni agboola abiola': 8,
 'vinay b. mahtani': 1,
 'gilbert ramez chagoury': 7,
 'suresh murli chellaram': 8,
 'oluwagbemiga a. oyebode': 8,
 'bhagwan ishwardas mahtani': 1,
 'chinedu u. echeruo': 8,
 'rita vaswani': 1,
 'jason chukwuma njoku': 8,
 'bashorun adeniyi adeoye': 8,
 'amisha hathiramani': 1,
 'adolor uwamu': 8,
 'sifawu lawal': 8,
 'daisy ehanire danjuma': 8,
 'samuel adedoyin': 8,
 'ademola benjamin aladekomo': 8,
 'olufemi otedola': 8,
 'aderemi muyinudeen makanjuola': 8,
 'frederick enitiolorunda obateru akinruntan': 8,
 'offiong ekanem ejindu': 8,
 'ayoola 

In [None]:
combined_dict = []
for d in processed_chunks:
    combined_dict.append(ast.literal_eval(d))

In [19]:
combined_dict = {}
for d in range(0,len(processed_chunks)):
    #print(d)
    #some of the dictionaries are truncated and so require parsing with regex to make valid,
    #improvements to the classification mapping would be ideal as it would make this stage irrelevant and ensure that all entries have values
    pairs = re.findall(r"'(.*?)': (\d+)", processed_chunks[d])
    all_valid_dict = {key: int(value) for key, value in pairs}
    combined_dict.update(all_valid_dict)

#
#The code below was the original parser, however it failed for the above reason, it is kept incase the classifier can be fixed
#
#combined_dict = []
#for d in processed_chunks:
#    combined_dict.append(ast.literal_eval(d))

In [24]:
pd.DataFrame(list(combined_dict.items()), columns=['name', 'ethnicity'])

Unnamed: 0,name,ethnicity
0,kola edward edgal,8
1,victor gbolade osibodu,8
2,harrison eyitayo ilori,8
3,deji alli,8
4,ishwardas mahtani,1
...,...,...
2011,phillip allen gamma frost,4
2012,robert jeffrey speyer,4
2013,stephen alan wynn,4
2014,fawaz abdulaziz fahad alhokair,7


In [117]:
class_list

{1: 'Indian Subcontinent',
 2: 'Russia',
 3: 'China',
 4: 'Anglophone (United States, United Kingdom, Australia, Canada, etc.)',
 5: 'Germany',
 6: 'Brazil',
 7: 'Middle East and North Africa (MENA)',
 8: 'Sub-Saharan Africa',
 9: 'Latin America (excluding Brazil)',
 10: 'East Asia (excluding China)',
 11: 'South East Asia',
 12: 'Eastern Europe (excluding Russia)',
 13: 'Western Europe (excluding Germany and the Anglophone countries)'}

In [70]:
test = identify_ethncity_classes(names_ethnicity_df.loc[0:100, :], countries_dict, chunk_size = 100 , rate_limit=60000)

ValueError: too many values to unpack (expected 2)

# identifying industries using cosine similarity

This didn't work very well due to the non-linear multi-label relationships. However, I am keeping it in for now

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(df_A, df_B):
    # Assuming that the embeddings in both dataframes are stored in a column named 'embeddings'
    embeddings_A = np.stack(df_A['embedding'].values)
    embeddings_B = np.stack(df_B['embedding'].values)
    
    # Calculate cosine similarity
    cosine_sim_matrix = cosine_similarity(embeddings_A, embeddings_B)
    
    return cosine_sim_matrix


def compute_similarity_statistics(cosine_sim_matrix, min = 0, max = 1, num_cutoffs=1000):
    cutoffs = np.linspace(min, max, num=num_cutoffs)
    
    statistics = {
        'Cutoff': [],
        'Maximum': [],
        'Minimum': [],
        'Mean': [],
        'Median': [],
        'Standard Deviation': []
    }

    for cutoff in cutoffs:
        row_sums = np.sum(cosine_sim_matrix >= cutoff, axis=1)
        if row_sums.any():
            statistics['Cutoff'].append(cutoff)
            statistics['Maximum'].append(np.max(row_sums))
            statistics['Minimum'].append(np.min(row_sums))
            statistics['Mean'].append(np.mean(row_sums))
            statistics['Median'].append(np.median(row_sums))
            statistics['Standard Deviation'].append(np.std(row_sums))

    result_df = pd.DataFrame(statistics)
    return result_df

test = compute_cosine_similarity(all_bios_df, gics_embeddings_df)

test2 = compute_similarity_statistics(test, 0.7, 0.73, num_cutoffs=1000)
test2.loc[test2['Minimum']==1].sort_values('Cutoff', ascending=False).head()

sns.lineplot(data = test2, x = 'Cutoff', y = 'Minimum')