## Save each founder's details as a separate JSON file

In [70]:
import json
import os
import weakref

import pandas as pd

def process_csv_to_json(readFile_path, writeFile_path):
    """
    Split a CSV of JSON strings into one pretty-printed JSON file per row.

    Each value of the 'json_string' column is parsed and written to
    '<writeFile_path>/<row index>.json'. Rows whose value is not valid JSON
    (including empty cells, which pandas reads as NaN) are reported and skipped.

    Parameters:
    readFile_path (str): Path of the CSV file to read
    writeFile_path (str): Directory that receives the JSON files; it is
        created if it does not exist yet

    Raises:
    ValueError: If the CSV has no 'json_string' column
    """
    df = pd.read_csv(readFile_path)

    if 'json_string' not in df.columns:
        raise ValueError("The CSV file does not contain 'json_string' column")

    # open() does not create missing folders, so make sure the target exists first
    os.makedirs(writeFile_path, exist_ok=True)

    # Only one column is needed, so iterate over it directly instead of whole rows
    for index, raw_value in df['json_string'].items():
        try:
            json_data = json.loads(str(raw_value))
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; anything else (e.g. Ctrl-C) propagates
            print(f"Invalid JSON format in row {index}")
            continue

        target_path = os.path.join(writeFile_path, f'{index}.json')
        with open(target_path, 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, indent=4)

        # Light progress report every 100th row
        if index % 100 == 0:
            print(f"Row {index} processed and saved as '{index}.json'")


In [71]:
# Input CSVs of enriched LinkedIn profiles and the folders that receive the
# per-row JSON files: one pair for the successful cohort, one for the failed cohort.
successProfilePath = 'founder-data/success_enriched_linkedin_profiles.csv'
successDataDirectory = 'founder-data/successful-founders'
failProfilePath = 'founder-data/fail_enriched_linkedin_profiles.csv'
failDataDirectory = 'founder-data/failed-founders'

# Split each CSV into one '<row index>.json' file per row.
process_csv_to_json(successProfilePath, successDataDirectory)
process_csv_to_json(failProfilePath, failDataDirectory)

Row 0 processed and saved as '0.json'
Row 100 processed and saved as '100.json'
Row 200 processed and saved as '200.json'
Row 300 processed and saved as '300.json'
Row 400 processed and saved as '400.json'
Row 500 processed and saved as '500.json'
Row 600 processed and saved as '600.json'
Row 700 processed and saved as '700.json'
Row 800 processed and saved as '800.json'
Row 900 processed and saved as '900.json'
Row 1000 processed and saved as '1000.json'
Row 1100 processed and saved as '1100.json'
Row 1200 processed and saved as '1200.json'
Row 1300 processed and saved as '1300.json'
Row 1400 processed and saved as '1400.json'
Row 1500 processed and saved as '1500.json'
Row 1600 processed and saved as '1600.json'
Row 1700 processed and saved as '1700.json'
Row 1800 processed and saved as '1800.json'
Row 1900 processed and saved as '1900.json'
Row 2000 processed and saved as '2000.json'
Row 2100 processed and saved as '2100.json'
Invalid JSON format in row 2106
Row 2200 processed and s

## Look up each founder's company and its description from the founder's LinkedIn URL

In [89]:
# Parsed worksheets, keyed weakly by workbook object so that repeated lookups
# against the same workbook do not re-parse it and the entry disappears with the workbook.
_COMPANY_SHEETS_CACHE = weakref.WeakKeyDictionary()


def _load_company_sheets(xls):
    """Return the (linkedin, descriptions, company) worksheets, parsing once per workbook object."""
    try:
        sheets = _COMPANY_SHEETS_CACHE.get(xls)
    except TypeError:
        # Keys that cannot be weakly referenced (e.g. a plain path string) are never cached
        sheets = None

    if sheets is None:
        sheets = (
            pd.read_excel(xls, 'Founder Linkedin URLs'),
            pd.read_excel(xls, 'Long company descriptions'),
            pd.read_excel(xls, 'Company'),
        )
        try:
            _COMPANY_SHEETS_CACHE[xls] = sheets
        except TypeError:
            pass
    return sheets


def _first_match_or_not_found(df, org_uuid, columns):
    """Return `columns` of the first row whose org_uuid matches, or "Not found" for every column."""
    rows = df.loc[df['org_uuid'] == org_uuid, columns]
    if rows.empty:
        return dict.fromkeys(columns, "Not found")
    return rows.iloc[0].to_dict()


def find_company_info(founder_linkedin_url, xls):
    """
    Look up the company attributes associated with a founder's LinkedIn URL.

    The URL is resolved to an org_uuid through the 'Founder Linkedin URLs'
    worksheet; that id is then looked up in the 'Long company descriptions'
    and 'Company' worksheets. When several rows match, the first one is used.

    Parameters:
    founder_linkedin_url (str): URL to search for in the 'founder_linkedin_url' column
    xls: Workbook accepted by pd.read_excel (the notebook passes a pd.ExcelFile).
        Worksheets are cached per workbook object, so reusing one object across
        calls avoids re-parsing the file.

    Returns:
    dict: Keys 'org_name', 'long_description', 'category_list',
        'category_groups_list', 'country_code' and 'city'; a group of keys is
        set to "Not found" when the org_uuid is absent from its worksheet.
        If the URL itself is unknown, {"URL Error": ...} is returned instead.
    """
    df_linkedin, df_company, df_company_details = _load_company_sheets(xls)

    # Resolve the URL to its organisation id
    matching_ids = df_linkedin.loc[
        df_linkedin['founder_linkedin_url'] == founder_linkedin_url, 'org_uuid'
    ]
    if matching_ids.empty:
        return {"URL Error" : "LinkedIn URL not found in the dataset."}
    org_uuid = matching_ids.iloc[0]

    company_info = {}
    company_info.update(
        _first_match_or_not_found(df_company, org_uuid, ['org_name', 'long_description'])
    )
    company_info.update(
        _first_match_or_not_found(
            df_company_details,
            org_uuid,
            ['category_list', 'category_groups_list', 'country_code', 'city'],
        )
    )
    return company_info



In [90]:
# Example usage: open the success workbook and look up the company for one founder URL.
xls = pd.ExcelFile('founder-data/Moneyball 1.1_ Success.xlsx')
result = find_company_info('https://linkedin.com/in/k06aa', xls)
# Bare expression so the notebook displays the returned dict.
result

{'org_name': '1inch',
 'long_description': '1inch Limited builds decentralized protocols and contributes to the development of the 1inch Network. The synergy of 1inch protocols, including the 1inch Aggregation Protocol, the 1inch Liquidity Protocol and the 1inch Limit Order Protocol, enables the most lucrative, fastest and protected operations in DeFi.',
 'category_list': 'Blockchain,Cryptocurrency,Information Services,Information Technology,Open Source,Software',
 'category_groups_list': 'Financial Services,Information Technology,Other,Payments,Software',
 'country_code': 'KNA',
 'city': 'Old Road Town'}

## Extract relevant features from the JSON file

In [145]:
from datetime import datetime
import json

def extract_founder_details(json_filename):
    """
    Extract a founder's attributes from a profile JSON file.

    Only the first element of the top-level 'data' list is read. Missing or
    null nested values are tolerated field by field, so one incomplete record
    does not discard the whole founder.

    Parameters:
    json_filename (str): The filename of the JSON file

    Returns:
    dict: Keys 'Name', 'Gender', 'Age', 'Self-Description',
        'Education Backgrounds' (list of (institution, degree, major) tuples) and
        'Employment Backgrounds' (list of (employer, roles, duration in years,
        start date as 'MM/DD/YYYY' or 'N/A', is_current) tuples).
        An empty dict is returned when 'data' is missing, empty or not a list,
        and {"JSON Error": "Founder Detail Error"} when the file cannot be
        read or processed (the error is printed).
    """
    SECONDS_PER_YEAR = 365 * 24 * 60 * 60

    def timestamp_to_datetime(timestamp):
        """
        Helper function to convert timestamp (in milliseconds) to datetime object.
        """
        return datetime.fromtimestamp(timestamp / 1000)

    def nested_name(record, key):
        """Return record[key]['name'], or 'N/A' when the nested object is missing or null."""
        nested = record.get(key)
        if not isinstance(nested, dict):
            return 'N/A'
        return nested.get('name', 'N/A')

    def nested_timestamp(record, key):
        """Return record[key]['timestamp'], or None when it is missing or null."""
        return (record.get(key) or {}).get('timestamp')

    def extract_education_details(person_info):
        return [
            (
                nested_name(education, 'institution'),
                nested_name(education, 'degree'),
                nested_name(education, 'major'),
            )
            for education in (person_info.get('educations') or [])
        ]

    def extract_employment_details(person_info):
        employment_details = []

        for employment in (person_info.get('employments') or []):
            employer_name = nested_name(employment, 'employer')
            categories = employment.get('categories') or []
            roles = [category.get('name') for category in categories if 'name' in category]

            start_timestamp = nested_timestamp(employment, 'from')
            if start_timestamp is None:
                # Unknown start: no date and no measurable duration
                start_text, duration_years = 'N/A', 0.0
            else:
                start_datetime = timestamp_to_datetime(start_timestamp)

                # An absent end timestamp is treated as "still ongoing", the same
                # way an absent 'to' entry is
                end_timestamp = nested_timestamp(employment, 'to')
                if end_timestamp is None:
                    end_datetime = datetime.now()
                else:
                    end_datetime = timestamp_to_datetime(end_timestamp)

                duration_years = (end_datetime - start_datetime).total_seconds() / SECONDS_PER_YEAR
                start_text = start_datetime.strftime('%m/%d/%Y')

            is_current = employment.get('isCurrent', False)

            employment_details.append((employer_name, roles, duration_years, start_text, is_current))

        return employment_details

    try:
        # Load the JSON file
        with open(json_filename, 'r') as file:
            data = json.load(file)

        # The 'data' key holds a list of person records; only the first is used
        person_data = data.get('data', [])
        if not person_data or not isinstance(person_data, list) or not person_data[0]:
            return {}

        person_info = person_data[0]

        name_detail = person_info.get('nameDetail') or {}
        name = (name_detail.get('firstName') or '') + " " + (name_detail.get('lastName') or '')
        gender = (person_info.get('gender') or {}).get('normalizedValue')

        return {
            "Name": name,
            "Gender": gender,
            "Age": person_info.get('age'),
            "Self-Description": person_info.get('description', ""),
            "Education Backgrounds": extract_education_details(person_info),
            "Employment Backgrounds": extract_employment_details(person_info)
        }

    except Exception as e:
        # Best effort: report the problem and hand back a marker dict so callers can continue
        print(f"An error occurred: {e}")
        return {"JSON Error" : "Founder Detail Error"}


In [146]:
# Example usage: extract the details of one founder from the failed cohort.
file_path = 'founder-data/failed-founders/1445.json'
founder_details = extract_founder_details(file_path)
# Bare expression so the notebook displays the returned dict.
founder_details

{'Name': 'Brian Belcher',
 'Gender': 'Male',
 'Age': 36,
 'Self-Description': 'Brian Belcher is a COO and Co-Founder at Vector. He Attended Santa Clara University Leavey School of Business.',
 'Education Backgrounds': [('Eastlake High School', 'N/A', 'N/A'),
  ('Santa Clara University Leavey School of Business',
   'Commerce',
   'Finance & Economics')],
 'Employment Backgrounds': [('Vector',
   ['Chief Officer',
    'Management',
    'Leadership',
    'Executive',
    'Founder',
    'COO'],
   8.79485781154896,
   '03/01/2015',
   True),
  ('Addepar',
   ['Management', 'Executive', 'Director', 'Sales'],
   1.2465753424657535,
   '12/01/2013',
   False),
  ('Addepar',
   ['Management', 'Executive', 'Director'],
   2.589041095890411,
   '05/01/2011',
   False),
  ('Computod@s',
   ['Management', 'Executive', 'Founder', 'Leadership'],
   1.3342465753424657,
   '09/01/2009',
   False),
  ('Mercado Global', ['Finance'], 0.5863013698630137, '06/01/2009', False),
  ('Santa Clara University E

## Combine and output all features of founders

Note: each founder gets a unique ID of the form `<row index>_S` (successful founders) or `<row index>_F` (failed founders).

In [147]:
import pandas as pd

def export_all_features(isSuccess, profilePath, dataDirectory, companyPath):
    """
    Build one combined feature dict per founder listed in a profile CSV.

    For every CSV row, the founder details extracted from
    '<dataDirectory>/<row index>.json' are merged with the company details
    looked up by the row's 'linkedin_url', then tagged with 'ID',
    'linkedin_url' and 'isSuccess'.

    Parameters:
    isSuccess (bool): Cohort flag; stored on each founder and used for the
        ID suffix ("_S" when true, "_F" otherwise)
    profilePath (str): CSV file with one founder per row and a 'linkedin_url' column
    dataDirectory (str): Folder holding the per-row JSON files
    companyPath (str): Excel workbook with the company worksheets

    Returns:
    list: One dict per founder that was processed without raising. A row
        that raises is reported and left out.
    """
    df = pd.read_csv(profilePath)
    list_of_founder_details = []
    id_suffix = "_S" if isSuccess else "_F"

    # The context manager closes the workbook's file handle once every row is
    # processed, even if something unexpected escapes the loop
    with pd.ExcelFile(companyPath) as xls:
        for index, row in df.iterrows():
            try:
                linkedin_url = row['linkedin_url']

                # Founder attributes from the per-row JSON file
                founder_details = extract_founder_details(dataDirectory + f'/{index}.json')

                # Company attributes for the founder's LinkedIn URL, merged into the same dict
                founder_details.update(find_company_info(linkedin_url, xls))

                # Bookkeeping attributes
                founder_details["ID"] = str(index) + id_suffix
                founder_details['linkedin_url'] = linkedin_url
                founder_details["isSuccess"] = isSuccess

                list_of_founder_details.append(founder_details)
                if index % 100 == 0:
                    print(f"Founder {founder_details['ID']} detail created.")

            except Exception as e:
                # Best effort: skip this founder and carry on with the next row
                print(f"An error occurred in index {index}: {e}. Founder detail not added.")

    return list_of_founder_details

In [148]:
# Inputs per cohort: the profile CSV, the folder of per-row JSON files,
# and the Excel workbook holding the company worksheets.
successProfilePath = 'founder-data/success_enriched_linkedin_profiles.csv'
successDataDirectory = 'founder-data/successful-founders'
sucessCompanyPath = 'founder-data/Moneyball 1.1_ Success.xlsx'
failProfilePath = 'founder-data/fail_enriched_linkedin_profiles.csv'
failDataDirectory = 'founder-data/failed-founders'
failCompanyPath = 'founder-data/Moneyball 1.1_ Fail.xlsx'

# Successful founders first ...
list_of_founder_details = export_all_features(True, 
                                              successProfilePath, 
                                              successDataDirectory, 
                                              sucessCompanyPath)

# ... then the failed founders are appended to the same list.
list_of_founder_details += export_all_features(False,
                                              failProfilePath, 
                                              failDataDirectory, 
                                              failCompanyPath)


Founder 0_S detail created.
Founder 100_S detail created.
Founder 200_S detail created.
Founder 300_S detail created.
Founder 400_S detail created.
Founder 500_S detail created.
Founder 600_S detail created.
Founder 700_S detail created.
Founder 800_S detail created.
Founder 900_S detail created.
Founder 1000_S detail created.
Founder 1100_S detail created.
Founder 1200_S detail created.
Founder 1300_S detail created.
Founder 1400_S detail created.
Founder 1500_S detail created.
Founder 1600_S detail created.
Founder 1700_S detail created.
Founder 1800_S detail created.
Founder 1900_S detail created.
Founder 2000_S detail created.
Founder 2100_S detail created.
An error occurred: [Errno 2] No such file or directory: 'founder-data/successful-founders/2106.json'
Founder 2200_S detail created.
Founder 2300_S detail created.
Founder 2400_S detail created.
Founder 2500_S detail created.
Founder 2600_S detail created.
Founder 2700_S detail created.
Founder 2800_S detail created.
Founder 2900

In [149]:
# Exporting to CSV file
# Column order of the exported table.
column_names = ['ID', 'isSuccess', 'Name', 'Gender', 'Age', 'linkedin_url',
                'Self-Description', 'Education Backgrounds', 'Employment Backgrounds',
                'org_name', 'long_description', 'category_list',
                'category_groups_list','country_code', 'city']

# One row per founder dict collected above.
df = pd.DataFrame(list_of_founder_details, columns=column_names)

# index=False leaves the DataFrame's row index out of the file.
df.to_csv('Founder Features.csv', index=False)
print(df)

           ID  isSuccess               Name Gender   Age  \
0         0_S       True         Ryan Johns   Male  44.0   
1         1_S       True      Shoaib Makani   Male  38.0   
2         2_S       True         Obaid Khan   Male  35.0   
3         3_S       True        Alex Buttle   Male  44.0   
4         4_S       True        Harry Jones   Male  39.0   
...       ...        ...                ...    ...   ...   
10061  5993_F      False        James Welsh   None  47.0   
10062  5994_F      False        Nick Tackes   None  56.0   
10063  5995_F      False   Akinori Takahagi   Male   NaN   
10064  5996_F      False  Motohiro Yonesaka   Male  42.0   
10065  5997_F      False    Matthew Tullman   None   NaN   

                                           linkedin_url  \
0            https://www.linkedin.com/in/ryan-johns-sf/   
1                    http://www.linkedin.com/in/smakani   
2      https://www.linkedin.com/in/obaid-khan-b77b4357/   
3                https://www.linkedin.com/i

In [122]:
# Exporting to Excel file
# Writes the `df` built in the CSV export cell, again without the row index.
df.to_excel('Founder Features.xlsx', index=False)
