# Data Preprocessing - Pipeline
Explanation:

## Import Packages
pandas: A library for analyzing, cleaning, and manipulating structured data using DataFrames and Series. \
os: A module for file system operations like creating, deleting, or navigating files and directories. \
glob: A module to find file paths matching patterns (e.g., *.csv) for handling multiple files easily.

In [31]:
import pandas as pd
import os
from glob import glob

## Companies Preprocessing
Explanation:

In [32]:
def preprocess_companies(companies_files):
    all_companies = []

    for file in companies_files:
        companies_df = pd.read_csv(file)
        drop_columns = [
            'Total Funding Amount (in USD)', 'Organization Name URL', 'Operating Status', 'Stage', 'Headquarters Regions', 'Closed Date',
            'Closed Date Precision', 'Company Type', 'Announced Date Precision', 'Price', 'Price Currency',
            'Price (in USD)', 'Acquisition Terms', 'Money Raised at IPO', 'Money Raised at IPO Currency',
            'Money Raised at IPO (in USD)', 'Valuation at IPO', 'Valuation at IPO Currency',
            'Valuation at IPO (in USD)', 'Stock Symbol', 'Stock Symbol URL', 'Stock Exchange',
            'Founded Date Precision', 'Exit Date Precision', 'Number of Events', 'IPO Date',
            'Transaction Name', 'Postal Code', 'Transaction Name URL', 'Estimated Revenue Range',
            'Headquarters Location', 'Last Funding Amount Currency', 'Last Equity Funding Amount Currency',
            'Total Equity Funding Amount Currency', 'Total Funding Amount Currency', 'Last Funding Amount (in USD)', 'Last Equity Funding Amount', 'Last Equity Funding Amount (in USD)', 'Last Equity Funding Type', 'Total Equity Funding Amount', 'Total Equity Funding Amount (in USD)', 'Funding Status', 'Acquired by URL', 'Acquisition Type',
        ]
        companies_df.drop(columns=drop_columns, inplace=True)

        mean_founders = companies_df['Number of Founders'].mean()
        companies_df['Number of Founders'] = companies_df['Number of Founders'].fillna(mean_founders)
        companies_df = companies_df.dropna(subset=['Number of Employees'])

        for col in ['Industries', 'Industry Groups']:
            companies_df[col] = companies_df[col].fillna('').apply(lambda x: [item.strip() for item in x.split(',')])

        def range_to_mean(value):
            low, high = map(int, value.split('-'))
            return (low + high) / 2

        companies_df['Number of Employees'] = companies_df['Number of Employees'].apply(range_to_mean)

        if 'Announced Date' in companies_df.columns:
            companies_df.rename(columns={'Announced Date': 'Announced Date Acquisition'}, inplace=True)

        for date_col in ['Founded Date', 'Exit Date', 'Last Funding Date', 'Announced Date Acquisition']:
            companies_df[date_col] = pd.to_datetime(companies_df[date_col])

        companies_df['IPO'] = companies_df['IPO Status'].map({'Public': 1, 'Private': 0})
        companies_df.drop(columns=['IPO Status'], inplace=True)

        for column in ['Number of Lead Investors', 'Number of Investors', 'Number of Acquisitions']:
            if column in companies_df.columns:
                companies_df[column] = companies_df[column].fillna(0)

        all_companies.append(companies_df)

    merged_companies = pd.concat(all_companies, ignore_index=True)

    merged_companies['ID'] = (merged_companies.index + 1).astype(str).str.zfill(6)

    merged_companies = merged_companies[['ID'] + [col for col in merged_companies.columns if col != 'ID']]

    return merged_companies

## Funding Preprocessing
Explanation:

In [33]:
def preprocess_funding(funding_files, companies_df):

    all_funding = pd.concat([pd.read_csv(file) for file in funding_files], ignore_index=True)

    drop_columns = [
        'Transaction Name', 'Transaction Name URL', 'Organization Name URL', 'Money Raised Currency',
        'Pre-Money Valuation Currency', 'Diversity Spotlight', 'Organization Location', 'Money Raised (in USD)',
        'Pre-Money Valuation (in USD)', 'Organization Description', 'Organization Industries',
        'Organization Website', 'Organization Revenue Range', 'Number of Partner Investors',
        'CB Rank (Funding Round)', 'Total Funding Amount Currency', 'Total Funding Amount (in USD)',
        'Funding Status', 'Equity Only Funding', 'Pre-Money Valuation', 'Total Funding Amount',
        'Number of Funding Rounds', 'Funding Stage'
    ]
    all_funding.drop(columns=drop_columns, inplace=True)

    all_funding = all_funding.merge(
        companies_df[['Organization Name', 'ID']],
        on='Organization Name', how='left'
    )
    all_funding.rename(columns={'ID': 'Company ID'}, inplace=True)
    all_funding['Company ID'] = all_funding['Company ID'].fillna('000000').astype(str).str.zfill(6)

    all_funding['Investor Names'] = all_funding['Investor Names'].fillna('').apply(
        lambda x: [item.strip() for item in x.split(',')]
    )
    all_funding['Announced Date'] = pd.to_datetime(all_funding['Announced Date'])

    all_funding = all_funding.reset_index(drop=True)
    all_funding['Funding ID'] = (all_funding.index + 1).astype(str).str.zfill(6)

    all_funding = all_funding[['Funding ID'] + [col for col in all_funding.columns if col != 'Funding ID']]

    return all_funding

## Investors Preprocessing
Explanation:

In [34]:
def preprocess_investors(investor_files):
    investor_dfs = []
    for file in investor_files:
        df = pd.read_csv(file)
        country = os.path.basename(file).split('-')[1].split('.')[0].upper()
        df['Country'] = country
        investor_dfs.append(df)
    investors = pd.concat(investor_dfs, ignore_index=True)
    investors.drop(columns=['Organization/Person Name URL', 'Investment Stage', 'Regions', 'Founded Date Precision'], inplace=True)

    investors['Investor ID'] = range(1, len(investors) + 1)
    investors['Investor ID'] = investors['Investor ID'].apply(lambda x: str(x).zfill(6))

    cols = ['Investor ID'] + [col for col in investors.columns if col != 'Investor ID']
    investors = investors[cols]

    return investors

## Founders Preprocessing

In [35]:
def process_founders(companies, people):
    required_columns = ['first_name', 'last_name', 'linkedin_url', 'logo_url', 'facebook_url',
                        'twitter_url', 'city', 'region', 'country_code', 'featured_job_title',
                        'featured_job_organization_name']
    missing_columns = [col for col in required_columns if col not in people.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in 'people': {missing_columns}")

    temp_companies = companies.copy()

    temp_companies['Founders'] = temp_companies['Founders'].fillna('').astype(str)
    temp_companies['Founders'] = temp_companies['Founders'].apply(lambda x: [item.strip() for item in x.split(',') if item.strip()])

    founders_table = temp_companies.explode('Founders', ignore_index=True)

    founders = founders_table[['ID', 'Founders', 'Organization Name']].rename(
        columns={'ID': 'Company ID', 'Founders': 'Founder Name'}
    )

    founders['Founder ID'] = founders.index + 1
    founders['Founder ID'] = founders['Founder ID'].astype(str).str.zfill(6)

    founders = founders[['Founder ID', 'Company ID', 'Founder Name', 'Organization Name']]
    founders = founders.dropna(subset=['Founder Name'])
    founders[['first_name', 'last_name']] = founders['Founder Name'].str.split(' ', n=1, expand=True)

    founders['linkedin_url'] = None

    string_columns = ['logo_url', 'facebook_url', 'twitter_url', 'city', 'region',
                      'country_code', 'featured_job_title', 'featured_job_organization_name']
    for col in string_columns:
        if col not in founders.columns:
            founders[col] = None
        founders[col] = founders[col].astype('str')

    for idx, row in founders.iterrows():
        matches = people[
            (people['first_name'] == row['first_name']) &
            (people['last_name'] == row['last_name'])
            ]
        if len(matches) == 1:
            match = matches.iloc[0]
            founders.at[idx, 'linkedin_url'] = match['linkedin_url']
            for col in string_columns:
                founders.at[idx, col] = match[col]

    return founders


def enrich_founders_with_linkedin(founders, founders_linkedin):
    required_columns = ['url', 'followers', 'connections', 'degree_1', 'degree_1_university', 'degree_2', 'degree_2_university']
    missing_columns = [col for col in required_columns if col not in founders_linkedin.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in 'founders_linkedin': {missing_columns}")

    founders_linkedin = founders_linkedin.rename(columns={'url': 'linkedin_url'})

    enriched_founders = founders.merge(founders_linkedin, on='linkedin_url', how='left')

    return enriched_founders


def process_and_enrich_founders(companies, people, founders_linkedin):
    founders = process_founders(companies, people)
    enriched_founders = enrich_founders_with_linkedin(founders, founders_linkedin)

    enriched_founders = enriched_founders.drop_duplicates(subset=['Founder Name'], keep='first')

    return enriched_founders

## Pipeline Execution
Explanation:

In [36]:
companies_files = glob('../../Datasets/Companies/*.csv')
funding_files = glob('../../Datasets/Funding/*.csv')
investor_files = glob('../../Datasets/Investors/*.csv')

companies = preprocess_companies(companies_files)
funding = preprocess_funding(funding_files, companies)
investors = preprocess_investors(investor_files)

people = pd.read_csv("../../Datasets/People/people_CB.csv")

founders_linkedin = pd.read_csv("../../Datasets/LinkedIn/Founders/founders_linkedin.csv")

founders = process_and_enrich_founders(companies, people, founders_linkedin)

## LinkedIn-Company Data

In [43]:
linkedin_files_path = '../../Datasets/LinkedIn/Companies/*.csv'

linkedin_files = glob(linkedin_files_path)

linkedin_companies = [pd.read_csv(file) for file in linkedin_files]
linkedin_companies_combined = pd.concat(linkedin_companies, ignore_index=True)

linkedin_companies_combined['url'] = linkedin_companies_combined['url'].str.strip()
companies['LinkedIn'] = companies['LinkedIn'].str.strip()

linkedin_companies_combined = linkedin_companies_combined.drop(columns=['company_id', 'error'], errors='ignore')

companies = companies.merge(
    linkedin_companies_combined,
    left_on='LinkedIn',
    right_on='url',
    how='left',
    suffixes=('', '_LinkedIn')
)

companies = companies.drop(columns=['url'], errors='ignore')

## Validation

In [44]:
companies

Unnamed: 0,ID,Organization Name,Industries,Description,CB Rank (Company),Founded Date,Exit Date,Website,Twitter,Facebook,...,followers,employeeCount,employeeCountRange,employeeCountRangeMin,employeeCountRangeMax,followers_LinkedIn,employeeCount_LinkedIn,employeeCountRange_LinkedIn,employeeCountRangeMin_LinkedIn,employeeCountRangeMax_LinkedIn
0,000001,2trde,"[Automotive, Software]",2trde develops a software solution designed fo...,57123,2017-01-01,NaT,https://www.2trde.com,,,...,1556,24,11-50,11,50,1556,24,11-50,11,50
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...",36ZEROVision is an AI-powered visual inspectio...,51326,2019-01-01,NaT,https://36zerovision.com/,,,...,1533,18,11-50,11,50,1533,18,11-50,11,50
2,000003,3Bears Foods,[Food and Beverage],3Bears Foods enable a balanced and delicious b...,275817,2015-01-01,NaT,https://3bears.de/,,https://www.facebook.com/3bearsporridge/,...,6797,29,11-50,11,50,6797,29,11-50,11,50
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]",3dTrust helps companies integrate 3D printing ...,134694,2015-01-01,NaT,http://3dtrust.de,https://twitter.com/3dTrust,,...,544,5,2-10,2,10,544,5,2-10,2,10
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...",abaut builds a SaaS that enables businesses al...,219525,2017-07-21,NaT,https://abaut.de,,,...,1924,17,11-50,11,50,1924,17,11-50,11,50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1514,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...",The SAYM platform for swarm mobility defines t...,166910,2019-01-01,NaT,https://www.saym.io/,https://twitter.com/saym_io,https://www.facebook.com/saymcommute/,...,151,20,2-10,2,10,151,20,2-10,2,10
1515,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...",SONAH developed a flexible embedded vision sen...,121752,2016-01-01,NaT,http://www.sonah.tech,,,...,1908,14,11-50,11,50,1908,14,11-50,11,50
1516,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]",Taxy.io builds the leading platform for B2B ta...,220816,2019-01-01,NaT,https://www.taxy.io/,https://twitter.com/taxy_io,,...,4451,39,11-50,11,50,4451,39,11-50,11,50
1517,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...",TRINKKOST is a food supplement manufacturing c...,907817,2016-01-01,NaT,http://www.trinkkost.de,https://www.twitter.com/trinkkost,https://www.facebook.com/trinkkost/,...,OFFLINE,OFFLINE,OFFLINE,OFFLINE,OFFLINE,OFFLINE,OFFLINE,OFFLINE,OFFLINE,OFFLINE


In [45]:
founders

Unnamed: 0,Founder ID,Company ID,Founder Name,Organization Name,first_name,last_name,linkedin_url,logo_url,facebook_url,twitter_url,...,featured_job_title,featured_job_organization_name,username,followers,connections,degree_1,degree_1_university,degree_2,degree_2_university,error
0,000001,000001,Johannes Stoffel,2trde,Johannes,Stoffel,https://www.linkedin.com/in/johannes-stoffel-2...,https://images.crunchbase.com/image/upload/t_c...,,,...,Co-Founder & CEO,Karosso,johannes-stoffel-27389667,4815.0,3914.0,"Executive Master, Digital Innovation and Entre...",ESCP Business School,"Bachelor of Science - BS, Gerneral Management",EBS Universität für Wirtschaft und Recht,
2,000002,000002,Florian Ziesche,36ZERO Vision,Florian,Ziesche,,,,,...,,,,,,,,,,
3,000003,000002,Zeeshan Karamat,36ZERO Vision,Zeeshan,Karamat,https://www.linkedin.com/in/zkaramat/,https://images.crunchbase.com/image/upload/t_c...,,,...,Chief Technology Officer & Co-Founder,36ZERO Vision,zkaramat,6482.0,6479.0,"Masters Computer Science, Artificial Intelligence",Technische Universität München,"Masters Computer Science, Artificial Intelligence",Georgia Institute of Technology,
4,000004,000003,Caroline Steingruber,3Bears Foods,Caroline,Steingruber,,,,,...,,,,,,,,,,
5,000005,000003,Tim Nichols,3Bears Foods,Tim,Nichols,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3013,003137,001508,Sven Peper,Taxy.io,Sven,Peper,https://www.linkedin.com/in/sven-peper,,,,...,CEO & Co-Founder,Taxy.io,sven-peper,1275.0,1174.0,Ingenieurwissenschaften,RWTH Aachen University,,,
3014,003138,001508,Sven Weber,Taxy.io,Sven,Weber,http://www.linkedin.com/in/svenweber,https://images.crunchbase.com/image/upload/t_c...,,,...,Managing Principal,Knightsbridge Advisers LLC,svenweber,5489.0,5477.0,"Master, Physics",Universität Heidelberg,,,
3015,003139,001509,Dr. Gennadi Schechtmann,TRINKKOST GmbH,Dr.,Gennadi Schechtmann,,,,,...,,,,,,,,,,
3016,003140,001509,Timon Ortloff,TRINKKOST GmbH,Timon,Ortloff,,,,,...,,,,,,,,,,


In [40]:
companies
companies.to_csv("companies.csv", index=False)

In [41]:
funding
funding.to_csv("funding.csv", index=False)

In [42]:
investors
investors.to_csv("investors.csv", index=False)

## Next Steps
Explanation: