# Data Preprocessing - Pipeline
Explanation:

## Import Packages
pandas: A library for analyzing, cleaning, and manipulating structured data using DataFrames and Series. \
os: A module for file system operations like creating, deleting, or navigating files and directories. \
glob: A module to find file paths matching patterns (e.g., *.csv) for handling multiple files easily.

In [19]:
import pandas as pd
import os
from glob import glob

## Companies Preprocessing
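Explanation: Each companies CSV is loaded and its unused columns are dropped. Missing founder counts are filled with the mean of that file, rows without an employee range are removed, and the remaining employee ranges are converted to their midpoint. Industry fields are split into lists, date columns are parsed, and the IPO status is encoded as 1 (Public) or 0 (Private). Finally all files are concatenated and every company receives a zero-padded six-digit `ID`.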

In [20]:
# Columns of the raw companies export that the pipeline does not use.
COMPANY_DROP_COLUMNS = [
    'Total Funding Amount', 'Organization Name URL', 'Operating Status', 'Stage',
    'Headquarters Regions', 'Closed Date', 'Closed Date Precision', 'Company Type',
    'Announced Date Precision', 'Price', 'Price Currency', 'Price (in USD)',
    'Acquisition Terms', 'Money Raised at IPO', 'Money Raised at IPO Currency',
    'Money Raised at IPO (in USD)', 'Valuation at IPO', 'Valuation at IPO Currency',
    'Valuation at IPO (in USD)', 'Stock Symbol', 'Stock Symbol URL', 'Stock Exchange',
    'Founded Date Precision', 'Exit Date Precision', 'Number of Events', 'IPO Date',
    'Transaction Name', 'Postal Code', 'Transaction Name URL', 'Estimated Revenue Range',
    'Headquarters Location', 'Last Funding Amount Currency',
    'Last Equity Funding Amount Currency', 'Total Equity Funding Amount Currency',
    'Total Funding Amount Currency', 'Last Funding Amount (in USD)',
    'Last Equity Funding Amount', 'Last Equity Funding Amount (in USD)',
    'Last Equity Funding Type', 'Total Equity Funding Amount',
    'Total Equity Funding Amount (in USD)', 'Funding Status', 'Acquired by URL',
    'Acquisition Type',
]


def _employee_range_to_mean(value):
    """Convert an employee-count bucket into a single number.

    A closed bucket such as ``"11-50"`` becomes its midpoint (``30.5``). An
    open-ended bucket such as ``"10001+"`` has no upper bound, so its lower
    bound is returned instead of failing on the missing ``-``.

    Raises:
        ValueError: If the value is neither ``"<low>-<high>"`` nor ``"<low>+"``.
    """
    text = str(value).strip()
    if text.endswith('+'):
        return float(int(text[:-1]))
    low, high = map(int, text.split('-'))
    return (low + high) / 2


def preprocess_companies(companies_files):
    """Load, clean and merge the company CSV exports.

    Each file is cleaned on its own (so the founder-count mean used for
    imputation is computed per file) and the cleaned frames are concatenated.

    Args:
        companies_files: Iterable of paths to company CSV files.

    Returns:
        A DataFrame with one row per company and a zero-padded six-digit
        ``ID`` as its first column, numbered in concatenation order.

    Raises:
        ValueError: If ``companies_files`` yields no files, or an employee
            bucket cannot be parsed.
        KeyError: If a file lacks one of the columns listed in
            ``COMPANY_DROP_COLUMNS`` or one of the columns cleaned below.
    """
    all_companies = []

    for file in companies_files:
        companies_df = pd.read_csv(file)
        companies_df = companies_df.drop(columns=COMPANY_DROP_COLUMNS)

        # The mean is taken before rows without an employee range are removed.
        mean_founders = companies_df['Number of Founders'].mean()
        companies_df['Number of Founders'] = companies_df['Number of Founders'].fillna(mean_founders)

        # .copy() makes the filtered frame independent, so the column
        # assignments below never write into a view of the original.
        companies_df = companies_df.dropna(subset=['Number of Employees']).copy()

        for col in ['Industries', 'Industry Groups']:
            companies_df[col] = companies_df[col].fillna('').apply(
                lambda x: [item.strip() for item in x.split(',')]
            )

        companies_df['Number of Employees'] = companies_df['Number of Employees'].apply(
            _employee_range_to_mean
        )

        if 'Announced Date' in companies_df.columns:
            companies_df = companies_df.rename(
                columns={'Announced Date': 'Announced Date Acquisition'}
            )

        # 'Announced Date Acquisition' only exists when the rename above
        # happened, so absent date columns are skipped rather than looked up.
        for date_col in ['Founded Date', 'Exit Date', 'Last Funding Date', 'Announced Date Acquisition']:
            if date_col in companies_df.columns:
                companies_df[date_col] = pd.to_datetime(companies_df[date_col])

        # Any status other than 'Public'/'Private' maps to NaN.
        companies_df['IPO'] = companies_df['IPO Status'].map({'Public': 1, 'Private': 0})
        companies_df = companies_df.drop(columns=['IPO Status'])

        for column in ['Number of Lead Investors', 'Number of Investors', 'Number of Acquisitions']:
            if column in companies_df.columns:
                companies_df[column] = companies_df[column].fillna(0)

        all_companies.append(companies_df)

    if not all_companies:
        raise ValueError("No company files to process: 'companies_files' is empty")

    merged_companies = pd.concat(all_companies, ignore_index=True)

    merged_companies['ID'] = (merged_companies.index + 1).astype(str).str.zfill(6)

    # Move the ID to the front, keeping the remaining column order.
    merged_companies = merged_companies[['ID'] + [col for col in merged_companies.columns if col != 'ID']]

    return merged_companies

## Funding Preprocessing
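Explanation: All funding CSVs are concatenated and their unused columns are dropped. Each funding round is linked to its company through the organization name; rounds whose company is not in the companies table get the placeholder Company ID `000000`. Investor names are split into lists, the announcement date is parsed, and every round receives a zero-padded six-digit `Funding ID`.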

In [21]:
# Columns of the raw funding export that the pipeline does not use.
FUNDING_DROP_COLUMNS = [
    'Transaction Name', 'Transaction Name URL', 'Organization Name URL', 'Money Raised Currency',
    'Pre-Money Valuation Currency', 'Diversity Spotlight', 'Organization Location', 'Money Raised (in USD)',
    'Pre-Money Valuation (in USD)', 'Organization Description', 'Organization Industries',
    'Organization Website', 'Organization Revenue Range', 'Number of Partner Investors',
    'CB Rank (Funding Round)', 'Total Funding Amount Currency', 'Total Funding Amount (in USD)',
    'Funding Status', 'Equity Only Funding', 'Pre-Money Valuation', 'Total Funding Amount',
    'Number of Funding Rounds', 'Funding Stage',
]


def preprocess_funding(funding_files, companies_df):
    """Load the funding CSV exports and link every round to a company ID.

    Args:
        funding_files: Iterable of paths to funding-round CSV files.
        companies_df: Companies table with at least the columns
            ``'Organization Name'`` and ``'ID'``.

    Returns:
        A DataFrame with one row per funding round. ``'Funding ID'`` (six
        digits, zero-padded, numbered in row order) is the first column and
        ``'Company ID'`` holds the matching company ID, or ``'000000'`` when
        the organization is not present in ``companies_df``.

    Raises:
        ValueError: If ``funding_files`` is empty (raised by ``pd.concat``).
        KeyError: If an expected column is missing.
    """
    all_funding = pd.concat([pd.read_csv(file) for file in funding_files], ignore_index=True)
    all_funding = all_funding.drop(columns=FUNDING_DROP_COLUMNS)

    # A left merge repeats a funding row once per matching company row, so a
    # name that occurs several times in companies_df would duplicate rounds.
    # Keep one ID per name (the first occurrence) to leave the row count as is.
    company_ids = companies_df[['Organization Name', 'ID']].drop_duplicates(
        subset='Organization Name', keep='first'
    )
    all_funding = all_funding.merge(company_ids, on='Organization Name', how='left')
    all_funding = all_funding.rename(columns={'ID': 'Company ID'})
    all_funding['Company ID'] = all_funding['Company ID'].fillna('000000').astype(str).str.zfill(6)

    # A missing investor list becomes [''] (one empty name), not an empty list.
    all_funding['Investor Names'] = all_funding['Investor Names'].fillna('').apply(
        lambda x: [item.strip() for item in x.split(',')]
    )
    all_funding['Announced Date'] = pd.to_datetime(all_funding['Announced Date'])

    all_funding = all_funding.reset_index(drop=True)
    all_funding['Funding ID'] = (all_funding.index + 1).astype(str).str.zfill(6)

    # Move the Funding ID to the front, keeping the remaining column order.
    all_funding = all_funding[['Funding ID'] + [col for col in all_funding.columns if col != 'Funding ID']]

    return all_funding

## Investors Preprocessing
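Explanation: Each investors CSV is loaded and tagged with an upper-cased country code parsed from its file name (the text after the first hyphen). The files are then concatenated, unused columns are dropped, and every investor receives a zero-padded six-digit `Investor ID`.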

In [22]:
def preprocess_investors(investor_files):
    """Combine the investor CSV exports into one table with sequential IDs.

    Every file contributes its rows plus a ``'Country'`` column whose value is
    parsed from the file name: the text between the first ``-`` and the next
    ``.``, upper-cased.

    Args:
        investor_files: Iterable of paths to investor CSV files.

    Returns:
        A DataFrame whose first column ``'Investor ID'`` is a zero-padded
        six-digit string numbered from 1 in concatenation order.
    """
    def load_with_country(path):
        # Read first, then tag: the frame gets the country of its source file.
        frame = pd.read_csv(path)
        file_name = os.path.basename(path)
        frame['Country'] = file_name.split('-')[1].split('.')[0].upper()
        return frame

    investors = pd.concat(
        [load_with_country(path) for path in investor_files],
        ignore_index=True,
    )
    investors = investors.drop(
        columns=[
            'Organization/Person Name URL',
            'Investment Stage',
            'Regions',
            'Founded Date Precision',
        ]
    )

    # Number the rows 1..n and render each number as six zero-padded digits.
    sequence = pd.Series(range(1, len(investors) + 1), index=investors.index)
    investors['Investor ID'] = sequence.map(lambda number: str(number).zfill(6))

    remaining = [name for name in investors.columns if name != 'Investor ID']
    return investors[['Investor ID', *remaining]]

## Founders Preprocessing

In [23]:
def process_founders(companies, people):
    """Build a founders table from ``companies`` and enrich it from ``people``.

    The comma-separated ``'Founders'`` field of every company is split into
    one row per founder. A founder is enriched with profile data only when
    exactly one row of ``people`` has the same first and last name; founders
    with several candidates are left unfilled and reported as duplicates.

    Side effects: prints the resulting table and the duplicate report, and
    writes the table to ``founders_with_links.csv`` in the working directory.

    Args:
        companies: DataFrame with the columns ``'ID'``, ``'Founders'`` and
            ``'Organization Name'``.
        people: DataFrame containing all columns in ``required_columns``.

    Returns:
        DataFrame with one row per founder. Enrichment columns hold ``None``
        for founders without a unique match.

    Raises:
        ValueError: If ``people`` lacks any required column.
    """
    required_columns = ['first_name', 'last_name', 'linkedin_url', 'logo_url', 'facebook_url',
                        'twitter_url', 'city', 'region', 'country_code', 'featured_job_title',
                        'featured_job_organization_name']
    missing_columns = [col for col in required_columns if col not in people.columns]
    if missing_columns:
        raise ValueError(f"Missing columns in 'people': {missing_columns}")

    # Work on a copy of the three needed columns so `companies` is untouched.
    temp_companies = companies[['ID', 'Founders', 'Organization Name']].copy()
    temp_companies['Founders'] = temp_companies['Founders'].fillna('').astype(str).apply(
        lambda x: [item.strip() for item in x.split(',') if item.strip()]
    )

    # One row per founder; a company without founders yields one NaN row.
    founders = temp_companies.explode('Founders', ignore_index=True).rename(
        columns={'ID': 'Company ID', 'Founders': 'Founder Name'}
    )

    # IDs are assigned before the NaN rows are removed, so they follow the
    # exploded row positions and may contain gaps.
    founders['Founder ID'] = (founders.index + 1).astype(str).str.zfill(6)

    founders = founders[['Founder ID', 'Company ID', 'Founder Name', 'Organization Name']]
    # .copy() so the column assignments below do not write into a view.
    founders = founders.dropna(subset=['Founder Name']).copy()

    # expand=True yields fewer than two columns when no name contains a space
    # (or when there are no founders at all); reindex guarantees both exist.
    name_parts = founders['Founder Name'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
    founders['first_name'] = name_parts[0]
    founders['last_name'] = name_parts[1]

    string_columns = ['logo_url', 'facebook_url', 'twitter_url', 'city', 'region',
                      'country_code', 'featured_job_title', 'featured_job_organization_name']
    enrichment_columns = ['linkedin_url'] + string_columns
    for col in enrichment_columns:
        # Plain None placeholders: casting to str here would store the literal
        # text 'None' for every founder that is never matched.
        founders[col] = None

    # Index `people` by (first_name, last_name) once instead of scanning the
    # whole table for every founder. Rows with a missing name part can never
    # equal a founder's name, so they are left out.
    known_people = people.dropna(subset=['first_name', 'last_name'])
    positions_by_name = {}
    for position, name_key in enumerate(zip(known_people['first_name'], known_people['last_name'])):
        positions_by_name.setdefault(name_key, []).append(position)

    duplicates = []

    for idx, founder_id, first_name, last_name in zip(
        founders.index, founders['Founder ID'], founders['first_name'], founders['last_name']
    ):
        if pd.isna(first_name) or pd.isna(last_name):
            continue
        positions = positions_by_name.get((first_name, last_name))
        if not positions:
            continue

        matches = known_people.iloc[positions]
        if len(positions) == 1:
            match = matches.iloc[0]
            for col in enrichment_columns:
                founders.at[idx, col] = match[col]
        else:
            # Ambiguous name: leave the founder unfilled and report it.
            duplicates.append({
                'founder_id': founder_id,
                'first_name': first_name,
                'last_name': last_name,
                'matches': matches.to_dict('records')
            })

    print("\nResults with possible duplicates:")
    print(founders)

    print("\nList of Duplicates:")
    for duplicate in duplicates:
        print(f"Founder ID: {duplicate['founder_id']}, Name: {duplicate['first_name']} {duplicate['last_name']}")
        print("Matches:")
        for match in duplicate['matches']:
            print(f"  - LinkedIn: {match['linkedin_url']}")

    founders.to_csv("founders_with_links.csv", index=False)
    print("\nFile saved successfully: founders_with_links.csv")

    return founders

## Pipeline Execution
Explanation:

In [24]:
# Root folder of the raw exports; each entity lives in its own sub-folder.
DATASETS_DIR = '/Users/janlinzner/Library/Mobile Documents/com~apple~CloudDocs/Documents/McGill/1_Lectures/Decision Analytics (MGSC 662)/2_Coding/Exercises/ADA_Coding/Datasets'

# glob returns matches in arbitrary order. The preprocessing functions number
# rows in concatenation order, so the file lists are sorted to keep the
# generated IDs the same from run to run.
companies_files = sorted(glob(os.path.join(DATASETS_DIR, 'Companies', '*.csv')))
funding_files = sorted(glob(os.path.join(DATASETS_DIR, 'Funding', '*.csv')))
investor_files = sorted(glob(os.path.join(DATASETS_DIR, 'Investors', '*.csv')))

companies = preprocess_companies(companies_files)
# Funding rounds are linked to the company IDs created in the previous step.
funding = preprocess_funding(funding_files, companies)
investors = preprocess_investors(investor_files)

people = pd.read_csv(os.path.join(DATASETS_DIR, 'People', 'people_CB.csv'))

founders = process_founders(companies, people)


Results with possible duplicates:
     Founder ID Company ID             Founder Name Organization Name  \
0        000001     000001         Johannes Stoffel             2trde   
1        000002     000002          Florian Ziesche     36ZERO Vision   
2        000003     000002          Zeeshan Karamat     36ZERO Vision   
3        000004     000003     Caroline Steingruber      3Bears Foods   
4        000005     000003              Tim Nichols      3Bears Foods   
...         ...        ...                      ...               ...   
3136     003137     001508               Sven Peper           Taxy.io   
3137     003138     001508               Sven Weber           Taxy.io   
3138     003139     001509  Dr. Gennadi Schechtmann    TRINKKOST GmbH   
3139     003140     001509            Timon Ortloff    TRINKKOST GmbH   
3140     003141     001510           Tobias Kollewe   worqs Coworking   

     first_name            last_name  \
0      Johannes              Stoffel   
1       

## Validation

In [25]:
# Write the cleaned companies table to disk, then leave the frame as the
# cell's last expression: a notebook only renders the final expression, so a
# bare `companies` placed before the to_csv call displayed nothing.
companies.to_csv("companies.csv", index=False)
companies

In [26]:
# Write the cleaned funding table to disk, then leave the frame as the cell's
# last expression: a notebook only renders the final expression, so a bare
# `funding` placed before the to_csv call displayed nothing.
funding.to_csv("funding.csv", index=False)
funding

In [27]:
# Write the cleaned investors table to disk, then leave the frame as the
# cell's last expression: a notebook only renders the final expression, so a
# bare `investors` placed before the to_csv call displayed nothing.
investors.to_csv("investors.csv", index=False)
investors

## Next Steps
Explanation: