# Data Preprocessing - Pipeline
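Explanation: This notebook loads the raw company, funding, and investor CSV exports, cleans each table, and assigns zero-padded six-digit IDs to the rows of each table. Funding rounds are additionally linked to their company through a `Company ID` column.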

## Import Packages
pandas: A library for analyzing, cleaning, and manipulating structured data using DataFrames and Series. \
os: A module for file system operations like creating, deleting, or navigating files and directories. \
glob: A module to find file paths matching patterns (e.g., *.csv) for handling multiple files easily.

In [1]:
import pandas as pd
import os
from glob import glob

## Companies Preprocessing
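Explanation: Each company CSV is loaded and stripped of unused columns. Missing founder counts are filled with the file's mean, rows without an employee range are dropped, and employee ranges are replaced by their midpoint. Industry fields are split into lists, date columns are parsed, and the IPO status is encoded as 1 (Public) or 0 (Private). All files are then concatenated and every company receives a six-digit `ID`.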

In [2]:
import pandas as pd

def _employee_range_to_mean(value):
    """Convert an employee-count label to a single number.

    A range such as ``'11-50'`` becomes the midpoint of its two bounds.
    A label without a hyphen (for example an open-ended ``'10001+'``) becomes
    its numeric lower bound, because there is no upper bound to average.
    """
    text = str(value).strip()
    if '-' in text:
        low, high = map(int, text.split('-'))
        return (low + high) / 2
    return float(text.rstrip('+'))


def preprocess_companies(companies_files):
    """Load, clean and merge company CSV exports into one DataFrame.

    For every file the function:
      * drops the unused columns listed in ``drop_columns`` (columns that a
        file does not contain are skipped),
      * fills missing 'Number of Founders' with that file's mean,
      * drops rows without 'Number of Employees' and replaces the remaining
        range labels by a number (see ``_employee_range_to_mean``),
      * splits 'Industries' and 'Industry Groups' on commas into lists
        (a missing value becomes an empty list),
      * renames 'Announced Date' to 'Announced Date Acquisition' and parses
        the date columns that are present,
      * encodes 'IPO Status' as 'IPO' (Public -> 1, Private -> 0, any other
        value -> NaN),
      * fills missing investor / acquisition counts with 0.

    The cleaned frames are concatenated and each row receives a six-digit,
    zero-padded string 'ID' (starting at '000001') placed as first column.

    Args:
        companies_files: Iterable of paths to company CSV files.

    Returns:
        pandas.DataFrame with all companies, 'ID' first.

    Raises:
        ValueError: If ``companies_files`` is empty, or an employee label
            cannot be parsed as a number or range.
        KeyError: If a file lacks one of the required columns
            'Number of Founders', 'Number of Employees', 'Industries',
            'Industry Groups' or 'IPO Status'.
    """
    companies_files = list(companies_files)
    if not companies_files:
        raise ValueError('companies_files is empty: no company CSV files to preprocess')

    # Built once; the list does not depend on the file being processed.
    drop_columns = [
        'Organization Name URL', 'Operating Status', 'Stage', 'Headquarters Regions', 'Closed Date',
        'Closed Date Precision', 'Company Type', 'Announced Date Precision', 'Price', 'Price Currency',
        'Price (in USD)', 'Acquisition Terms', 'Money Raised at IPO', 'Money Raised at IPO Currency',
        'Money Raised at IPO (in USD)', 'Valuation at IPO', 'Valuation at IPO Currency',
        'Valuation at IPO (in USD)', 'Stock Symbol', 'Stock Symbol URL', 'Stock Exchange',
        'Founded Date Precision', 'Exit Date Precision', 'Number of Events', 'IPO Date',
        'Transaction Name', 'Postal Code', 'Transaction Name URL', 'Estimated Revenue Range',
        'Headquarters Location', 'Last Funding Amount Currency', 'Last Equity Funding Amount Currency',
        'Total Equity Funding Amount Currency', 'Total Funding Amount Currency',
        'Last Funding Amount (in USD)', 'Last Equity Funding Amount',
        'Last Equity Funding Amount (in USD)', 'Last Equity Funding Type',
        'Total Equity Funding Amount', 'Total Equity Funding Amount (in USD)', 'Funding Status',
        'Acquired by URL', 'Acquisition Type',
    ]
    date_columns = ['Founded Date', 'Exit Date', 'Last Funding Date', 'Announced Date Acquisition']
    count_columns = ['Number of Lead Investors', 'Number of Investors', 'Number of Acquisitions']

    all_companies = []

    for file in companies_files:
        companies_df = pd.read_csv(file)
        # errors='ignore': tolerate files that lack some of the unused columns,
        # in line with the optional-column checks further down.
        companies_df = companies_df.drop(columns=drop_columns, errors='ignore')

        # The mean is taken before rows are dropped, i.e. over the whole file.
        mean_founders = companies_df['Number of Founders'].mean()
        companies_df['Number of Founders'] = companies_df['Number of Founders'].fillna(mean_founders)
        # .copy() so the column assignments below act on an independent frame.
        companies_df = companies_df.dropna(subset=['Number of Employees']).copy()

        for col in ['Industries', 'Industry Groups']:
            # Skip blank items so a missing value yields [] rather than [''].
            companies_df[col] = companies_df[col].fillna('').apply(
                lambda x: [item.strip() for item in x.split(',') if item.strip()]
            )

        companies_df['Number of Employees'] = companies_df['Number of Employees'].apply(
            _employee_range_to_mean
        )

        if 'Announced Date' in companies_df.columns:
            companies_df = companies_df.rename(columns={'Announced Date': 'Announced Date Acquisition'})

        # Only parse the date columns this file actually has; the renamed
        # acquisition date in particular is optional.
        for date_col in date_columns:
            if date_col in companies_df.columns:
                companies_df[date_col] = pd.to_datetime(companies_df[date_col])

        companies_df['IPO'] = companies_df['IPO Status'].map({'Public': 1, 'Private': 0})
        companies_df = companies_df.drop(columns=['IPO Status'])

        # Replace NA with 0 for specific columns
        for column in count_columns:
            if column in companies_df.columns:
                companies_df[column] = companies_df[column].fillna(0)

        all_companies.append(companies_df)

    merged_companies = pd.concat(all_companies, ignore_index=True)

    # IDs follow the concatenation order: row 0 -> '000001'.
    merged_companies['ID'] = (merged_companies.index + 1).astype(str).str.zfill(6)

    merged_companies = merged_companies[['ID'] + [col for col in merged_companies.columns if col != 'ID']]

    return merged_companies

## Funding Preprocessing
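Explanation: All funding CSVs are concatenated and stripped of unused columns. Each funding round is matched to a company by `Organization Name` to obtain its `Company ID`; rounds without a matching company get the placeholder `000000`. Investor names are split into lists, the announcement date is parsed, and every round receives a six-digit `Funding ID`.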

In [3]:
def preprocess_funding(funding_files, companies_df):
    """Load, clean and merge funding-round CSV exports and link them to companies.

    All files are concatenated, the unused columns listed in ``drop_columns``
    are removed (columns not present are skipped), and each round is matched
    to a company by 'Organization Name' to obtain its 'Company ID'. Rounds
    without a matching company get the placeholder '000000'. 'Investor Names'
    is split on commas into a list (a missing value becomes an empty list) and
    'Announced Date' is parsed as a datetime. Each round finally receives a
    six-digit, zero-padded string 'Funding ID' placed as first column.

    Args:
        funding_files: Iterable of paths to funding CSV files.
        companies_df: DataFrame with at least the columns 'Organization Name'
            and 'ID' (as produced by ``preprocess_companies``).

    Returns:
        pandas.DataFrame with one row per funding round, 'Funding ID' first.

    Raises:
        ValueError: If ``funding_files`` is empty.
        KeyError: If a required column ('Organization Name', 'Investor Names',
            'Announced Date', or 'ID' in ``companies_df``) is missing.
    """
    funding_files = list(funding_files)
    if not funding_files:
        raise ValueError('funding_files is empty: no funding CSV files to preprocess')

    all_funding = pd.concat([pd.read_csv(file) for file in funding_files], ignore_index=True)

    drop_columns = [
        'Transaction Name', 'Transaction Name URL', 'Organization Name URL', 'Money Raised Currency',
        'Pre-Money Valuation Currency', 'Diversity Spotlight', 'Organization Location', 'Money Raised (in USD)',
        'Pre-Money Valuation (in USD)', 'Organization Description', 'Organization Industries',
        'Organization Website', 'Organization Revenue Range', 'Number of Partner Investors',
        'CB Rank (Funding Round)', 'Total Funding Amount Currency', 'Total Funding Amount (in USD)',
        'Funding Status', 'Equity Only Funding', 'Pre-Money Valuation', 'Total Funding Amount',
        'Number of Funding Rounds', 'Funding Stage'
    ]
    # errors='ignore': an export that lacks some unused columns is still usable.
    all_funding = all_funding.drop(columns=drop_columns, errors='ignore')

    # Keep one ID per company name. Without this, a name that occurs several
    # times in companies_df would duplicate every matching funding round in
    # the left merge below.
    company_ids = companies_df[['Organization Name', 'ID']].drop_duplicates(
        subset='Organization Name', keep='first'
    )
    all_funding = all_funding.merge(company_ids, on='Organization Name', how='left')
    all_funding = all_funding.rename(columns={'ID': 'Company ID'})
    # '000000' marks rounds whose organization is not in companies_df.
    all_funding['Company ID'] = all_funding['Company ID'].fillna('000000').astype(str).str.zfill(6)

    # Skip blank items so a missing value yields [] rather than [''].
    all_funding['Investor Names'] = all_funding['Investor Names'].fillna('').apply(
        lambda x: [item.strip() for item in x.split(',') if item.strip()]
    )
    all_funding['Announced Date'] = pd.to_datetime(all_funding['Announced Date'])

    all_funding = all_funding.reset_index(drop=True)
    # IDs follow the row order after the merge: row 0 -> '000001'.
    all_funding['Funding ID'] = (all_funding.index + 1).astype(str).str.zfill(6)

    all_funding = all_funding[['Funding ID'] + [col for col in all_funding.columns if col != 'Funding ID']]

    return all_funding

## Investors Preprocessing
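Explanation: Each investor CSV is loaded and tagged with a `Country` code taken from its file name (the part between the first hyphen and the extension, upper-cased). The files are concatenated, unused columns are dropped, and every investor receives a six-digit `Investor ID`.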

In [4]:
def preprocess_investors(investor_files):
    """Combine investor CSV exports into one table with country tags and IDs.

    Every file contributes its rows plus a 'Country' column. The country code
    is the upper-cased part of the file name between the first hyphen and the
    first following dot. After concatenation four unused columns are removed
    and each row gets a six-digit, zero-padded string 'Investor ID' (starting
    at '000001'), which becomes the first column.

    Args:
        investor_files: Iterable of paths to investor CSV files whose base
            names contain a hyphen followed by the country code.

    Returns:
        pandas.DataFrame with all investors, 'Investor ID' first.
    """
    unused_columns = [
        'Organization/Person Name URL',
        'Investment Stage',
        'Regions',
        'Founded Date Precision',
    ]

    frames = []
    for path in investor_files:
        frame = pd.read_csv(path)
        # Take the segment after the first '-' and cut it at the first '.'.
        after_hyphen = os.path.basename(path).split('-')[1]
        frame['Country'] = after_hyphen.split('.')[0].upper()
        frames.append(frame)

    investors = pd.concat(frames, ignore_index=True)
    investors = investors.drop(columns=unused_columns)

    # Sequential numbers 1..n, rendered as zero-padded six-character strings.
    numbering = pd.Series(range(1, len(investors) + 1), index=investors.index)
    investors['Investor ID'] = numbering.apply(lambda number: str(number).zfill(6))

    remaining = [name for name in investors.columns if name != 'Investor ID']
    return investors[['Investor ID'] + remaining]

## Pipeline Execution
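Explanation: The CSV files of each dataset folder are collected with `glob` and passed to the three preprocessing functions. The funding step receives the already processed companies table so that it can look up company IDs.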

In [5]:
# Root folder that holds the Companies/, Funding/ and Investors/ CSV exports.
# Adjust this single path when running the notebook on another machine.
DATASETS_DIR = '/Users/janlinzner/Library/Mobile Documents/com~apple~CloudDocs/Documents/McGill/1_Lectures/Decision Analytics (MGSC 662)/2_Coding/Exercises/ADA_Coding/Datasets'

# glob returns matches in arbitrary order, and the preprocessing functions
# assign IDs in the order the files are read, so the paths are sorted to keep
# the generated IDs reproducible between runs and machines.
companies_files = sorted(glob(os.path.join(DATASETS_DIR, 'Companies', '*.csv')))
funding_files = sorted(glob(os.path.join(DATASETS_DIR, 'Funding', '*.csv')))
investor_files = sorted(glob(os.path.join(DATASETS_DIR, 'Investors', '*.csv')))

companies = preprocess_companies(companies_files)
# Funding needs the processed companies to resolve company IDs by name.
funding = preprocess_funding(funding_files, companies)
investors = preprocess_investors(investor_files)

## Validation

In [6]:
# Display the preprocessed companies table as a visual sanity check.
companies

Unnamed: 0,ID,Organization Name,Industries,Description,CB Rank (Company),Founded Date,Exit Date,Website,Twitter,Facebook,...,Total Funding Amount,Total Funding Amount (in USD),Top 5 Investors,Number of Lead Investors,Number of Investors,Number of Acquisitions,Acquisition Status,Acquired by,Announced Date Acquisition,IPO
0,000001,2trde,"[Automotive, Software]",2trde develops a software solution designed fo...,57123,2017-01-01,NaT,https://www.2trde.com,,,...,7000000.0,7751500.0,"Plug and Play, Aster, Maniv Mobility, Adevinta...",3.0,4.0,0.0,,,NaT,0
1,000002,36ZERO Vision,"[Artificial Intelligence (AI), Computer Vision...",36ZEROVision is an AI-powered visual inspectio...,51326,2019-01-01,NaT,https://36zerovision.com/,,,...,3000000.0,3000000.0,"Alchemist Accelerator, Join Capital",1.0,2.0,0.0,,,NaT,0
2,000003,3Bears Foods,[Food and Beverage],3Bears Foods enable a balanced and delicious b...,275817,2015-01-01,NaT,https://3bears.de/,,https://www.facebook.com/3bearsporridge/,...,,,Freigeist Capital,1.0,1.0,0.0,,,NaT,0
3,000004,3dTrust,"[3D Printing, Manufacturing, Software]",3dTrust helps companies integrate 3D printing ...,134694,2015-01-01,NaT,http://3dtrust.de,https://twitter.com/3dTrust,,...,1000000.0,1119069.0,"Ace Capital Partners, Weare Aerospace",2.0,2.0,0.0,,,NaT,0
4,000005,abaut,"[Analytics, Artificial Intelligence (AI), Cons...",abaut builds a SaaS that enables businesses al...,219525,2017-07-21,NaT,https://abaut.de,,,...,,,,0.0,0.0,0.0,,,NaT,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,001506,SAYM,"[Apps, B2B, B2C, Human Resources, Mobile Apps,...",The SAYM platform for swarm mobility defines t...,166910,2019-01-01,NaT,https://www.saym.io/,https://twitter.com/saym_io,https://www.facebook.com/saymcommute/,...,161000.0,179020.0,"EXIST, Gründerstipendium.NRW",2.0,2.0,0.0,,,NaT,0
1506,001507,SONAH GmbH,"[Apps, Artificial Intelligence (AI), Computer ...",SONAH developed a flexible embedded vision sen...,121752,2016-01-01,NaT,http://www.sonah.tech,,,...,60000.0,71890.0,"AcceliCITY powered by Leading Cities, TK-Solut...",0.0,2.0,0.0,,,NaT,0
1507,001508,Taxy.io,"[FinTech, Legal Tech, Machine Learning, Software]",Taxy.io builds the leading platform for B2B ta...,220816,2019-01-01,NaT,https://www.taxy.io/,https://twitter.com/taxy_io,,...,,,"NRW.BANK, TechVision Fund, 42CAP, Carcharodon ...",2.0,4.0,0.0,,,NaT,0
1508,001509,TRINKKOST GmbH,"[Agriculture, Consumer Goods, Fitness, Food an...",TRINKKOST is a food supplement manufacturing c...,907817,2016-01-01,NaT,http://www.trinkkost.de,https://www.twitter.com/trinkkost,https://www.facebook.com/trinkkost/,...,,,"ProSiebenSat.1 Accelerator, Business Angels Ag...",1.0,2.0,0.0,,,NaT,0


In [7]:
# Display the preprocessed funding table as a visual sanity check.
funding

Unnamed: 0,Funding ID,Organization Name,Funding Type,Money Raised,Announced Date,Lead Investors,Number of Investors,Investor Names,Company ID
0,000001,4stop,Series A,2500000.0,2019-05-15,Ventech,1.0,[Ventech],000000
1,000002,aiconix GmbH,Convertible Note,,2019-06-17,,,[],001193
2,000003,aiconix GmbH,Pre-Seed,300000.0,2019-08-28,,,[],001193
3,000004,AutLay,Seed,,2019-05-22,Crew Ventures,1.0,[Crew Ventures],001196
4,000005,Buynomics,Pre-Seed,,2019-09-01,DvH Ventures,2.0,"[DvH Ventures, Tomahawk.VC]",001205
...,...,...,...,...,...,...,...,...,...
6863,006864,VAMOS.ai,Seed,,2020-01-01,DDG AG,1.0,[DDG AG],000000
6864,006865,WeProfit,Pre-Seed,272000.0,2021-05-28,,5.0,"[Ara Abrahamyan, Armen Kocharyan, Clemens Boll...",000298
6865,006866,WindStar Medical GmbH,Seed,,2020-12-07,Project A Ventures,1.0,[Project A Ventures],000000
6866,006867,WorkMentality Foundation,Seed,,2023-03-01,,1.0,[Deutsche Bank],000000


In [8]:
# Display the preprocessed investors table as a visual sanity check.
investors

Unnamed: 0,Investor ID,Organization/Person Name,Investor Type,Number of Investments,Number of Exits,Location,Description,Founded Date,Website,LinkedIn,Number of Portfolio Organizations,Number of Lead Investments,Number of Exits (IPO),Country
0,000001,Christian Edler,"Individual/Angel, Investment Partner",57,10.0,"Berlin, Berlin, Germany",Christian O. Edler is an investor and entrepre...,,http://www.christianedler.com,https://www.linkedin.com/in/christianedler/,50,,,GER
1,000002,Ralf Dummel,Individual/Angel,38,,"Hamburg, Hamburg, Germany","Ralf Dummel, born on 02.12.1966 in Bad Segeber...",,,,38,13.0,,GER
2,000003,Carsten Maschmeyer,"Individual/Angel, Investment Partner",36,3.0,"Hanover, Niedersachsen, Germany",Carsten Maschmeyer is a successful entrepreneu...,,,https://www.linkedin.com/in/carsten-maschmeyer/,34,8.0,,GER
3,000004,Kai Hansen,"Individual/Angel, Investment Partner",36,4.0,"Berlin, Berlin, Germany",business angel // entrepreneur // founder lief...,,,https://www.linkedin.com/in/hansenkai/en,25,,,GER
4,000005,Chris Schagen,Individual/Angel,33,4.0,"Berlin, Berlin, Germany",I help developer tech entrepreneurs beginning ...,,,https://www.linkedin.com/in/christianschagen/,29,,,GER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8209,008210,Manchester Technology Fund,Venture Capital,3,1.0,"Manchester, Manchester, United Kingdom","Manchester Technology Fund, a Manchester-based...",1999-01-01,http://www.mantechfund.com,,3,,,UK
8210,008211,British Smaller Technology Companies VCT 2,Venture Capital,3,1.0,"Leeds, Leeds, United Kingdom","British Smaller Technology Companies VCT 2, a ...",,,,3,,,UK
8211,008212,Elwin Capital Partners,Venture Capital,3,2.0,"London, England, United Kingdom",Elwin Capital Partners is out of business. It ...,2000-01-01,http://www.elwincapital.com/,,3,1.0,,UK
8212,008213,Puma Investments,Venture Capital,3,,"London, England, United Kingdom",Puma Investments is a investment firm that pro...,2012-01-01,https://www.pumainvestments.co.uk/,https://www.linkedin.com/company/puma-investme...,2,2.0,,UK


## Next Steps
Explanation: