Download dataset from:
https://www.kaggle.com/firmai/startup-founder-valuations-dataset

In [None]:
!gdown --id 1u0By4PjWybB8W8X_dcz92Ro3Aj6UzPQG

Downloading...
From: https://drive.google.com/uc?id=1u0By4PjWybB8W8X_dcz92Ro3Aj6UzPQG
To: /content/Startup_Valuation.zip
  0% 0.00/55.9k [00:00<?, ?B/s]100% 55.9k/55.9k [00:00<00:00, 45.9MB/s]


Unzip Downloaded Dataset

In [None]:
# -o overwrites existing files without prompting, so re-running the cell never blocks on input
!unzip -o Startup_Valuation.zip

Archive:  Startup_Valuation.zip
replace startup_founder_chars.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: startup_founder_chars.csv  
  inflating: startup_valuations.csv  


# Pipeline
1. Collect Dataset
2. Clean Data
3. Extract Features
4. Split Training and Test
5. Train
6. Evaluate
7. Create Pipeline

## 1. Collect Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
valuation_df = pd.read_csv('startup_valuations.csv', encoding='cp1252')

In [None]:
valuation_df.head()

Unnamed: 0,Full Name,Primary Company,Seed Valuation,A Valuation,B Valuation,Valuation Increase
0,Chad Hurley,YouTube,0,14000000.0,88400000.0,6.314285714
1,Gwyneth Paltrow,Goop Inc.,0,40000000.0,65000000.0,1.625
2,Jason Calacanis,Inside.com,"$7,000,000.00",11250000.0,0.0,#VALUE!
3,Tony Fadell,Nest Labs,"$2,700,000.00",49210000.0,288120000.0,5.854907539
4,Matt Mullenweg,Automattic,0,8650000.0,238590000.0,27.58265896


In [None]:
founder_df = pd.read_csv('startup_founder_chars.csv', encoding='cp1252')

In [None]:
founder_df.head()

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm


In [None]:
def collect_data(valuation_path='startup_valuations.csv',
                 founder_path='startup_founder_chars.csv',
                 encoding='cp1252'):
  """Load the two raw CSV files of the dataset.

  Args:
    valuation_path: path of the valuations CSV.
    founder_path: path of the founder-characteristics CSV.
    encoding: text encoding used to decode both files.

  Returns:
    A tuple ``(valuation_df, founder_df)`` of unmodified DataFrames.
  """
  valuation_df = pd.read_csv(valuation_path, encoding=encoding)
  founder_df = pd.read_csv(founder_path, encoding=encoding)
  return valuation_df, founder_df


## 2. Clean Data

In [None]:
valuation_df.head(5)

Unnamed: 0,Full Name,Primary Company,Seed Valuation,A Valuation,B Valuation,Valuation Increase
0,Chad Hurley,YouTube,0,14000000.0,88400000.0,6.314285714
1,Gwyneth Paltrow,Goop Inc.,0,40000000.0,65000000.0,1.625
2,Jason Calacanis,Inside.com,"$7,000,000.00",11250000.0,0.0,#VALUE!
3,Tony Fadell,Nest Labs,"$2,700,000.00",49210000.0,288120000.0,5.854907539
4,Matt Mullenweg,Automattic,0,8650000.0,238590000.0,27.58265896


In [None]:
valuation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1469 entries, 0 to 1468
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Full Name           1469 non-null   object
 1   Primary Company     1469 non-null   object
 2   Seed Valuation      1469 non-null   object
 3   A Valuation         1469 non-null   object
 4   B Valuation         1469 non-null   object
 5   Valuation Increase  1469 non-null   object
dtypes: object(6)
memory usage: 69.0+ KB


In [None]:
valuation_df['Seed Valuation']

0                    0
1                    0
2       $7,000,000.00 
3       $2,700,000.00 
4                    0
             ...      
1464                 0
1465                 0
1466                 0
1467                 0
1468                 0
Name: Seed Valuation, Length: 1469, dtype: object

In [None]:
# Preview the parsing: drop thousands separators, keep the leading run of digits, cast to float.
# The pattern is a raw string so that \d is not treated as an (invalid) string escape.
valuation_df['Seed Valuation'] \
            .str.replace(',', '') \
            .str.extract(r'(\d+)') \
            .astype(float)

Unnamed: 0,0
0,0.0
1,0.0
2,7000000.0
3,2700000.0
4,0.0
...,...
1464,0.0
1465,0.0
1466,0.0
1467,0.0


In [None]:
founder_df.sample(5)

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
203,Jim Fowler,"Owler, Inc.",1.0,0.0,University of Colorado Boulder,International Relations,BA,,,,0.0,7.0,0.0,0.0,0.0,1.0,0,0,https://www.crunchbase.com/person/jim-fowler,http://www.linkedin.com/pub/jim-fowler/0/34/193
282,Paul Doersch,Kespry Inc.,0.0,0.0,Stanford University,Computer Science,BS,,,,0.0,2.0,0.0,0.0,0.0,0.0,1,0,https://www.crunchbase.com/person/paul-doersch,https://www.linkedin.com/pub/paul-doersch/9/b9...
37,Noah Kraft,Doppler Labs,0.0,1.0,Brown University,International Relations,BA,University of California Los Angeles,,,1.0,7.0,0.0,1.0,0.0,0.0,0,1,https://www.crunchbase.com/person/noah-kraft,https://www.linkedin.com/in/noahkraft
176,Robbie Cape,98point6,1.0,0.0,Princeton University,Operations Research,BE,,,,0.0,0.0,0.0,0.0,0.0,0.0,0,1,https://www.crunchbase.com/person/robbie-cape,https://www.linkedin.com/pub/robbie-cape/0/81b...
267,Jonathan Mendez,Yieldbot,3.0,1.0,University of Southern California,,,,,,0.0,2.0,0.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/jonathan-mendez,http://www.linkedin.com/in/jonathanmendez


In [None]:
founder_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574 entries, 0 to 573
Data columns (total 20 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Full Name                                    574 non-null    object 
 1   Primary Company                              574 non-null    object 
 2   Previous startups?                           574 non-null    float64
 3   Consulting before start-up                   574 non-null    float64
 4   Standardized University                      505 non-null    object 
 5   Standardized Major                           444 non-null    object 
 6   Degree Type                                  432 non-null    object 
 7   Standardized Graduate Institution            253 non-null    object 
 8   Standardized Graduate Studies                258 non-null    object 
 9   Graduate Diploma                             258 non-null    object 
 10  Ev

In [None]:
founder_df.fillna('None')

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0,4.0,1.0,1.0,0,0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0,12.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0,4.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0,18.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0,15.0,1.0,0.0,0,0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,Cliff Moon,Opsee,1.0,0.0,University of Delaware,Computer Science,BS,,,,0,6.0,0.0,0.0,0,0,0,0,https://www.crunchbase.com/person/cliff-moon,http://www.linkedin.com/pub/cliff-moon/4/44b/283
570,Hesaam Esfandyarpour,Genapsys,1.0,0.0,Sharif University of Technology,Electrical Engineering,BS,Stanford University,Medicine,MS,1,6.0,0.0,0.0,0,0,0,0,https://www.crunchbase.com/person/hesaam-esfan...,https://www.linkedin.com/pub/hesaam-esfandyarp...
571,Marc Ruxin,TastemakerX,0.0,0.0,Hamilton College,History,BA,,Business,MBA,0,15.0,0.0,0.0,0,1,0,0,https://www.crunchbase.com/person/marc-ruxin,http://www.linkedin.com/in/marcruxin
572,Radu B. Rusu,Fyusion,0.0,0.0,,,,,Computer Science,PhD,1,10.0,0.0,0.0,0,0,0,0,https://www.crunchbase.com/person/radu-b--rusu,https://www.linkedin.com/in/radubogdanrusu


In [None]:
def _parse_money(series):
  """Convert a column of currency-like values (e.g. '$7,000,000.00 ') to floats.

  Columns that are already numeric are only cast to float. Otherwise thousands
  separators are removed and the first number (including a decimal part, if
  any) is extracted; values without any digit (e.g. '#VALUE!') become NaN.
  """
  if pd.api.types.is_numeric_dtype(series):
    return series.astype(float)
  return series.astype(str) \
               .str.replace(',', '', regex=False) \
               .str.extract(r'(\d+(?:\.\d+)?)', expand=False) \
               .astype(float)


def clean_data(valuation_df, founder_df):
  """Clean both raw tables and join them into one modelling table.

  Args:
    valuation_df: raw valuations table with 'Full Name', 'Primary Company',
      'Seed Valuation', 'A Valuation' and 'B Valuation' columns.
    founder_df: raw founder-characteristics table with 'Full Name',
      'Primary Company' and the education columns listed below.

  Returns:
    A new DataFrame (inputs are not modified): the inner join of the cleaned
    founder and valuation tables on ('Full Name', 'Primary Company'), with an
    added 'Valuation' column holding the largest of the three round valuations.
    Rows whose 'Valuation' is not strictly positive are excluded.
  """
  # clean valuation dataframe
  val_df = valuation_df.copy()
  round_cols = ['Seed Valuation', 'A Valuation', 'B Valuation']
  for round_col in round_cols:
    val_df[round_col] = _parse_money(val_df[round_col])

  # the target is the highest valuation reached in any round (NaN rounds are skipped)
  val_df['Valuation'] = val_df[round_cols].max(axis=1).astype(float)

  # exclude no valuation data (this also removes rows where every round is NaN)
  val_df = val_df[val_df['Valuation'] > 0]

  # clean founder dataframe
  found_df = founder_df.copy()

  # missing education info becomes the explicit category 'None' ...
  col = ['Standardized University', 'Standardized Major', 'Degree Type',
         'Standardized Graduate Institution', 'Standardized Graduate Studies',
         'Graduate Diploma']
  found_df[col] = found_df[col].fillna('None')
  # ... and every other missing value becomes 0
  found_df = found_df.fillna(0)

  # merge dataset
  clean_df = pd.merge(found_df, val_df, on=['Full Name', 'Primary Company'], how='inner')

  return clean_df

In [None]:
clean_df = clean_data(valuation_df, founder_df)

In [None]:
clean_df.head()

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn,Seed Valuation,A Valuation,B Valuation,Valuation Increase,Valuation
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711,0.0,14000000.0,88400000.0,6.314285714,88400000.0
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...,0.0,40000000.0,65000000.0,1.625,65000000.0
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis,7000000.0,11250000.0,0.0,#VALUE!,11250000.0
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380,2700000.0,49210000.0,288120000.0,5.854907539,288120000.0
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm,0.0,8650000.0,238590000.0,27.58265896,238590000.0


In [None]:
clean_df.isna().sum()

Full Name                                      0
Primary Company                                0
Previous startups?                             0
Consulting before start-up                     0
Standardized University                        0
Standardized Major                             0
Degree Type                                    0
Standardized Graduate Institution              0
Standardized Graduate Studies                  0
Graduate Diploma                               0
Ever served as TA/Teacher/Professor/Mentor?    0
Years of Employment                            0
Worked as product manager/director/head/VP?    0
Worked at Google?                              0
Worked at Microsoft?                           0
Worked in Sales?                               0
Stanford or Berkeley                           0
Ivy League                                     0
Crunchbase                                     0
LinkedIn                                       0
Seed Valuation      

## 3. Extract Features

In [None]:
clean_df.head()

Unnamed: 0,Full Name,Primary Company,Previous startups?,Consulting before start-up,Standardized University,Standardized Major,Degree Type,Standardized Graduate Institution,Standardized Graduate Studies,Graduate Diploma,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Crunchbase,LinkedIn,Seed Valuation,A Valuation,B Valuation,Valuation Increase,Valuation
0,Chad Hurley,YouTube,0.0,0.0,Indiana University of Pennsylvania,Design,,,,,0.0,4.0,1.0,1.0,0.0,0.0,0,0,https://www.crunchbase.com/person/chad-hurley,http://www.linkedin.com/profile/view?id=5711,0.0,14000000.0,88400000.0,6.314285714,88400000.0
1,Gwyneth Paltrow,Goop Inc.,0.0,0.0,,,,,,,0.0,12.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/gwyneth-paltrow,https://www.linkedin.com/in/gwyneth-paltrow-48...,0.0,40000000.0,65000000.0,1.625,65000000.0
2,Jason Calacanis,Inside.com,3.0,0.0,Fordham University,Psychology,BA,,,,0.0,4.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/jason-calacanis,http://www.linkedin.com/in/jasoncalacanis,7000000.0,11250000.0,0.0,#VALUE!,11250000.0
3,Tony Fadell,Nest Labs,1.0,0.0,University of Michigan,Computer Science,BS,,,,0.0,18.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/tony-fadell,http://www.linkedin.com/pub/tony-fadell/0/1/380,2700000.0,49210000.0,288120000.0,5.854907539,288120000.0
4,Matt Mullenweg,Automattic,1.0,0.0,University of Houston,,,,,,0.0,15.0,1.0,0.0,0.0,0.0,0,0,https://www.crunchbase.com/person/matt-mullenweg,http://www.linkedin.com/in/mattm,0.0,8650000.0,238590000.0,27.58265896,238590000.0


In [None]:
def extract_feature(df, is_training=True):
  """Turn a cleaned table into a numeric feature matrix and a label vector.

  Args:
    df: table containing at least 'Degree Type' and 'Graduate Diploma'.
    is_training: when True, the identifier, free-text and valuation columns
      are dropped (they must all be present); when False nothing is dropped.

  Returns:
    ``(features, labels)``: ``features`` has the two categorical columns
    replaced by one-hot columns and every column coerced to numeric
    (unparseable values become NaN); ``labels`` is the 'Valuation' column if
    the input has one, otherwise an empty list.
  """
  unused_cols = ['Full Name', 'Primary Company', 'Crunchbase', 'LinkedIn',
                 'Standardized University', 'Standardized Major',
                 'Standardized Graduate Institution', 'Standardized Graduate Studies',
                 'Seed Valuation', 'A Valuation', 'B Valuation', 'Valuation Increase',
                 'Valuation']
  onehot_cols = ['Degree Type', 'Graduate Diploma']

  features = df.copy()

  # take the label before 'Valuation' is removed from the features
  label_df = features['Valuation'] if 'Valuation' in features.columns else []

  if is_training:
    features = features.drop(columns=unused_cols)

  # one-hot encode the categorical columns, append the result, then drop the originals
  encoded = pd.get_dummies(features[onehot_cols])
  features = pd.concat([features, encoded], axis=1).drop(columns=onehot_cols)

  # force every remaining column to a numeric dtype
  for name in features.columns:
    features[name] = pd.to_numeric(features[name], errors='coerce')

  return features, label_df

In [None]:
feat_df, label_df = extract_feature(clean_df)

In [None]:
feat_df

Unnamed: 0,Previous startups?,Consulting before start-up,Ever served as TA/Teacher/Professor/Mentor?,Years of Employment,Worked as product manager/director/head/VP?,Worked at Google?,Worked at Microsoft?,Worked in Sales?,Stanford or Berkeley,Ivy League,Degree Type_AA,Degree Type_BA,Degree Type_BE,Degree Type_BS,Degree Type_Drop,Degree Type_JD,Degree Type_LLB,Degree Type_MA,Degree Type_MS,Degree Type_None,Degree Type_PhD,Graduate Diploma_AAS,Graduate Diploma_Associate Instructor,Graduate Diploma_BA,Graduate Diploma_BPhil,Graduate Diploma_BSc,Graduate Diploma_Executive Education,Graduate Diploma_JD,Graduate Diploma_MA,Graduate Diploma_MBA,Graduate Diploma_MBI,Graduate Diploma_MD,Graduate Diploma_MEng,Graduate Diploma_MS,Graduate Diploma_None,Graduate Diploma_PhD
0,0.0,0.0,0.0,4.0,1.0,1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0.0,0.0,0.0,12.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,3.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,1.0,0.0,0.0,18.0,1.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,1.0,0.0,0.0,15.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
553,1.0,0.0,1.0,6.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
554,0.0,0.0,0.0,15.0,0.0,0.0,0.0,1.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
555,0.0,0.0,1.0,10.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


## 4. Train & test split

In [None]:
len(feat_df)

557

In [None]:
from sklearn.model_selection import train_test_split
train_feat, test_feat, train_label, test_label = \
          train_test_split(feat_df, label_df, test_size=0.2, random_state=42)

In [None]:
len(train_feat)

445

In [None]:
len(test_feat)

112

In [None]:
def split_train_test(feat, label, test_size=0.2, random_state=42):
  """Split features and labels into training and test parts.

  Args:
    feat: feature matrix.
    label: labels, one per row of ``feat``.
    test_size: forwarded to ``train_test_split`` (default 0.2).
    random_state: seed forwarded to ``train_test_split`` so the split is
      reproducible (default 42).

  Returns:
    ``(train_feat, test_feat, train_label, test_label)``.
  """
  train_feat, test_feat, train_label, test_label = \
          train_test_split(feat, label, test_size=test_size, random_state=random_state)
  return train_feat, test_feat, train_label, test_label

In [None]:
train_feat, test_feat, train_label, test_label = \
        split_train_test(feat_df, label_df)

## 5. Train

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn import tree

In [None]:
def train_model(feat, label):
  """Fit a gradient-boosting regressor on the given features and labels.

  Alternatives tried during experimentation (kept here for reference only):
  ``RandomForestRegressor(max_depth=1000, random_state=2020)`` and
  ``tree.DecisionTreeRegressor(max_depth=1000, random_state=2020)``.

  Args:
    feat: training feature matrix.
    label: training targets.

  Returns:
    The fitted ``GradientBoostingRegressor``.
  """
  # fixed seed so repeated runs produce the same model
  regressor = GradientBoostingRegressor(random_state=2020)
  # fit() returns the estimator itself
  return regressor.fit(feat, label)

In [None]:
model = train_model(train_feat, train_label)

## 6. Evaluation

In [None]:
model

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=2020, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
predict = model.predict(test_feat)

In [None]:
predict

array([ 7.02697040e+07,  1.12988431e+07,  1.56214224e+08,  7.62648328e+07,
        4.76868218e+07,  6.69279389e+07,  9.27093964e+07,  1.22025681e+08,
        4.19361058e+07,  1.53451952e+08,  1.55157584e+08,  3.79126240e+07,
        6.63691632e+07,  1.00502579e+08,  5.12786729e+07,  4.98649334e+07,
        7.51127936e+07,  1.30362163e+08,  3.53954914e+07,  9.11702650e+07,
        1.48524183e+08,  1.39320585e+08,  1.17721430e+08,  1.79701870e+08,
        2.10134061e+07,  7.62431383e+07,  3.91835107e+07,  2.44304116e+08,
        8.70372414e+07,  8.00152509e+07, -1.24669230e+07,  1.05599619e+08,
        7.03178687e+07,  1.02086719e+08,  6.02091933e+07,  2.28184352e+08,
        1.24100096e+08,  6.82711962e+07,  4.03066389e+07,  1.56310470e+08,
        8.53023950e+07,  1.46281877e+08,  7.91614955e+07,  1.26242810e+08,
        4.26612764e+07,  7.38760510e+07,  4.97858981e+07,  1.40956854e+08,
        2.26855521e+08,  1.16642829e+08,  7.35351396e+07,  2.70025871e+08,
        1.12884074e+08,  

In [None]:
def eval_acc(prediction, actual, rtol=0.0, atol=0.0):
  """Fraction of predictions that match the actual values.

  With the default tolerances a prediction counts only when it is exactly
  equal to the actual value, which is rarely true for a regressor. Pass
  ``rtol`` and/or ``atol`` to count predictions that are "close enough":
  ``|prediction - actual| <= atol + rtol * |actual|``.

  Args:
    prediction: predicted values (array-like).
    actual: reference values (array-like of the same shape); compared by
      position, any pandas index is ignored.
    rtol: relative tolerance, as a fraction of ``actual``.
    atol: absolute tolerance.

  Returns:
    A float in [0, 1].

  Raises:
    ValueError: if the inputs are empty or their shapes differ.
  """
  pred = np.asarray(prediction)
  true = np.asarray(actual)
  if pred.shape != true.shape:
    raise ValueError('prediction and actual must have the same shape')
  if true.size == 0:
    raise ValueError('cannot compute accuracy of empty input')

  if rtol == 0 and atol == 0:
    # exact comparison also works for non-numeric labels
    hits = pred == true
  else:
    hits = np.isclose(pred.astype(float), true.astype(float), rtol=rtol, atol=atol)
  return float(np.mean(hits))

In [None]:
eval_acc(predict,test_label)

0.0

In [None]:
from sklearn.metrics import mean_squared_error

def eval_rmse(predict, actual):
  """Root-mean-squared error between predictions and actual values.

  Args:
    predict: predicted values.
    actual: reference values.

  Returns:
    The square root of scikit-learn's ``mean_squared_error``.
  """
  # scikit-learn documents the argument order as (y_true, y_pred)
  return np.sqrt(mean_squared_error(actual, predict))

def rmse(predict, test_label):
  """Root-mean-squared error computed with NumPy only.

  Values are compared by position: any pandas index on the inputs is ignored,
  so two Series with different indexes do not silently produce NaN.

  Args:
    predict: predicted values (array-like).
    test_label: reference values (array-like of the same shape).

  Returns:
    The RMSE as a float.

  Raises:
    ValueError: if the inputs are empty or their shapes differ.
  """
  pred = np.asarray(predict, dtype=float)
  true = np.asarray(test_label, dtype=float)
  if pred.shape != true.shape:
    raise ValueError('predict and test_label must have the same shape')
  if true.size == 0:
    raise ValueError('cannot compute RMSE of empty input')
  diff = pred - true
  return float(np.sqrt(np.mean(diff ** 2)))

In [None]:
eval_rmse(predict, test_label)

182261705.91390085

In [None]:
rmse(predict, test_label)

182261705.91390088

## 7. Pipeline

In [None]:
def pipeline():
  """Run the whole workflow end to end and print the test-set RMSE.

  Every intermediate result is kept in a local variable, so the function does
  not depend on variables left in the notebook namespace by earlier cells.
  """
  # collect data
  valuation_df, founder_df = collect_data()

  # clean data
  clean_df = clean_data(valuation_df, founder_df)

  # extract features (returns the feature matrix and the label vector)
  feat_df, label_df = extract_feature(clean_df)

  # split training and test
  train_feat, test_feat, train_label, test_label = \
        split_train_test(feat_df, label_df)

  # Train
  model = train_model(train_feat, train_label)

  # Evaluate
  predict = model.predict(test_feat)
  print('error score', eval_rmse(predict, test_label))

In [None]:
pipeline()

error score 182261705.91390085


## 8. Deployment (Extra)

In [None]:
#@title Estimate your startup valuation

# Colab form: each "#@param" line below renders as an input widget.
# The dropdown options list the categories that appear as one-hot columns in the
# training features; any other value would create a column the model has never seen.
answer_list = []
Previous_startups =  0#@param {type:"number"}
Consulting_before_startup = True #@param {type:"boolean"}
Degree_Type = 'BA' #@param ['None', 'BA', 'BS', 'BE', 'Drop', 'MS', 'AA', 'MA', 'LLB', 'JD', 'PhD'] {type:"string"}
Graduate_Diploma = 'MS' #@param ['None', 'AAS', 'Associate Instructor', 'BA', 'BPhil', 'BSc', 'Executive Education', 'JD', 'MA', 'MBA', 'MBI', 'MD', 'MEng', 'MS', 'PhD'] {type:"string"}
is_teacher = False #@param {type:"boolean"}
Years_of_Employment =  1#@param {type:"number"}
is_manager = True #@param {type:"boolean"}
Worked_at_Google = True #@param {type:"boolean"}
Worked_at_Microsoft = False #@param {type:"boolean"}
Worked_in_Sales = True #@param {type:"boolean"}
Stanford_or_Berkeley = False #@param {type:"boolean"}
Ivy_League = True #@param {type:"boolean"}

col = ['Previous startups?', 'Consulting before start-up', 'Degree Type', \
       'Graduate Diploma', 'Ever served as TA/Teacher/Professor/Mentor?', \
       'Years of Employment', 'Worked as product manager/director/head/VP?', \
       'Worked at Google?', 'Worked at Microsoft?', 'Worked in Sales?', \
       'Stanford or Berkeley', 'Ivy League']
predict_df = pd.DataFrame([[Previous_startups,Consulting_before_startup,Degree_Type, \
                           Graduate_Diploma,is_teacher,Years_of_Employment,is_manager, \
                           Worked_at_Google,Worked_at_Microsoft,Worked_in_Sales, \
                           Stanford_or_Berkeley,Ivy_League]],columns=col)

# Stack the single answer row on top of the training rows so that one-hot encoding
# sees every category; ignore_index avoids duplicate index labels.
test_df = pd.concat([predict_df, clean_df[col]], axis=0, ignore_index=True)
test, _ = extract_feature(test_df, is_training=False)

# Align to the exact columns (and column order) the model was trained on.
test = test.reindex(columns=feat_df.columns, fill_value=0)
print("your startup valuation is: ", model.predict(test.iloc[:1])[0])

your startup valuation is:  85721743.32257985
