# Startup-Acquisition-Status-Prediction with pipeline

### Data preprocessing

In [1]:
#import necessary libraries  
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
warnings.filterwarnings('ignore')

In [2]:
company = pd.read_csv("companies.csv")

In [3]:
company.shape

(196553, 44)

In [4]:
# to display all columns
pd.set_option('display.max_columns', None)

In [5]:
# Drop the columns this analysis treats as irrelevant or redundant,
# one thematic group at a time.
location_columns = ['region', 'city', 'state_code']
record_keeping_columns = [
    'id', 'Unnamed: 0.1', 'entity_type', 'entity_id', 'parent_id',
    'created_by', 'created_at', 'updated_at',
]
descriptive_columns = [
    'domain', 'homepage_url', 'twitter_username', 'logo_url', 'logo_width',
    'logo_height', 'short_description', 'description', 'overview', 'tag_list',
    'name', 'normalized_name', 'permalink', 'invested_companies',
]
for column_group in (location_columns, record_keeping_columns, descriptive_columns):
    company.drop(columns=column_group, inplace=True)

In [6]:
# check for duplicate values
company.duplicated().any()

True

In [7]:
# check number of duplicate values
company.duplicated().sum()

87089

In [8]:
# Let's delete all the duplicate values
# NOTE(review): identifier columns ('id', 'name', 'permalink', ...) were dropped
# before this check, so rows counted as duplicates here only match on the
# remaining columns and may be distinct companies — confirm this is intended.
company.drop_duplicates(inplace=True)

In [9]:
# check if any left
company.duplicated().any()

False

In [10]:
# These investment-related columns are dropped rather than imputed; the notebook
# attributes this to more than 96% missing values.
# NOTE(review): the null-ratio check backing that figure is not shown in this notebook.
mostly_null_columns = ['first_investment_at', 'last_investment_at', 'investment_rounds', 'ROI']
company.drop(columns=mostly_null_columns, inplace=True)

In [11]:
#lets check number of missing values in each rows
company.isna().sum()

category_code          12230
status                     0
founded_at             26913
closed_at             106845
country_code           24870
first_funding_at       77992
last_funding_at        77992
funding_rounds         77793
funding_total_usd      81602
first_milestone_at     53353
last_milestone_at      53353
milestones             53353
relationships          34403
lat                    28363
lng                    28363
dtype: int64

In [12]:
company.shape

(109464, 15)

In [13]:
# Remove every row that is missing at least one of the essential fields.
essential_columns = ['status', 'country_code', 'category_code', 'founded_at']
company.dropna(subset=essential_columns, inplace=True)

In [14]:
company.shape

(63585, 15)

#### Handling outliers by IQR method

In [15]:
# Interquartile range (Q3 - Q1) of funding_total_usd
Q1_FTU, Q3_FTU = company['funding_total_usd'].quantile([0.25, 0.75])
IQR_FTU = Q3_FTU - Q1_FTU
print(f"IQR of funding_total_usd is: {IQR_FTU}\n")

# Interquartile range (Q3 - Q1) of funding_rounds
Q1_FR, Q3_FR = company['funding_rounds'].quantile([0.25, 0.75])
IQR_FR = Q3_FR - Q1_FR
print(f"IQR of funding_rounds is: {IQR_FR}")

IQR of funding_total_usd is: 11488962.0

IQR of funding_rounds is: 1.0


In [16]:
# Outlier fences: anything further than 1.5 * IQR below Q1 or above Q3.

# For funding_total_usd
lower_limit_FTU = Q1_FTU - 1.5*IQR_FTU
upper_limit_FTU = Q3_FTU + 1.5*IQR_FTU
print("funding_total_usd")
print(f"lower limit is: {lower_limit_FTU}")
print(f"upper limit is: {upper_limit_FTU}\n")


# For funding_rounds
lower_limit_FR = Q1_FR - 1.5*IQR_FR
upper_limit_FR = Q3_FR + 1.5*IQR_FR
# Label fixed: these limits belong to funding_rounds, not funding_total_usd.
print("funding_rounds")
print(f"lower limit is: {lower_limit_FR}")
print(f"upper limit is: {upper_limit_FR}")

funding_total_usd
lower limit is: -16722405.0
upper limit is: 29233443.0

funding_total_usd
lower limit is: -0.5
upper limit is: 3.5


In [17]:
# Rows whose funding_total_usd lies outside its fences
# (missing values compare False on both sides, so they are not flagged).
ftu_is_outlier = (company['funding_total_usd'] < lower_limit_FTU) | (company['funding_total_usd'] > upper_limit_FTU)
funding_total_usd_outliers = company.loc[ftu_is_outlier]
print(funding_total_usd_outliers.shape)

# Rows whose funding_rounds lies outside its fences
fr_is_outlier = (company['funding_rounds'] < lower_limit_FR) | (company['funding_rounds'] > upper_limit_FR)
funding_rounds_outliers = company.loc[fr_is_outlier]
print(funding_rounds_outliers.shape)

(2561, 15)
(2182, 15)


In [18]:
# Remove the outlier rows, first for funding_total_usd and then for
# funding_rounds, using the fences computed earlier.
for column, lower, upper in (
    ('funding_total_usd', lower_limit_FTU, upper_limit_FTU),
    ('funding_rounds', lower_limit_FR, upper_limit_FR),
):
    is_outlier = (company[column] < lower) | (company[column] > upper)
    company.drop(index=company.index[is_outlier], inplace=True)
company.shape

(59987, 15)

In [19]:
# Replace every 'YYYY-MM-DD' date column with just its calendar year.
date_columns = [
    'founded_at',
    'closed_at',
    'first_funding_at',
    'last_funding_at',
    'first_milestone_at',
    'last_milestone_at',
]
for date_column in date_columns:
    parsed_dates = pd.to_datetime(company[date_column], format='%Y-%m-%d')
    company[date_column] = parsed_dates.dt.year

#### Generalize the categorical data, i.e. category_code and country_code

In [20]:
# Category encoding: every category code ranked below the 15 most frequent
# ones is replaced with 'other'.
category_frequency = company['category_code'].value_counts()
category_others_index = category_frequency.iloc[15:].index
is_rare_category = company['category_code'].isin(category_others_index)
company['category_code'] = company['category_code'].mask(is_rare_category, 'other')

In [21]:
# Let's check if we've more than 15 columns
print('Unique no of category_code : ',company['category_code'].nunique())
company.category_code.value_counts()

Unique no of category_code :  15


other               12820
software            10888
web                  7752
ecommerce            4831
mobile               3700
advertising          3575
games_video          2789
consulting           2780
enterprise           2472
biotech              2006
public_relations     1561
hardware             1434
network_hosting      1254
education            1149
search                976
Name: category_code, dtype: int64

In [22]:
# Keep the 9 most frequent country codes as they are; every other country
# code is replaced with 'other'.
country_frequency = company['country_code'].value_counts()
country_code_others_index = country_frequency.iloc[9:].index
is_rare_country = company['country_code'].isin(country_code_others_index)
company['country_code'] = company['country_code'].mask(is_rare_country, 'other')

In [23]:
# Let's check if we've more than 10 columns
print('Unique no of country_code : ',company['country_code'].nunique())
company.country_code.value_counts()

Unique no of country_code :  10


USA      33844
other    10335
GBR       4959
IND       2985
CAN       2609
DEU       1358
FRA       1226
AUS       1027
ESP        880
ISR        764
Name: country_code, dtype: int64

### 2. Create new variables
    a. Create new feature isClosed from closed_at and status.
    b. Create new feature 'active_days'

#### 2.a. Create new feature isClosed from closed_at and status.
     - If the value in status is 'operating' or 'ipo', set isClosed to 0.
     - Otherwise (e.g. 'acquired' or 'closed'), set isClosed to 1.

In [24]:
def isClosed(row):
    """Return 0 when the row's 'status' is 'operating' or 'ipo', otherwise 1."""
    still_running = row['status'] in ('operating', 'ipo')
    return 0 if still_running else 1
# Vectorised form of the isClosed rule (replaces a row-by-row apply):
# 0 for status 'operating' or 'ipo', 1 for any other status.
company['isClosed'] = (~company['status'].isin(['operating', 'ipo'])).astype('int64')
company

Unnamed: 0,category_code,status,founded_at,closed_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,isClosed
5,advertising,operating,2007,,other,,,,,,,,2.0,30.427755,-9.598107,0
6,other,operating,2008,,IND,,,,,,,,,22.307159,73.181219,0
12,advertising,operating,2008,,USA,,,,,2008.0,2008.0,1.0,2.0,35.686975,-105.937799,0
13,web,acquired,2007,,USA,2008.0,2008.0,1.0,5000000.0,2008.0,2012.0,3.0,14.0,37.386052,-122.083851,1
15,games_video,operating,2008,,USA,,,,,2008.0,2008.0,1.0,3.0,33.078655,-116.601964,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,operating,2007,,USA,,,,,2013.0,2013.0,2.0,5.0,37.774929,-122.419415,0
196549,public_relations,operating,2007,,USA,2008.0,2008.0,1.0,750000.0,2013.0,2013.0,1.0,14.0,37.338208,-121.886329,0
196550,consulting,operating,1959,,USA,,,,,2012.0,2013.0,3.0,44.0,38.882334,-77.171091,0
196551,search,operating,2008,,USA,,,,,,,,1.0,34.052234,-118.243685,0


In [25]:
# Rows whose status is 'operating' or 'ipo' get the placeholder closing year 2021;
# every other row keeps its recorded closed_at value.
still_active = company['status'].isin(['operating', 'ipo'])
company['closed_at'] = company['closed_at'].mask(still_active, 2021)

In [26]:
# 'operating' and 'ipo' rows were given a closing year in the previous step, so the
# rows removed here are those with any other status and no recorded closed_at.
company.dropna(axis=0, subset=['closed_at'], inplace=True)

In [27]:
closed_at=company['closed_at']
founded_at=company['founded_at']

In [28]:
# Lifetime in years (closing year minus founding year), converted to days
# with a flat 365 days per year.
lifetime_years = closed_at.astype('float64') - founded_at.astype('float64')
active_days = 365 * lifetime_years
company['Active_Days'] = active_days

# Remove the 68 rows with the smallest Active_Days.
# NOTE(review): the count 68 is hard-coded — presumably these are invalid
# (e.g. negative) durations; confirm against the data.
shortest_lived = company['Active_Days'].sort_values().head(68)
index_name = shortest_lived.index
company.drop(index=index_name, inplace=True)
company['Active_Days']

5          5110.0
6          4745.0
12         4745.0
15         4745.0
20         6570.0
           ...   
196548     5110.0
196549     5110.0
196550    22630.0
196551     4745.0
196552     5110.0
Name: Active_Days, Length: 56457, dtype: float64

In [29]:
company.drop(['closed_at'], axis=1,inplace=True)

In [30]:
company

Unnamed: 0,category_code,status,founded_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,isClosed,Active_Days
5,advertising,operating,2007,other,,,,,,,,2.0,30.427755,-9.598107,0,5110.0
6,other,operating,2008,IND,,,,,,,,,22.307159,73.181219,0,4745.0
12,advertising,operating,2008,USA,,,,,2008.0,2008.0,1.0,2.0,35.686975,-105.937799,0,4745.0
15,games_video,operating,2008,USA,,,,,2008.0,2008.0,1.0,3.0,33.078655,-116.601964,0,4745.0
20,other,operating,2003,USA,2011.0,2012.0,3.0,10125293.0,2010.0,2010.0,1.0,6.0,30.267153,-97.743061,0,6570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,operating,2007,USA,,,,,2013.0,2013.0,2.0,5.0,37.774929,-122.419415,0,5110.0
196549,public_relations,operating,2007,USA,2008.0,2008.0,1.0,750000.0,2013.0,2013.0,1.0,14.0,37.338208,-121.886329,0,5110.0
196550,consulting,operating,1959,USA,,,,,2012.0,2013.0,3.0,44.0,38.882334,-77.171091,0,22630.0
196551,search,operating,2008,USA,,,,,,,,1.0,34.052234,-118.243685,0,4745.0


In [31]:
company.isna().sum()

category_code             0
status                    0
founded_at                0
country_code              0
first_funding_at      38839
last_funding_at       38839
funding_rounds        38705
funding_total_usd     40935
first_milestone_at    26259
last_milestone_at     26259
milestones            26259
relationships         14453
lat                    2236
lng                    2236
isClosed                  0
Active_Days               0
dtype: int64

In [32]:
company.drop(['status'], axis=1,inplace=True)

In [33]:
company.dtypes

category_code          object
founded_at              int64
country_code           object
first_funding_at      float64
last_funding_at       float64
funding_rounds        float64
funding_total_usd     float64
first_milestone_at    float64
last_milestone_at     float64
milestones            float64
relationships         float64
lat                   float64
lng                   float64
isClosed                int64
Active_Days           float64
dtype: object

In [34]:
# Feature matrix: every column except the target.
# NOTE(review): X keeps both founded_at and Active_Days. Active_Days was computed as
# 365 * (closed_at - founded_at), and closed_at was set to 2021 for every
# 'operating'/'ipo' row (the rows labelled isClosed == 0), so the label may be
# recoverable from these two features — check for target leakage before trusting the scores.
X = company.drop("isClosed", axis=1)
# Target: 0 for 'operating'/'ipo', 1 otherwise.
y = company["isClosed"]

In [35]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): no `stratify` argument is passed — confirm the class ratio of y
# is comparable in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

### Creating pipeline

In [36]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
import xgboost as xgb

In [37]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')

In [38]:
company

Unnamed: 0,category_code,founded_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,isClosed,Active_Days
5,advertising,2007,other,,,,,,,,2.0,30.427755,-9.598107,0,5110.0
6,other,2008,IND,,,,,,,,,22.307159,73.181219,0,4745.0
12,advertising,2008,USA,,,,,2008.0,2008.0,1.0,2.0,35.686975,-105.937799,0,4745.0
15,games_video,2008,USA,,,,,2008.0,2008.0,1.0,3.0,33.078655,-116.601964,0,4745.0
20,other,2003,USA,2011.0,2012.0,3.0,10125293.0,2010.0,2010.0,1.0,6.0,30.267153,-97.743061,0,6570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196548,ecommerce,2007,USA,,,,,2013.0,2013.0,2.0,5.0,37.774929,-122.419415,0,5110.0
196549,public_relations,2007,USA,2008.0,2008.0,1.0,750000.0,2013.0,2013.0,1.0,14.0,37.338208,-121.886329,0,5110.0
196550,consulting,1959,USA,,,,,2012.0,2013.0,3.0,44.0,38.882334,-77.171091,0,22630.0
196551,search,2008,USA,,,,,,,,1.0,34.052234,-118.243685,0,4745.0


In [39]:
company.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56457 entries, 5 to 196552
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   category_code       56457 non-null  object 
 1   founded_at          56457 non-null  int64  
 2   country_code        56457 non-null  object 
 3   first_funding_at    17618 non-null  float64
 4   last_funding_at     17618 non-null  float64
 5   funding_rounds      17752 non-null  float64
 6   funding_total_usd   15522 non-null  float64
 7   first_milestone_at  30198 non-null  float64
 8   last_milestone_at   30198 non-null  float64
 9   milestones          30198 non-null  float64
 10  relationships       42004 non-null  float64
 11  lat                 54221 non-null  float64
 12  lng                 54221 non-null  float64
 13  isClosed            56457 non-null  int64  
 14  Active_Days         56457 non-null  float64
dtypes: float64(11), int64(2), object(2)
memory usage: 6.

In [40]:
# Step 1: fill missing values in the columns at positions 3..12 of X using
# SimpleImputer's default settings. The remaining columns are passed through
# unchanged and placed after the imputed ones in the output.
imputed_positions = slice(3, 13)
numeric_imputer = SimpleImputer()
trf1 = ColumnTransformer(
    transformers=[('impute_num', numeric_imputer, imputed_positions)],
    remainder='passthrough',
)

In [41]:
first_step = trf1.fit_transform(X_train)
first_step

array([[2012.0, 2012.0, 1.0, ..., 2008, 'USA', 4745.0],
       [2010.5045838959563, 2011.0453414824817, 1.412528216704289, ...,
        2011, 'USA', 3650.0],
       [2010.5045838959563, 2011.0453414824817, 1.412528216704289, ...,
        2012, 'DEU', 3285.0],
       ...,
       [2010.5045838959563, 2011.0453414824817, 1.412528216704289, ...,
        2007, 'USA', 5110.0],
       [2010.5045838959563, 2011.0453414824817, 1.412528216704289, ...,
        2001, 'GBR', 7300.0],
       [2011.0, 2012.0, 2.0, ..., 2011, 'other', 3650.0]], dtype=object)

In [42]:
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2012.0,2012.0,1.0,4000000.0,2009.63671,2010.145424,1.324792,3.0,33.953349,-117.396156,other,2008,USA,4745.0
1,2010.504584,2011.045341,1.412528,4299192.518184,2011.0,2011.0,1.0,1.0,33.49417,-111.926052,other,2011,USA,3650.0
2,2010.504584,2011.045341,1.412528,4299192.518184,2012.0,2012.0,1.0,1.0,50.903973,7.402289,software,2012,DEU,3285.0
3,2010.504584,2011.045341,1.412528,4299192.518184,2009.63671,2010.145424,1.324792,1.0,51.507351,-0.127758,public_relations,2001,GBR,7300.0
4,2010.504584,2011.045341,1.412528,4299192.518184,2009.63671,2010.145424,1.324792,6.0,42.498994,-83.367717,consulting,1997,USA,8760.0


In [43]:
# checking for missing values
pd.DataFrame(first_step).isna().sum().any()

False

In [44]:
pd.DataFrame(first_step).shape

(45165, 14)

In [45]:
# Step 2: one-hot encode the two categorical columns. In the output of trf1
# position 10 holds category_code and position 12 holds country_code.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm against the installed version.
category_position = [10]
country_position = [12]
trf2 = ColumnTransformer(
    transformers=[
        ('ohe_category', OneHotEncoder(sparse=False, handle_unknown='ignore'), category_position),
        ('ohe_country', OneHotEncoder(sparse=False, handle_unknown='ignore'), country_position),
    ],
    remainder='passthrough',
)

In [46]:
second_step = trf2.fit_transform(first_step)
pd.DataFrame(second_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2012.0,2012.0,1.0,4000000.0,2009.63671,2010.145424,1.324792,3.0,33.953349,-117.396156,2008,4745.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2010.504584,2011.045341,1.412528,4299192.518184,2011.0,2011.0,1.0,1.0,33.49417,-111.926052,2011,3650.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2010.504584,2011.045341,1.412528,4299192.518184,2012.0,2012.0,1.0,1.0,50.903973,7.402289,2012,3285.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2010.504584,2011.045341,1.412528,4299192.518184,2009.63671,2010.145424,1.324792,1.0,51.507351,-0.127758,2001,7300.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2010.504584,2011.045341,1.412528,4299192.518184,2009.63671,2010.145424,1.324792,6.0,42.498994,-83.367717,1997,8760.0


In [47]:
pd.DataFrame(second_step).shape

(45165, 37)

In [48]:
# imbalanced dataset
# trf3 = ColumnTransformer([
#     ('oversampling',RandomOverSampler(sampling_strategy = 'minority'),slice(0,37))
# ],remainder='passthrough')


In [49]:
# third_step = trf3.fit_transform(second_step)
# pd.DataFrame(third_step).head()

In [50]:
# pd.DataFrame(third_step).shape

In [51]:
# Scaling
# Standardise every column of the encoded matrix. Applying the scaler directly
# replaces the earlier ColumnTransformer over slice(0, 37): that hard-coded width,
# combined with the default remainder='drop', would silently discard columns if
# the one-hot encoders ever produced a different number of features.
trf4 = StandardScaler()

In [52]:
fourth_step = trf4.fit_transform(second_step)
pd.DataFrame(fourth_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
0,-0.253493,-0.180234,-0.224059,-0.300516,-0.143879,-0.206723,-0.220549,-0.155775,-0.255269,-0.146423,1.898414,-0.16379,-0.129594,-0.462441,-0.383126,-0.13454,-0.215309,-0.153069,-0.124648,-0.145156,-0.30056,-0.23554,-0.113457,0.898995,-0.465225,1.085028,0.7399174,-1.129986,-0.09326468,1.96208e-10,1.494582e-10,-4.747436e-13,-0.063456,-0.19423,-0.995121,0.221919,-0.192866
1,-0.253493,-0.180234,-0.224059,-0.300516,-0.143879,-0.206723,-0.220549,-0.155775,-0.255269,-0.146423,1.898414,-0.16379,-0.129594,-0.462441,-0.383126,-0.13454,-0.215309,-0.153069,-0.124648,-0.145156,-0.30056,-0.23554,-0.113457,0.898995,-0.465225,3.344051e-10,2.772069e-10,-3.221734e-12,-5.408533e-13,0.4573975,0.2961169,-0.7071519,-0.227796,-0.222872,-0.917396,0.53273,-0.499031
2,-0.253493,-0.180234,-0.224059,-0.300516,-0.143879,-0.206723,-0.220549,-0.155775,-0.255269,-0.146423,-0.526756,-0.16379,-0.129594,2.162437,-0.383126,-0.13454,-0.215309,6.532984,-0.124648,-0.145156,-0.30056,-0.23554,-0.113457,-1.112354,-0.465225,3.344051e-10,2.772069e-10,-3.221734e-12,-5.408533e-13,0.7929076,0.6426241,-0.7071519,-0.227796,0.863101,0.778147,0.636334,-0.601086
3,-0.253493,-0.180234,-0.224059,-0.300516,-0.143879,-0.206723,-0.220549,-0.155775,-0.255269,-0.146423,-0.526756,6.10536,-0.129594,-0.462441,-0.383126,-0.13454,-0.215309,-0.153069,-0.124648,-0.145156,3.327123,-0.23554,-0.113457,-1.112354,-0.465225,3.344051e-10,2.772069e-10,-3.221734e-12,-5.408533e-13,1.96208e-10,1.494582e-10,-4.747436e-13,-0.227796,0.900737,0.671153,-0.503309,0.521519
4,-0.253493,-0.180234,4.463116,-0.300516,-0.143879,-0.206723,-0.220549,-0.155775,-0.255269,-0.146423,-0.526756,-0.16379,-0.129594,-0.462441,-0.383126,-0.13454,-0.215309,-0.153069,-0.124648,-0.145156,-0.30056,-0.23554,-0.113457,0.898995,-0.465225,3.344051e-10,2.772069e-10,-3.221734e-12,-5.408533e-13,1.96208e-10,1.494582e-10,-4.747436e-13,0.183055,0.338822,-0.511609,-0.917724,0.929738


In [53]:
trf5 = xgb.XGBClassifier()

In [54]:
# Chain imputation, one-hot encoding, scaling and the classifier into a
# single estimator.
pipeline_steps = [
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf4', trf4),
    ('trf5', trf5),
]
pipe = Pipeline(steps=pipeline_steps)

In [55]:
pipe.fit(X_train, y_train)

In [56]:
# Predict
y_pred = pipe.predict(X_test)

In [57]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.9994686503719448

In [58]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[11002     0]
 [    6   284]]
0.9994686503719448
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11002
           1       1.00      0.98      0.99       290

    accuracy                           1.00     11292
   macro avg       1.00      0.99      0.99     11292
weighted avg       1.00      1.00      1.00     11292



In [59]:
# Saving model to disk
import pickle
# The context manager closes the file handle even if pickling raises.
with open('model_pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [60]:
# Show the first 50 rows of the slice.
# NOTE(review): `[:145455]` is a row slice, not a label lookup; the training set
# appears to have 45165 rows (see the shape printed for first_step), so the slice
# likely selects every row — confirm whether it is needed.
X_train[:145455].head(50)

Unnamed: 0,category_code,founded_at,country_code,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,first_milestone_at,last_milestone_at,milestones,relationships,lat,lng,Active_Days
18188,other,2008,USA,2012.0,2012.0,1.0,4000000.0,,,,3.0,33.953349,-117.396156,4745.0
15082,other,2011,USA,,,,,2011.0,2011.0,1.0,1.0,33.49417,-111.926052,3650.0
33075,software,2012,DEU,,,,,2012.0,2012.0,1.0,1.0,50.903973,7.402289,3285.0
142710,public_relations,2001,GBR,,,,,,,,1.0,51.507351,-0.127758,7300.0
37888,consulting,1997,USA,,,,,,,,6.0,42.498994,-83.367717,8760.0
148723,biotech,2005,USA,2008.0,2009.0,2.0,6700000.0,,,,3.0,30.267153,-97.743061,5840.0
138131,other,1968,USA,,,,,,,,10.0,41.878114,-87.629798,19345.0
176929,web,2009,USA,,,,,,,,2.0,33.150674,-96.823612,4380.0
174907,software,2000,other,,,,,2008.0,2008.0,1.0,7.0,48.507933,32.262317,7665.0
33763,ecommerce,2012,USA,,,,,,,,,38.252665,-85.758456,3285.0


In [61]:
y_train[145455]

1