In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Load the merged dataset, then coerce the three date columns to datetime64.
date_columns = ["founded_at", "first_funding_at", "last_funding_at"]
main_df = pd.read_csv("merged_data.csv", parse_dates=True)
main_df[date_columns] = main_df[date_columns].apply(pd.to_datetime)

main_df

Unnamed: 0,name,state_code,city,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,category_code,status,...,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label
0,#waywire,NY,New York,2012-06-01,2012-06-30,2012-06-30,1,1750000,news,operating,...,0,0,0,0,0,0,1,0,0,Failure
1,1000memories,CA,San Francisco,2010-07-01,2010-01-01,2011-02-16,2,2535000,web,acquired,...,0,0,0,0,0,0,0,6,7,Success
2,100Plus,CA,San Francisco,2011-09-16,2011-11-02,2011-11-30,2,1250000,analytics,acquired,...,0,0,0,0,0,0,1,2,2,Success
3,1010data,NY,New York,2000-01-01,2010-03-08,2010-03-08,1,35000000,software,operating,...,0,0,0,0,0,0,0,122,122,Success
4,121nexus,RI,Providence,2011-10-07,2012-02-06,2013-07-01,4,719000,software,operating,...,0,0,0,0,0,0,0,4,21,Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,y prime,PA,Malvern,2006-01-01,2013-09-07,2013-09-07,1,5000000,health,operating,...,0,0,0,0,0,0,1,92,92,Failure
8060,yaM Labs,VA,Alexandria,2009-12-03,2011-03-02,2011-03-02,1,500000,software,operating,...,0,0,0,0,0,0,0,15,15,Failure
8061,ybuy,CA,Beverly Hills,2011-08-01,2012-01-17,2012-07-05,2,1750000,web,operating,...,0,0,0,0,0,0,0,5,11,Failure
8062,zozi,CA,San Francisco,2007-01-01,2008-08-01,2013-04-04,4,21300000,ecommerce,operating,...,0,0,0,1,0,0,0,19,75,Success


In [3]:
# List the column labels of the freshly loaded frame.
main_df.columns

Index(['name', 'state_code', 'city', 'founded_at', 'first_funding_at',
       'last_funding_at', 'funding_rounds', 'funding_total_usd',
       'category_code', 'status', 'is_software', 'is_web', 'is_mobile',
       'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce',
       'is_biotech', 'is_consulting', 'is_othercategory',
       'months_to_first_funding', 'months_to_last_funding', 'label'],
      dtype='object')

# Clean Merged Data
- Remove outliers
- Adjust skewness

In [4]:
# Isolate the funding total as a one-column DataFrame (double brackets keep it 2-D).
y = main_df[["funding_total_usd"]]

## Removing Skewness

In [5]:
# Skewness of the funding totals before any transformation.
y.skew()

funding_total_usd    33.713432
dtype: float64

In [6]:
from scipy import stats
from scipy.special import boxcox, inv_boxcox
# Fit the Box-Cox transform ONCE: stats.boxcox returns both the transformed
# values and the fitted lambda, so there is no need to run the optimiser twice.
# `param` (the lambda) is what inv_boxcox needs to map values back to USD later.
# NOTE(review): Box-Cox requires strictly positive input - confirm that
# funding_total_usd contains no zeros or negatives.
transformed_data, param = stats.boxcox(main_df['funding_total_usd'])
# Carry over main_df's index so assigning `y` back into main_df aligns
# row-for-row even if the frame's index is not a plain 0..n-1 range.
y = pd.Series(transformed_data, index=main_df.index)
y.skew()

-0.002339944830612883

In [7]:
# Overwrite the raw funding totals with the Box-Cox-transformed series `y`.
main_df["funding_total_usd"] = y
main_df

Unnamed: 0,name,state_code,city,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,category_code,status,...,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label
0,#waywire,NY,New York,2012-06-01,2012-06-30,2012-06-30,1,39.947329,news,operating,...,0,0,0,0,0,0,1,0,0,Failure
1,1000memories,CA,San Francisco,2010-07-01,2010-01-01,2011-02-16,2,42.207989,web,acquired,...,0,0,0,0,0,0,0,6,7,Success
2,100Plus,CA,San Francisco,2011-09-16,2011-11-02,2011-11-30,2,37.982925,analytics,acquired,...,0,0,0,0,0,0,1,2,2,Success
3,1010data,NY,New York,2000-01-01,2010-03-08,2010-03-08,1,61.580303,software,operating,...,0,0,0,0,0,0,0,122,122,Success
4,121nexus,RI,Providence,2011-10-07,2012-02-06,2013-07-01,4,34.927286,software,operating,...,0,0,0,0,0,0,0,4,21,Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,y prime,PA,Malvern,2006-01-01,2013-09-07,2013-09-07,1,46.631665,health,operating,...,0,0,0,0,0,0,1,92,92,Failure
8060,yaM Labs,VA,Alexandria,2009-12-03,2011-03-02,2011-03-02,1,33.031461,software,operating,...,0,0,0,0,0,0,0,15,15,Failure
8061,ybuy,CA,Beverly Hills,2011-08-01,2012-01-17,2012-07-05,2,39.947329,web,operating,...,0,0,0,0,0,0,0,5,11,Failure
8062,zozi,CA,San Francisco,2007-01-01,2008-08-01,2013-04-04,4,57.415282,ecommerce,operating,...,0,0,0,1,0,0,0,19,75,Success


## Removing Outliers

In [8]:
from pandas.api.types import is_string_dtype
def num_outliers(df):
    """Print how many IQR outliers each non-string column of ``df`` contains.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or above the third quartile of its column (Tukey's fences).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Columns for which ``is_string_dtype`` is true are skipped.

    Returns
    -------
    None
        One line per inspected column is printed:
        ``"<column> has <count> outliers"``.
    """
    for col in df.columns:
        series = df[col]
        if is_string_dtype(series):
            continue
        # Quartiles are computed directly (and once each) instead of building
        # the full describe() summary twice per column.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - (1.5 * iqr)
        upper_fence = q3 + (1.5 * iqr)
        # Vectorised comparison replaces the per-element Python loop; NaN
        # compares False on both sides, so missing values are never counted.
        outliers = int(((series > upper_fence) | (series < lower_fence)).sum())
        print(f"{series.name} has {outliers} outliers")

In [9]:
# Report how many IQR outliers remain in the transformed funding column.
num_outliers(main_df[["funding_total_usd"]])

funding_total_usd has 35 outliers


In [10]:
# Tukey fences on the (transformed) funding column; kept as a one-column frame
# so the quantiles come back as Series keyed by column name.
funding = main_df[["funding_total_usd"]]
Q1 = funding.quantile(0.25)
Q3 = funding.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((funding < lower_fence) | (funding > upper_fence)).any(axis=1)

# Keep only the rows that fall inside the fences.
adjusted_df = funding[~is_outlier]

In [11]:
# Column assignment aligns on the index: rows that were filtered out of
# adjusted_df (the outliers) receive NaN in funding_total_usd.
main_df["funding_total_usd"] = adjusted_df
main_df

Unnamed: 0,name,state_code,city,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,category_code,status,...,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label
0,#waywire,NY,New York,2012-06-01,2012-06-30,2012-06-30,1,39.947329,news,operating,...,0,0,0,0,0,0,1,0,0,Failure
1,1000memories,CA,San Francisco,2010-07-01,2010-01-01,2011-02-16,2,42.207989,web,acquired,...,0,0,0,0,0,0,0,6,7,Success
2,100Plus,CA,San Francisco,2011-09-16,2011-11-02,2011-11-30,2,37.982925,analytics,acquired,...,0,0,0,0,0,0,1,2,2,Success
3,1010data,NY,New York,2000-01-01,2010-03-08,2010-03-08,1,61.580303,software,operating,...,0,0,0,0,0,0,0,122,122,Success
4,121nexus,RI,Providence,2011-10-07,2012-02-06,2013-07-01,4,34.927286,software,operating,...,0,0,0,0,0,0,0,4,21,Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,y prime,PA,Malvern,2006-01-01,2013-09-07,2013-09-07,1,46.631665,health,operating,...,0,0,0,0,0,0,1,92,92,Failure
8060,yaM Labs,VA,Alexandria,2009-12-03,2011-03-02,2011-03-02,1,33.031461,software,operating,...,0,0,0,0,0,0,0,15,15,Failure
8061,ybuy,CA,Beverly Hills,2011-08-01,2012-01-17,2012-07-05,2,39.947329,web,operating,...,0,0,0,0,0,0,0,5,11,Failure
8062,zozi,CA,San Francisco,2007-01-01,2008-08-01,2013-04-04,4,57.415282,ecommerce,operating,...,0,0,0,1,0,0,0,19,75,Success


In [12]:
# Visually inspecting outliers: rows whose funding value is now missing
main_df[main_df["funding_total_usd"].isna()]

Unnamed: 0,name,state_code,city,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,category_code,status,...,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label
54,A123 Systems,MA,Watertown,2001-01-01,2005-11-01,2012-06-07,8,,nanotech,ipo,...,0,0,0,0,0,0,1,58,137,Success
74,AOL,NY,New York,1985-05-24,2005-12-01,2005-12-01,1,,web,ipo,...,0,0,0,0,0,0,0,247,247,Success
92,Abound Solar,CO,Loveland,2007-01-01,2010-07-06,2010-12-15,2,,cleantech,closed,...,0,0,0,0,0,0,1,42,47,Failure
779,Better Place,CA,Palo Alto,2007-10-29,2007-01-01,2012-11-02,6,,cleantech,acquired,...,0,0,0,0,0,0,1,9,61,Success
896,Bloom Energy,CA,Sunnyvale,2002-01-01,2008-01-01,2013-05-10,4,,cleantech,operating,...,0,0,0,0,0,0,1,72,136,Success
1040,BrightSource Energy,CA,Oakland,2004-01-01,2006-11-01,2013-07-25,9,,cleantech,ipo,...,0,0,0,0,0,0,1,34,114,Success
1410,Clearwire,WA,Kirkland,2003-10-01,2008-05-06,2013-02-27,4,,mobile,acquired,...,0,0,0,0,0,0,0,55,112,Success
1842,Datapipe,NJ,Jersey City,1998-01-01,2008-07-08,2013-05-31,3,,network_hosting,operating,...,0,0,0,0,0,0,1,126,184,Success
1866,Deem,AZ,Foster City,2000-01-01,2004-01-01,2013-09-24,8,,ecommerce,operating,...,0,0,0,1,0,0,0,48,164,Success
1882,Demand Media,CA,Santa Monica,2006-06-01,2006-04-01,2013-09-04,6,,news,ipo,...,0,0,0,0,0,0,1,2,87,Success


In [13]:
# Dropping outliers (rows with NaN for funding_total_usd).
# NOTE(review): no `subset=` is passed, so this removes rows that have a NaN in
# ANY column, not only funding_total_usd - confirm that is intended.
# The drop is in place, so re-running earlier cells afterwards sees the reduced frame.
main_df.dropna(axis = 0, inplace=True)
main_df

Unnamed: 0,name,state_code,city,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,category_code,status,...,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label
0,#waywire,NY,New York,2012-06-01,2012-06-30,2012-06-30,1,39.947329,news,operating,...,0,0,0,0,0,0,1,0,0,Failure
1,1000memories,CA,San Francisco,2010-07-01,2010-01-01,2011-02-16,2,42.207989,web,acquired,...,0,0,0,0,0,0,0,6,7,Success
2,100Plus,CA,San Francisco,2011-09-16,2011-11-02,2011-11-30,2,37.982925,analytics,acquired,...,0,0,0,0,0,0,1,2,2,Success
3,1010data,NY,New York,2000-01-01,2010-03-08,2010-03-08,1,61.580303,software,operating,...,0,0,0,0,0,0,0,122,122,Success
4,121nexus,RI,Providence,2011-10-07,2012-02-06,2013-07-01,4,34.927286,software,operating,...,0,0,0,0,0,0,0,4,21,Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,y prime,PA,Malvern,2006-01-01,2013-09-07,2013-09-07,1,46.631665,health,operating,...,0,0,0,0,0,0,1,92,92,Failure
8060,yaM Labs,VA,Alexandria,2009-12-03,2011-03-02,2011-03-02,1,33.031461,software,operating,...,0,0,0,0,0,0,0,15,15,Failure
8061,ybuy,CA,Beverly Hills,2011-08-01,2012-01-17,2012-07-05,2,39.947329,web,operating,...,0,0,0,0,0,0,0,5,11,Failure
8062,zozi,CA,San Francisco,2007-01-01,2008-08-01,2013-04-04,4,57.415282,ecommerce,operating,...,0,0,0,1,0,0,0,19,75,Success


In [14]:
# Check each column's dtype to see which ones still need converting or dropping.
main_df.dtypes

name                               object
state_code                         object
city                               object
founded_at                 datetime64[ns]
first_funding_at           datetime64[ns]
last_funding_at            datetime64[ns]
funding_rounds                      int64
funding_total_usd                 float64
category_code                      object
status                             object
is_software                         int64
is_web                              int64
is_mobile                           int64
is_enterprise                       int64
is_advertising                      int64
is_gamesvideo                       int64
is_ecommerce                        int64
is_biotech                          int64
is_consulting                       int64
is_othercategory                    int64
months_to_first_funding             int64
months_to_last_funding              int64
label                              object
dtype: object

# Further Data Cleaning to Convert/Remove str/dt Columns

In [15]:
target = ["founded_at", "first_funding_at", "last_funding_at"]
replacement = ["year_founded", "year_first_funding", "year_last_funding"]

# Derive an integer year column from each datetime column.
# The loop variables are deliberately NOT called `x`/`y`: loop variables leak
# into the notebook namespace, and `y` would overwrite the funding series
# defined earlier with a column-name string.
for date_col, year_col in zip(target, replacement):
    main_df[year_col] = main_df[date_col].dt.year
main_df

Unnamed: 0,name,state_code,city,founded_at,first_funding_at,last_funding_at,funding_rounds,funding_total_usd,category_code,status,...,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label,year_founded,year_first_funding,year_last_funding
0,#waywire,NY,New York,2012-06-01,2012-06-30,2012-06-30,1,39.947329,news,operating,...,0,0,0,1,0,0,Failure,2012,2012,2012
1,1000memories,CA,San Francisco,2010-07-01,2010-01-01,2011-02-16,2,42.207989,web,acquired,...,0,0,0,0,6,7,Success,2010,2010,2011
2,100Plus,CA,San Francisco,2011-09-16,2011-11-02,2011-11-30,2,37.982925,analytics,acquired,...,0,0,0,1,2,2,Success,2011,2011,2011
3,1010data,NY,New York,2000-01-01,2010-03-08,2010-03-08,1,61.580303,software,operating,...,0,0,0,0,122,122,Success,2000,2010,2010
4,121nexus,RI,Providence,2011-10-07,2012-02-06,2013-07-01,4,34.927286,software,operating,...,0,0,0,0,4,21,Failure,2011,2012,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,y prime,PA,Malvern,2006-01-01,2013-09-07,2013-09-07,1,46.631665,health,operating,...,0,0,0,1,92,92,Failure,2006,2013,2013
8060,yaM Labs,VA,Alexandria,2009-12-03,2011-03-02,2011-03-02,1,33.031461,software,operating,...,0,0,0,0,15,15,Failure,2009,2011,2011
8061,ybuy,CA,Beverly Hills,2011-08-01,2012-01-17,2012-07-05,2,39.947329,web,operating,...,0,0,0,0,5,11,Failure,2011,2012,2012
8062,zozi,CA,San Francisco,2007-01-01,2008-08-01,2013-04-04,4,57.415282,ecommerce,operating,...,1,0,0,0,19,75,Success,2007,2008,2013


In [16]:
# Remove the original datetime columns listed in `target`, in place.
main_df.drop(columns = target, inplace=True)

In [17]:
# Confirm the datetime columns are gone and the year_* columns are present.
main_df.columns

Index(['name', 'state_code', 'city', 'funding_rounds', 'funding_total_usd',
       'category_code', 'status', 'is_software', 'is_web', 'is_mobile',
       'is_enterprise', 'is_advertising', 'is_gamesvideo', 'is_ecommerce',
       'is_biotech', 'is_consulting', 'is_othercategory',
       'months_to_first_funding', 'months_to_last_funding', 'label',
       'year_founded', 'year_first_funding', 'year_last_funding'],
      dtype='object')

In [18]:
# Drop the remaining object-dtype (string) columns, in place; only `label` is kept as text.
main_df.drop(columns = ["name", "state_code", "city", "category_code", "status"], inplace=True)

In [19]:
# Display the frame after the date and string columns have been dropped.
main_df

Unnamed: 0,funding_rounds,funding_total_usd,is_software,is_web,is_mobile,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,label,year_founded,year_first_funding,year_last_funding
0,1,39.947329,0,0,0,0,0,0,0,0,0,1,0,0,Failure,2012,2012,2012
1,2,42.207989,0,1,0,0,0,0,0,0,0,0,6,7,Success,2010,2010,2011
2,2,37.982925,0,0,0,0,0,0,0,0,0,1,2,2,Success,2011,2011,2011
3,1,61.580303,1,0,0,0,0,0,0,0,0,0,122,122,Success,2000,2010,2010
4,4,34.927286,1,0,0,0,0,0,0,0,0,0,4,21,Failure,2011,2012,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8059,1,46.631665,0,0,0,0,0,0,0,0,0,1,92,92,Failure,2006,2013,2013
8060,1,33.031461,1,0,0,0,0,0,0,0,0,0,15,15,Failure,2009,2011,2011
8061,2,39.947329,0,1,0,0,0,0,0,0,0,0,5,11,Failure,2011,2012,2012
8062,4,57.415282,0,0,0,0,0,0,1,0,0,0,19,75,Success,2007,2008,2013


# Oversampling to Address Imbalanced Classes
- We need to balance the classes so that the model is not biased towards one outcome over the other.
    - We choose SMOTE oversampling to avoid the information loss that downsampling would cause.
        - SMOTE takes a point from the minority class and finds its k nearest minority-class neighbours. Synthetic points are then generated along the line segments between the point and those neighbours.

## Check Initial Class Balance

In [20]:
# Number of rows labelled "Success".
int((main_df["label"] == "Success").sum())

4509

In [21]:
# Number of rows labelled "Failure".
int((main_df["label"] == "Failure").sum())

3520

In [22]:
# Target: the outcome label, kept as a one-column DataFrame.
y = main_df[["label"]]
# Features: every remaining column.
X = main_df.drop(columns=["label"])

In [23]:
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic samples are reproducible between runs.
smoter = SMOTE(random_state = 42)

# Oversample the minority class of the FULL dataset until both labels are
# equally frequent.
# NOTE(review): X_smote / y_smote contain synthetic rows derived from every
# original row, so they are not a safe source for a held-out test set.
# NOTE(review): plain SMOTE presumably interpolates the 0/1 `is_*` flags and
# integer counts as continuous values - confirm this is acceptable.
X_smote, y_smote = smoter.fit_resample(X, y)

In [24]:
# Display the resampled feature matrix.
X_smote

Unnamed: 0,funding_rounds,funding_total_usd,is_software,is_web,is_mobile,is_enterprise,is_advertising,is_gamesvideo,is_ecommerce,is_biotech,is_consulting,is_othercategory,months_to_first_funding,months_to_last_funding,year_founded,year_first_funding,year_last_funding
0,1,39.947329,0,0,0,0,0,0,0,0,0,1,0,0,2012,2012,2012
1,2,42.207989,0,1,0,0,0,0,0,0,0,0,6,7,2010,2010,2011
2,2,37.982925,0,0,0,0,0,0,0,0,0,1,2,2,2011,2011,2011
3,1,61.580303,1,0,0,0,0,0,0,0,0,0,122,122,2000,2010,2010
4,4,34.927286,1,0,0,0,0,0,0,0,0,0,4,21,2011,2012,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9013,1,43.498201,0,0,0,0,0,0,0,0,0,0,17,17,2010,2012,2012
9014,2,32.232742,0,0,0,0,0,0,0,0,0,0,61,78,2006,2011,2012
9015,2,48.348457,0,0,0,0,0,0,0,0,0,0,6,21,2010,2010,2011
9016,7,38.695821,0,0,0,0,0,0,0,0,0,0,5,24,2007,2008,2009


In [25]:
# Display the resampled labels.
y_smote

Unnamed: 0,label
0,Failure
1,Success
2,Success
3,Success
4,Failure
...,...
9013,Failure
9014,Failure
9015,Failure
9016,Failure


In [26]:
# (Success count, Failure count) after resampling - the two should match.
tuple(int((y_smote["label"] == outcome).sum()) for outcome in ("Success", "Failure"))

(4509, 4509)

# Model Testing + Feature Importance

## Creating train test split

In [27]:
from sklearn.model_selection import train_test_split
# Split the ORIGINAL (un-resampled) data first. Splitting data that has already
# been through SMOTE leaks information: synthetic training points would be
# interpolated from rows that end up in the test set, inflating every score.
# `stratify` keeps the Success/Failure ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y["label"]
)
# Oversample the training partition only; the test set stays untouched, real data.
X_train, y_train = smoter.fit_resample(X_train, y_train)

## Cross validating different models

In [28]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# (display name, estimator) pairs; keeping them together stops the two
# parallel lists below from drifting out of sync.
candidate_models = [
    ("KNN", KNeighborsClassifier()),
    ("Linear SVM", SVC(kernel="linear", C=0.025, random_state=42)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("AdaBoost", AdaBoostClassifier(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42)),
]

names = [label for label, _ in candidate_models]

# Stratified 50-fold cross-validation shared by every model comparison.
kfold = StratifiedKFold(n_splits=50)

classifiers = [estimator for _, estimator in candidate_models]
                                  

In [67]:
def cv_scoring(scoring_metric):
    """Cross-validate every candidate model and plot the mean score per model.

    Parameters
    ----------
    scoring_metric : str
        scikit-learn scoring name forwarded to ``cross_val_score``
        (for example ``"accuracy"``). It is also used in the x-axis label.

    Returns
    -------
    None
        A horizontal bar chart is drawn on the current matplotlib axes.

    Notes
    -----
    Reads the notebook globals ``classifiers``, ``names``, ``kfold``,
    ``X_train`` and ``y_train``.
    """
    # cross_val_score expects a 1-D target; y_train is a one-column frame.
    labels = y_train.values.ravel()
    results = [
        cross_val_score(classifier, X_train, labels, scoring=scoring_metric, cv=kfold, n_jobs=-1)
        for classifier in classifiers
    ]

    results_df = pd.DataFrame({
        "Cross Validation Means": [fold_scores.mean() for fold_scores in results],
        "Cross Validation Standard Dev": [fold_scores.std() for fold_scores in results],
        "Classifier": names,
    })

    # x/y must be passed by keyword: current seaborn rejects positional x/y.
    ax = sns.barplot(x="Cross Validation Means", y="Classifier", data=results_df)
    # set_xlabel returns a Text object, not the Axes, so the calls cannot be
    # chained - the original `.set_xlabel(...).set_title(...)` raised AttributeError.
    ax.set_xlabel("Mean " + scoring_metric)
    ax.set_title("Cross Validation Scores For Each Classifier")
    

In [None]:
# Compare all candidate models on mean cross-validated accuracy.
cv_scoring("accuracy")

## Feature Importance

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.ensemble import StackingClassifier


## Ensembling

# Remember to invert box cox transformation