In [None]:
### Libraries import ###
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as p
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from joblib import dump

In [None]:
dataset = pd.read_csv('startup data.csv')

# Remove duplicated startups (same name) and the columns that are not used
# anywhere later in this notebook.
dataset = dataset.drop_duplicates(subset=['name'])
dataset = dataset.drop(columns=['Unnamed: 0',
                                'Unnamed: 6',
                                'latitude',
                                'longitude',
                                'state_code',
                                'state_code.1',
                                'zip_code',
                                'object_id'])

# Calculate the age of the startup (in whole years).
date_columns = ['founded_at', 'closed_at', 'first_funding_at', 'last_funding_at']
for column in date_columns:
  dataset[column] = pd.to_datetime(dataset[column])

# Startups without a closing date are aged up to a fixed cut-off date.
# A real Timestamp is used so the column stays datetime-typed.
dataset['closed_date'] = dataset['closed_at'].fillna(pd.Timestamp('2013-12-31'))

# np.timedelta64(1, 'Y') is rejected by current pandas because a "year" is not
# a fixed duration; 365.2425 days is the length that unit used to stand for.
one_year = pd.Timedelta(days=365.2425)
dataset['age'] = ((dataset['closed_date'] - dataset['founded_at']) / one_year).round()

# Drop rows that hold a negative value in any of the year-based columns.
# Missing values compare as False here, so rows with NaN are kept.
time_columns = ['age_first_funding_year',
                'age_last_funding_year',
                'age_first_milestone_year',
                'age_last_milestone_year',
                'age']
has_negative = (dataset[time_columns] < 0).any(axis=1)
dataset = dataset.loc[~has_negative].copy()

# Fill in the missing (NaN) milestone ages with 0.
milestone_columns = ['age_first_milestone_year', 'age_last_milestone_year']
dataset[milestone_columns] = dataset[milestone_columns].fillna(0)

In [None]:
"""Graphs and deeper analysis """

"""Graph of dependency of success (startup being acquired) based on age, funding years and milestones years"""

columns = ['age',
           'age_first_funding_year',
           'age_last_funding_year',
           'age_first_milestone_year',
           'age_last_milestone_year']

# A startup counts as a success when its label equals 1.
labels = dataset['labels']
is_success = labels.eq(1)

plt.figure(figsize=(15, 10))
for column in columns:
  # Bucket the column into whole years without copying the whole frame.
  year_bucket = dataset[column].round()

  # Successes and totals are computed over the same set of groups, so a bucket
  # without any success is plotted at 0 % instead of silently disappearing
  # (which made the line interpolate across it).
  total_success = is_success.groupby(year_bucket).sum()
  total = labels.groupby(year_bucket).count()
  success_rate = total_success.div(total).mul(100).round(2)

  plt.plot(success_rate.index, success_rate.values, label=column, linewidth=3)
plt.legend()
plt.xlabel('years')
plt.ylabel('succes rate [%]')
plt.title('Succes rate in comparison to age and funding and milestone year')
plt.show()

In [None]:
"""Graph of dependency of success (startup being acquired) based on relationships, funding and milestones"""

columns = ['relationships', 'funding_rounds', 'milestones']

# A startup counts as a success when its label equals 1.
labels = dataset['labels']
is_success = labels.eq(1)

plt.figure(figsize=(15, 10))
for column in columns:
  # Bucket the column into whole numbers without copying the whole frame.
  count_bucket = dataset[column].round()

  # Successes and totals are computed over the same set of groups, so a bucket
  # without any success is plotted at 0 % instead of silently disappearing
  # (which made the line interpolate across it).
  total_success = is_success.groupby(count_bucket).sum()
  total = labels.groupby(count_bucket).count()
  success_rate = total_success.div(total).mul(100).round(2)

  plt.plot(success_rate.index, success_rate.values, label=column, linewidth=3)
plt.legend()
# The x axis holds numbers of relationships / rounds / milestones, not years.
plt.xlabel('count')
plt.ylabel('succes rate [%]')
plt.title('Succes rate in comparison to number of relationships, funding_rounds and milestones')
plt.show()

In [None]:
"""Data distribution"""

# Numeric columns to inspect; the following cell reuses this list.
columns = [
    'age',
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    'avg_participants',
]

# One box plot per column on a 2 x 5 grid of axes.
fig, axs = plt.subplots(ncols=5, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for index, k in enumerate(columns):
    sns.boxplot(y=k, data=dataset, ax=axs[index])

In [None]:
# Histogram with a KDE overlay for every column listed in `columns`
# (the list is defined in the box-plot cell above), on a 2 x 5 grid.
fig, axs = plt.subplots(ncols=5, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for ax, (name, values) in zip(axs, dataset[columns].items()):
    # sns.distplot is deprecated; histplot with a density-scaled histogram and
    # a KDE curve is the replacement suggested by seaborn for this plot.
    sns.histplot(values, kde=True, stat='density', kde_kws={'cut': 3}, ax=ax)
plt.tight_layout()

In [None]:
"""Creating X and Y datasets"""

# Model inputs, grouped by theme: ages, activity counts, category flags,
# funding-type flags and the remaining indicators.
feature_columns = [
    'age', 'age_first_funding_year', 'age_last_funding_year',
    'age_first_milestone_year', 'age_last_milestone_year',
    'relationships', 'funding_rounds', 'funding_total_usd', 'milestones',
    'is_software', 'is_web', 'is_mobile', 'is_enterprise', 'is_advertising',
    'is_gamesvideo', 'is_ecommerce', 'is_biotech', 'is_consulting',
    'is_othercategory',
    'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC',
    'has_roundD',
    'avg_participants', 'is_top500',
]
X = dataset[feature_columns]
y = dataset['labels']

# 80/20 split, stratified on the target so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=68,
    stratify=y,
)

In [None]:
"""Random Forest - GridSearch"""

# Hyper-parameter grid explored for the random forest.
parameters = dict(
    n_estimators=[50, 100, 150],
    max_depth=[2, 4, 10, 12, 14],
    min_samples_split=[8, 12, 20],
    min_samples_leaf=[2, 4, 8],
)

# Shuffled, stratified 5-fold CV; the candidate with the best F1 is kept.
cv_stratify = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2),
    param_grid=parameters,
    scoring='f1',
    cv=cv_stratify,
)
rfc.fit(X_train, y_train)

print('RandomForest best params:', rfc.best_params_)
print('RandomForest best f1 score:', rfc.best_score_)

In [None]:
"""Results on a test dataset"""

# Score the best estimator found by the grid search on the held-out test set.
y_pred_test = rfc.best_estimator_.predict(X_test)

print('RandomForest metrics')
rf_metrics = (
    ('f1:', f1_score),
    ('accuracy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
)
for metric_label, metric in rf_metrics:
    print(metric_label, metric(y_test, y_pred_test))

In [None]:
# Explain the tuned random forest on the held-out test set.
explainer = shap.TreeExplainer(rfc.best_estimator_)
shap_values = explainer.shap_values(X_test)

# The following cells index shap_values[1] and expect one 2-D array per class.
# NOTE(review): newer shap releases appear to return a single 3-D array with
# the class on the last axis instead of a list - confirm against the installed
# version. Splitting it here keeps shap_values[1] meaning "class 1" either way.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, class_index]
                   for class_index in range(shap_values.shape[-1])]

In [None]:
""""Establishing Shapely value, which tells us how much specific parameter 
has contributed to the prediction of the target variable. The graph of contributions consists 
of list of parameters that are the most important to the model in an descending order. 
The lower the importance of a parameter, the lower the force it contributes to influencing target variable.
"""
# Bar-type summary plot for the values stored at index 1 of shap_values
# (presumably the positive class, labels == 1 - verify against the model's classes_).
class_one_shap = shap_values[1]
shap.summary_plot(class_one_shap, X_test, plot_type='bar')

In [None]:
"""Graph showing positive and negative relation to the target variable"""
# Default summary plot for the values stored at index 1 of shap_values
# (presumably the positive class, labels == 1 - verify against the model's classes_).
class_one_shap = shap_values[1]
shap.summary_plot(class_one_shap, X_test)

In [None]:
# Load shap's JavaScript so the interactive plot can render in the notebook.
shap.initjs()

# Base value and SHAP values at index 1; the call stays the last expression
# of the cell so that its result is displayed.
base_value = explainer.expected_value[1]
class_one_shap = shap_values[1]
shap.force_plot(base_value, class_one_shap, X_test)

In [None]:
# Second model: an XGBoost classifier constructed with no arguments (library
# defaults) and fitted on the same training split as the random forest.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

In [None]:
# Score the fitted XGBoost classifier on the held-out test set.
y_pred = xgb_classifier.predict(X_test)

print('XG Boost metrics')
xgb_metrics = (
    ('f1-', f1_score),
    ('accuracy-', accuracy_score),
    ('precision-', precision_score),
    ('recall-', recall_score),
)
for metric_label, metric in xgb_metrics:
    print(metric_label, metric(y_test, y_pred))

In [None]:
# Explain the XGBoost classifier on the test set.
# Note: this rebinds `explainer` and `shap_values`, replacing the objects the
# random-forest cells created; re-running those plot cells after this one
# would therefore use the XGBoost values. The following cells use
# `shap_values` without a class index.
explainer = shap.TreeExplainer(xgb_classifier)
shap_values = explainer.shap_values(X_test)

In [None]:
# Bar-type SHAP summary plot of the XGBoost explainer's values for X_test.
shap.summary_plot(shap_values, X_test, plot_type='bar')

In [None]:
# Default SHAP summary plot of the XGBoost explainer's values for X_test.
shap.summary_plot(shap_values, X_test)

In [None]:
# Load shap's JavaScript so the interactive plot can render in the notebook.
shap.initjs()

# The call stays the last expression of the cell so that its result is displayed.
base_value = explainer.expected_value
shap.force_plot(base_value, shap_values, X_test)

In [None]:
"""Models save (in pickle) to be reused in Heroku instantly"""
# Persist the tuned random forest with joblib.
best_forest = rfc.best_estimator_
dump(best_forest, "saved_rfc.pkl")

# NOTE(review): save_model presumably writes XGBoost's own model format rather
# than a pickle, despite the .pkl file name - confirm that the consumer loads
# this file with XGBoost's load_model and not with pickle/joblib.
xgb_classifier.save_model('saved_gbc.pkl')