In [None]:
### Import bibliotek ###
# manipulacja danymi
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as p
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# nasze funkcje
import our_functions

In [None]:
# --- Load the raw data and remove unnecessary columns ---
# Keep a single row per company name, then drop the columns the analysis
# does not use.
columns_to_drop = ['Unnamed: 0',
                   'Unnamed: 6',
                   'latitude',
                   'longitude',
                   'state_code',
                   'state_code.1',
                   'zip_code',
                   'object_id']
dataset = (
    pd.read_csv('startup data.csv')
    .drop_duplicates(subset=['name'])
    .drop(columns=columns_to_drop)
)

# --- Compute each company's lifetime ("age") in years ---
time_columns = ['founded_at', 'closed_at', 'first_funding_at', 'last_funding_at']
for column in time_columns:
    dataset[column] = pd.to_datetime(dataset[column])

# Rows without a closing date get a fixed reference date instead.
# NOTE(review): 2013-12-31 is presumably the snapshot date of the data set —
# confirm against the data source.
reference_date = pd.Timestamp('2013-12-31')
dataset['closed_date'] = dataset['closed_at'].fillna(reference_date)

# Dividing by np.timedelta64(1, 'Y') raises in pandas >= 2.0 because the 'Y'
# unit is ambiguous. Use an explicit year length instead: 365.2425 days is the
# mean Gregorian year, the length older pandas versions used for 'Y'.
one_year = pd.Timedelta(days=365.2425)
dataset['age'] = ((dataset['closed_date'] - dataset['founded_at']) / one_year).round()

# --- Remove rows with negative ages ---
# A row is dropped when ANY of these columns is below zero. Comparisons with
# NaN are False, so rows with missing values are kept at this point.
age_columns = ['age_first_funding_year',
               'age_last_funding_year',
               'age_first_milestone_year',
               'age_last_milestone_year',
               'age']
has_negative_age = dataset[age_columns].lt(0).any(axis=1)
# .copy() so the column assignments below operate on an independent frame.
dataset = dataset.loc[~has_negative_age].copy()

# --- Fill NaN values ---
# Missing milestone ages are replaced with 0.
milestone_columns = ['age_first_milestone_year', 'age_last_milestone_year']
dataset[milestone_columns] = dataset[milestone_columns].fillna(0)

# Success rate per age bucket: count rows with labels == 1 for each age,
# count all rows for each age, then join the two on 'age' (inner join, so
# ages without any labels == 1 row do not appear in the result).
is_success = dataset['labels'] == 1
dataset_age_group = (
    dataset.loc[is_success]
    .groupby('age', as_index=False)
    .agg(total_succes=('labels', 'count'))
)

dataset_age_group_total = (
    dataset
    .groupby('age', as_index=False)
    .agg(total=('labels', 'count'))
)

dataset_age_group = dataset_age_group.merge(dataset_age_group_total, on='age')
# Percentage of successful rows within each age, rounded to two decimals.
dataset_age_group['succes_rate'] = (
    dataset_age_group['total_succes']
    .div(dataset_age_group['total'])
    .mul(100)
    .round(2)
)

In [None]:
# Columns used as model inputs; the target is the 'labels' column.
# NOTE(review): 'age' is computed from closed_at earlier in the notebook. If
# closed_at is only set for companies that closed, this feature may leak the
# target — verify before trusting the scores.
feature_columns = [
    'age',
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    'is_software',
    'is_web',
    'is_mobile',
    'is_enterprise',
    'is_advertising',
    'is_gamesvideo',
    'is_ecommerce',
    'is_biotech',
    'is_consulting',
    'is_othercategory',
    'has_VC',
    'has_angel',
    'has_roundA',
    'has_roundB',
    'has_roundC',
    'has_roundD',
    'avg_participants',
    'is_top500',
]
X = dataset[feature_columns]
y = dataset['labels']

# Hold out 20% of the rows for the final evaluation; the seed is fixed so the
# split is reproducible.
# NOTE(review): the split is not stratified although the cross-validation
# further down is — consider whether stratify=y is wanted here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=68,
)

In [None]:
# Hyper-parameter grid explored for the random forest.
parameters = {
    'n_estimators': [85, 90, 100],
    'max_depth': [None, 2, 4, 5, 10, 12, 14],
    'min_samples_split': [2, 4, 5],
    'min_samples_leaf': [1, 2, 4, 5],
}

# Shuffled 5-fold stratified cross-validation with a fixed seed.
cv_stratify = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

# Exhaustive search over the grid, ranking candidates by F1 score.
base_forest = RandomForestClassifier(random_state=2)
rfc = GridSearchCV(
    estimator=base_forest,
    param_grid=parameters,
    scoring='f1',
    cv=cv_stratify,
)
rfc.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
for search_result in (rfc.best_params_, rfc.best_score_):
    print(search_result)

In [None]:
# Score the best estimator found by the grid search on the held-out test set.
y_pred_test = rfc.best_estimator_.predict(X_test)

# Printed label -> metric function. Each metric is called as
# metric(y_true, y_pred). The 'accuracy-' label was previously misspelled.
test_metrics = {
    'f1-': f1_score,
    'accuracy-': accuracy_score,
    'precision-': precision_score,
    'recall-': recall_score,
}
for label, metric in test_metrics.items():
    print(label, metric(y_test, y_pred_test))

In [None]:
# Build a tree explainer for the best forest from the grid search and compute
# SHAP values for every row of the test set.
best_forest = rfc.best_estimator_
explainer = shap.TreeExplainer(best_forest)
shap_values = explainer.shap_values(X_test)

In [None]:
# Bar summary plot of the SHAP values for class 1.
# The return layout of TreeExplainer.shap_values depends on the SHAP version:
# older releases give a list with one array per class, newer ones a single
# array shaped (n_samples, n_features, n_classes). Indexing the new layout
# with [1] would select one sample rather than class 1, so handle both.
if isinstance(shap_values, list):
    positive_class_shap = shap_values[1]
elif np.ndim(shap_values) == 3:
    positive_class_shap = shap_values[:, :, 1]
else:
    # Single-output layout: already one value per sample and feature.
    positive_class_shap = shap_values
shap.summary_plot(positive_class_shap, X_test, plot_type='bar')