In [None]:
### Library imports ###
# data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as p
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# our own functions (project-local module)
import our_functions

In [None]:
# Load the raw startup dataset.
dataset = pd.read_csv('startup data.csv')

# --- Drop redundant rows and columns ---
# Keep a single row per startup name, then remove the index/location/identifier
# columns that the rest of this notebook does not read.
dataset = dataset.drop_duplicates(subset=['name'])
dataset = dataset.drop(columns=['Unnamed: 0',
                                'Unnamed: 6',
                                'latitude',
                                'longitude',
                                'state_code',
                                'state_code.1',
                                'zip_code',
                                'object_id'])

# --- Project lifetime in years ---
date_columns = ['founded_at', 'closed_at', 'first_funding_at', 'last_funding_at']
for column in date_columns:
    dataset[column] = pd.to_datetime(dataset[column])

# Startups without a closing date are measured up to a fixed reference date.
# A real Timestamp (not a string) keeps the column dtype datetime64.
dataset['closed_date'] = dataset['closed_at'].fillna(pd.Timestamp('2013-12-31'))

# Dividing by np.timedelta64(1, 'Y') is rejected by pandas >= 2.0 because the
# 'Y' unit has no fixed length. Use the mean Gregorian year (365.2425 days),
# which is the length the 'Y' unit was previously converted to.
one_year = pd.Timedelta(days=365.2425)
dataset['age'] = ((dataset['closed_date'] - dataset['founded_at']) / one_year).round()

# --- Remove rows with negative values ---
# A negative age/offset is inconsistent data (e.g. funding before founding).
# NaN compares as False here, so rows with missing values are kept.
age_columns = ['age_first_funding_year',
               'age_last_funding_year',
               'age_first_milestone_year',
               'age_last_milestone_year',
               'age']
has_negative_age = (dataset[age_columns] < 0).any(axis=1)
dataset = dataset.drop(index=dataset.index[has_negative_age])

# --- Fill NaN values ---
# Missing milestone ages are replaced with 0.
milestone_columns = ['age_first_milestone_year', 'age_last_milestone_year']
dataset[milestone_columns] = dataset[milestone_columns].fillna(0)

# --- Success rate per age (in whole years) ---
# Number of rows with labels == 1 for each age ...
dataset_age_group = (
    dataset[dataset['labels'] == 1]
    .groupby(['age'])
    .agg({'labels': 'count'})
    .reset_index()
)
dataset_age_group.columns = ['age', 'total_succes']

# ... and number of all rows for each age.
dataset_age_group_total = dataset.groupby(['age']).agg({'labels': 'count'}).reset_index()
dataset_age_group_total.columns = ['age', 'total']

# Inner merge: ages without any labels == 1 row do not appear in the result.
dataset_age_group = dataset_age_group.merge(dataset_age_group_total, on='age')
dataset_age_group['succes_rate'] = (
    dataset_age_group['total_succes'] / dataset_age_group['total'] * 100
).round(2)

In [None]:
# Analysis and plots
#
# Success rate as a function of startup age, the year of the first and last
# funding, and the year of the first and last milestone.

columns = ['age',
           'age_first_funding_year',
           'age_last_funding_year',
           'age_first_milestone_year',
           'age_last_milestone_year']

plt.figure(figsize=(15, 10))
for column in columns:
    # Work on a copy whose feature is rounded to whole years.
    yearly = dataset.assign(**{column: dataset[column].round()})

    # Rows with labels == 1 per year bucket.
    successful_per_year = (
        yearly[yearly['labels'] == 1]
        .groupby(column)['labels']
        .count()
        .rename('total_succes')
        .reset_index()
    )
    # All rows per year bucket.
    total_per_year = (
        yearly.groupby(column)['labels']
        .count()
        .rename('total')
        .reset_index()
    )

    # Inner merge keeps only buckets that contain at least one labels == 1 row.
    rates = successful_per_year.merge(total_per_year, on=column)
    rates['succes_rate'] = (rates['total_succes'] / rates['total'] * 100).round(2)

    plt.plot(rates[column], rates['succes_rate'], label=column, linewidth=3)

plt.legend()
plt.xlabel('years')
plt.ylabel('succes rate [%]')
plt.title('succes rate compare to age and funding and milestone year')
plt.show()

In [None]:
# Success rate as a function of the number of relationships, funding rounds
# and milestones.

columns = ['relationships', 'funding_rounds', 'milestones']

plt.figure(figsize=(15, 10))
for column in columns:
    # Rounded copy of the feature; avoids mutating `dataset` itself.
    counted = dataset.assign(**{column: dataset[column].round()})

    # Rows with labels == 1 per feature value.
    successful_per_value = (
        counted[counted['labels'] == 1]
        .groupby(column)['labels']
        .count()
        .rename('total_succes')
        .reset_index()
    )
    # All rows per feature value.
    total_per_value = (
        counted.groupby(column)['labels']
        .count()
        .rename('total')
        .reset_index()
    )

    # Inner merge keeps only values that have at least one labels == 1 row.
    rates = successful_per_value.merge(total_per_value, on=column)
    rates['succes_rate'] = (rates['total_succes'] / rates['total'] * 100).round(2)

    plt.plot(rates[column], rates['succes_rate'], label=column, linewidth=3)

plt.legend()
# The x-axis holds counts (relationships / rounds / milestones), not years.
plt.xlabel('count')
plt.ylabel('succes rate [%]')
plt.title('succes rate compare to number of relationships, funding_rounds and milestones')
plt.show()

In [None]:
# Inspect the distribution of the numeric features.

# Reused by the distribution-plot cell that follows.
columns = ['age',
           'age_first_funding_year',
           'age_last_funding_year',
           'age_first_milestone_year',
           'age_last_milestone_year',
           'relationships',
           'funding_rounds',
           'funding_total_usd',
           'milestones',
           'avg_participants']

# One box plot per feature on a 2 x 5 grid.
fig, axs = plt.subplots(ncols=5, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for axis, column_name in zip(axs, columns):
    sns.boxplot(y=column_name, data=dataset, ax=axis)

In [None]:
# Histogram with a KDE overlay for every feature listed in `columns`
# (defined in the box-plot cell above), laid out on a 2 x 5 grid.
#
# `sns.distplot` is deprecated and scheduled for removal from seaborn;
# `histplot(kde=True, stat='density', kde_kws={'cut': 3})` is the replacement
# given in the seaborn migration notes.
fig, axs = plt.subplots(ncols=5, nrows=2, figsize=(20, 10))
axs = axs.flatten()
for axis, column_name in zip(axs, columns):
    sns.histplot(dataset[column_name], kde=True, stat='density',
                 kde_kws={'cut': 3}, ax=axis)
plt.tight_layout()

In [None]:
# Build the feature matrix, the target vector and the train/test split.

feature_columns = [
    # ages and funding/milestone timing
    'age',
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
    # activity counts and funding volume
    'relationships',
    'funding_rounds',
    'funding_total_usd',
    'milestones',
    # category indicator columns
    'is_software',
    'is_web',
    'is_mobile',
    'is_enterprise',
    'is_advertising',
    'is_gamesvideo',
    'is_ecommerce',
    'is_biotech',
    'is_consulting',
    'is_othercategory',
    # funding-type indicator columns
    'has_VC',
    'has_angel',
    'has_roundA',
    'has_roundB',
    'has_roundC',
    'has_roundD',
    # remaining features
    'avg_participants',
    'is_top500',
]

X = dataset[feature_columns]
y = dataset['labels']

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=68,
)

In [None]:
# Random Forest - grid search over the hyper-parameters below, scored by F1.

rf_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [2, 4, 10, 12, 14],
    'min_samples_split': [8, 12, 20],
    'min_samples_leaf': [2, 4, 8],
}

# Shuffled, stratified 5-fold CV; also reused by the gradient boosting search.
cv_stratify = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2),
    param_grid=rf_param_grid,
    cv=cv_stratify,
    scoring='f1',
)
rfc.fit(X_train, y_train)

print('RandomForest best params:', rfc.best_params_)
print('RandomForest best f1 score:', rfc.best_score_)

In [None]:
# Metric results on the test set for the best random forest.

y_pred_test = rfc.best_estimator_.predict(X_test)

print('RandomForest metrics')
# (printed label, metric function) pairs, reported in this order.
rf_test_metrics = (
    ('f1:', f1_score),
    ('accuarcy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
)
for metric_label, metric_fn in rf_test_metrics:
    print(metric_label, metric_fn(y_test, y_pred_test))

In [None]:
# Explain the tuned random forest with SHAP's tree explainer and compute the
# SHAP values for every row of the test set.
best_random_forest = rfc.best_estimator_
explainer = shap.TreeExplainer(best_random_forest)
shap_values = explainer.shap_values(X_test)

In [None]:
# Shapley values tell how much each feature contributed to the prediction of
# the target variable. The feature-importance bar chart lists the most
# significant features in descending order: the top features contribute the
# most to the model, and the lower a feature appears, the weaker its
# predictive power.

# Older SHAP releases return a list with one array per class, newer ones a
# single (samples, features, classes) array. Select class 1 in both cases;
# plain `shap_values[1]` would pick sample 1 from the array form.
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
else:
    shap_values_positive = np.asarray(shap_values)[..., 1]

shap.summary_plot(shap_values_positive, X_test, plot_type='bar')

In [None]:
# Summary plot that additionally shows the positive and negative relationships
# of the feature values with the target variable.

# Older SHAP releases return a list with one array per class, newer ones a
# single (samples, features, classes) array. Select class 1 in both cases;
# plain `shap_values[1]` would pick sample 1 from the array form.
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
else:
    shap_values_positive = np.asarray(shap_values)[..., 1]

shap.summary_plot(shap_values_positive, X_test)

In [None]:
# Interactive force plot of the class-1 SHAP values for the whole test set.
# initjs() loads the JavaScript needed to render the plot in the notebook.
shap.initjs()

# Older SHAP releases return a list with one array per class, newer ones a
# single (samples, features, classes) array. Select class 1 in both cases;
# plain `shap_values[1]` would pick sample 1 from the array form.
if isinstance(shap_values, list):
    shap_values_positive = shap_values[1]
else:
    shap_values_positive = np.asarray(shap_values)[..., 1]

# Kept as the last expression so the notebook renders the returned plot.
shap.force_plot(explainer.expected_value[1], shap_values_positive, X_test)

In [None]:
# Gradient Boosting - grid search over the hyper-parameters below, scored by
# F1. Reuses the `cv_stratify` splitter created in the random forest cell.

gb_param_grid = {
    'learning_rate': [0.1, 0.2, 0.4],
    'n_estimators': [50, 100, 150],
    'max_depth': [2, 4, 8],
    'min_samples_split': [8, 12, 20],
    'min_samples_leaf': [2, 4],
}

gbc = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=2),
    param_grid=gb_param_grid,
    cv=cv_stratify,
    scoring='f1',
)
gbc.fit(X_train, y_train)

print('GradientBoosting best params:', gbc.best_params_)
print('GradientBoosting best f1 score:', gbc.best_score_)

In [None]:
# Metric results on the test set for the best gradient boosting model.
y_pred = gbc.best_estimator_.predict(X_test)

print('Gradient Boosting metrics')
# (printed label, metric function) pairs, reported in this order.
gb_test_metrics = (
    ('f1:', f1_score),
    ('accuarcy:', accuracy_score),
    ('precision:', precision_score),
    ('recall:', recall_score),
)
for metric_label, metric_fn in gb_test_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Explain the tuned gradient boosting model. This rebinds `explainer` and
# `shap_values`, replacing the random forest results computed earlier.
best_gradient_boosting = gbc.best_estimator_
explainer = shap.TreeExplainer(best_gradient_boosting)
shap_values = explainer.shap_values(X_test)

In [None]:
# Feature-importance bar chart for the gradient boosting model. The SHAP values
# are passed as-is here (no per-class indexing, unlike the random forest cells).
shap.summary_plot(shap_values, X_test, plot_type='bar')

In [None]:
# Default SHAP summary plot for the gradient boosting model, using the SHAP
# values and the test-set features computed above.
shap.summary_plot(shap_values, X_test)

In [None]:
# Interactive force plot for the gradient boosting model over the test set.
# initjs() loads the JavaScript needed to render the plot in the notebook.
shap.initjs()
# Last expression of the cell, so the notebook renders the returned plot.
shap.force_plot(explainer.expected_value, shap_values, X_test)

In [None]:
# Persist both tuned models as a two-element list: [random forest, gradient
# boosting]. The `with` block closes the file even if pickling fails; the
# original left the handle returned by open() unclosed.
# NOTE: only unpickle this file from a trusted source - pickle can execute
# arbitrary code on load.
with open("startup_prediction_saved.p", "wb") as model_file:
    p.dump([rfc.best_estimator_, gbc.best_estimator_], model_file)