In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import random

In [44]:
df = pd.read_csv('startup.csv')

In [45]:
def str_to_float(row):
    if '-' in row:
        row = np.nan
    else:
        row = float(row)
    return row
        
df['funding_total_usd'] = df['funding_total_usd'].apply(str_to_float)
df['funding_total_usd'].dtypes

dtype('float64')

In [46]:
df['funding_total_usd'].aggregate([np.nanmean,np.nanmedian,np.nanstd])

nanmean      1.847860e+07
nanmedian    2.000000e+06
nanstd       1.880133e+08
Name: funding_total_usd, dtype: float64

In [47]:
funding_rounds_group = df.groupby('funding_rounds')['funding_total_usd'].aggregate(['count',np.nanmean,np.nanmedian,np.nanstd])

In [48]:
funding_filled = []

for h,i in enumerate(df['funding_total_usd']):
    if np.isnan(i):
        i = int(funding_rounds_group.loc[df['funding_rounds'].iloc[h],'nanmean'])
        funding_filled.append(i)
    else:
        funding_filled.append(int(i))
    
df['funding_filled'] = funding_filled

In [49]:
status = []

for i in df['status']:
    if i in ['acquired','ipo']:
        status.append('success')
    elif i == 'closed':
        status.append('fail')
    else:
        status.append('operating')
        
df['status_class'] = status

In [50]:
status_class = pd.get_dummies(df['status_class'])
df = pd.concat([df,status_class],axis=1)

In [51]:
country = []

for i in df.country_code:
  if not isinstance(i,str):
    country.append(np.random.choice(df.country_code[df.country_code.notnull()]))
  else:
    country.append(i)

df['country'] = country

In [52]:
# Original column
country_code_column = pd.DataFrame()
country_code_column['count'] = df.country_code.value_counts()
country_code_column['count/Total'] = df.country_code.value_counts()/df.country_code.count()
country_code_column['fail'] = df.groupby('country_code')['fail'].sum()/df.country_code.value_counts()
country_code_column['success'] = df.groupby('country_code')['success'].sum()/df.country_code.value_counts()
country_code_column = country_code_column[:10]

# Our column
country_column = pd.DataFrame()
country_column['count'] = df.country.value_counts()
country_column['count/Total'] = df.country.value_counts()/df.country.count()
country_column['fail'] = df.groupby('country')['fail'].sum()/df.country.value_counts()
country_column['success'] = df.groupby('country')['success'].sum()/df.country.value_counts()
country_column = country_column[:10]

# Let's see de difference between both columns
country_column - country_code_column

Unnamed: 0,count,count/Total,fail,success
USA,4367,-0.000554,0.016379,-0.009376
GBR,452,0.000302,0.018734,-0.002595
CAN,218,-0.000112,0.016849,-0.008791
IND,204,0.000257,0.0201,0.004073
CHN,191,0.000153,0.018925,-0.002941
FRA,129,-5.9e-05,0.020298,-0.004137
DEU,119,-0.000116,0.017928,-0.004528
ISR,116,4.5e-05,0.02289,-0.008683
ESP,93,8.5e-05,0.012836,-0.001773
AUS,45,-0.00021,0.009077,-0.002554


In [53]:
years = []

for fou, first in zip(df.founded_at, df.first_funding_at):
  if isinstance(fou,str):
    years.append(int(fou.split('-')[0]))
  elif not isinstance(fou,str) and isinstance(first,str):
    years.append(int(first.split('-')[0]))
  else:
    date = int(np.random.choice(df.founded_at[df.founded_at.notnull()]).split('-')[0])
    years.append(date)

df['year'] = years

In [54]:
main_category = []

for i in df['category_list']:
    if not isinstance(i,str):
      main_category.append('Other')
    else:
      main_category.append(i.split('|')[0])

df['main_category'] = main_category

In [55]:
df = df[['country','year','main_category','funding_rounds','funding_filled',
               'status_class']]

In [56]:
df.dropna(inplace=True)

In [57]:
unique_categories = df['main_category'].unique()
selected_categories = random.sample(unique_categories.tolist(), 50)

filtered_df = df[df['main_category'].isin(selected_categories)]

valid_selected_categories = []
for category in selected_categories:
    if (filtered_df['main_category'] == category).sum() > 1:
        valid_selected_categories.append(category)

filtered_df = filtered_df[filtered_df['main_category'].isin(valid_selected_categories)]

In [58]:
label_encoders = {}
categorical_cols = ['country', 'main_category']
for col in categorical_cols:
    le = LabelEncoder()
    filtered_df[col] = le.fit_transform(filtered_df[col])
    label_encoders[col] = le

In [59]:
import pandas as pd
import random
from imblearn.over_sampling import SMOTE

X = filtered_df.drop('status_class', axis=1)  # Features
y = filtered_df['status_class']  # Target variable

sm = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)
balanced_df = pd.concat([X_resampled, y_resampled], axis=1)
balanced_df.reset_index(drop=True, inplace=True)

   country  year  main_category  funding_rounds  funding_filled status_class
0       29  2008              3               1        10070591    operating
1        2  2012              2               3          440000    operating
2       23  2008             40               1          130636    operating
3       29  2012              6               1        10070591      success
4       65  2012             22               1          600000    operating


In [62]:
X = balanced_df[['country','year','main_category','funding_rounds','funding_filled']]
y = balanced_df['status_class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [63]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [64]:
pred = rfc.predict(X_test)

In [65]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, pred)
print(accuracy)
print(classification_report(y_test,pred))

0.8181818181818182
              precision    recall  f1-score   support

        fail       0.80      0.86      0.83       719
   operating       0.84      0.75      0.79       744
     success       0.82      0.85      0.84       726

    accuracy                           0.82      2189
   macro avg       0.82      0.82      0.82      2189
weighted avg       0.82      0.82      0.82      2189



In [66]:
import pickle
pickle.dump(rfc, open('model.pkl','wb'))

In [67]:
# Print the mapping of labels to encoded values
for col, le in label_encoders.items():
    print(f'{col} mapping:')
    for class_label, encoded_value in enumerate(le.classes_):
        print(f'{class_label} -> {encoded_value}')

country mapping:
0 -> ARE
1 -> ARG
2 -> AUS
3 -> AUT
4 -> BEL
5 -> BGR
6 -> BRA
7 -> CAN
8 -> CHE
9 -> CHL
10 -> CHN
11 -> COL
12 -> CRI
13 -> CYM
14 -> CZE
15 -> DEU
16 -> DNK
17 -> DOM
18 -> ECU
19 -> EGY
20 -> ESP
21 -> EST
22 -> FIN
23 -> FRA
24 -> GBR
25 -> GRD
26 -> HKG
27 -> HUN
28 -> IDN
29 -> IND
30 -> IRL
31 -> ISR
32 -> ITA
33 -> JOR
34 -> JPN
35 -> KOR
36 -> LAO
37 -> LBN
38 -> LTU
39 -> LUX
40 -> LVA
41 -> MEX
42 -> MMR
43 -> MYS
44 -> NLD
45 -> NOR
46 -> NZL
47 -> PAK
48 -> PAN
49 -> PER
50 -> PHL
51 -> POL
52 -> PRT
53 -> ROM
54 -> RUS
55 -> SGP
56 -> SLV
57 -> SRB
58 -> SVN
59 -> SWE
60 -> THA
61 -> TUR
62 -> TWN
63 -> UKR
64 -> URY
65 -> USA
66 -> VNM
67 -> ZAF
main_category mapping:
0 -> Adventure Travel
1 -> All Students
2 -> Audio
3 -> Beauty
4 -> Call Center Automation
5 -> Cannabis
6 -> Chat
7 -> Cloud-Based Music
8 -> Collaborative Consumption
9 -> College Recruiting
10 -> Colleges
11 -> Comics
12 -> Communications Infrastructure
13 -> Console Gaming
14 -> Contac