In [1]:
!pip install pandas requests



In [2]:
import pandas as pd
import requests

url = 'https://raw.githubusercontent.com/KasperLoos/ConsultElement/main/data/bronze_data/startup_failures.csv'
response = requests.get(url)

# Save the content of the file
with open('startup_failures.csv', 'wb') as file:
    file.write(response.content)

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('startup_failures.csv')

df['founded_at'] = pd.to_datetime(df['founded_at'], errors='coerce')
df['first_funding_at'] = pd.to_datetime(df['first_funding_at'], errors='coerce')
df['last_funding_at'] = pd.to_datetime(df['last_funding_at'], errors='coerce')
df['funding_total_usd'] = pd.to_numeric(df['funding_total_usd'].replace({'$': '', ',': ''}, regex=True), errors='coerce')

In [3]:
df

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000.0,operating,IND,16,Mumbai,Mumbai,1,NaT,2015-01-05,2015-01-05
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000.0,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878.0,operating,,,,,1,NaT,2014-01-30,2014-01-30
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000.0,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008-03-19,2008-03-19
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014-07-24,2014-07-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66363,/organization/zznode-science-and-technology-co...,ZZNode Science and Technology,http://www.zznode.com,Enterprise Software,1587301.0,operating,CHN,22,Beijing,Beijing,1,NaT,2012-04-01,2012-04-01
66364,/organization/zzzzapp-com,Zzzzapp Wireless ltd.,http://www.zzzzapp.com,Advertising|Mobile|Web Development|Wireless,114304.0,operating,HRV,15,Split,Split,4,2012-05-13,2011-11-01,2014-03-01
66365,/organization/Áeron,ÁERON,http://www.aeron.hu/,,,operating,,,,,1,2011-01-01,2014-08-01,2014-08-01
66366,/organization/Ôasys-2,Ôasys,http://www.oasys.io/,Consumer Electronics|Internet of Things|Teleco...,18192.0,operating,USA,CA,SF Bay Area,San Francisco,1,2014-01-01,2015-01-01,2015-01-01


In [4]:
# Calculate the time difference between 'founded_at' and 'first_funding_at'
df['time_before_first_funding'] = df['first_funding_at'] - df['founded_at']

# Calculate the time difference between 'first_funding_at' and 'last_funding_at'
df['time_between_first_last_funding'] = df['last_funding_at'] - df['first_funding_at']
df['founded_year'] = df['founded_at'].dt.year
df['founded_month'] = df['founded_at'].dt.month
df['funding_year'] = df['first_funding_at'].dt.year
df['funding_month'] = df['first_funding_at'].dt.month

In [5]:
df['average_funding'] = df['funding_total_usd'] / df['funding_rounds']

In [6]:
df['failed'] = df['status'].apply(lambda x: 1 if x == 'closed' else 0)
df['has_website'] = df['homepage_url'].apply(lambda x: 1 if pd.notna(x) and x.strip() != '' else 0)
df

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,...,last_funding_at,time_before_first_funding,time_between_first_last_funding,founded_year,founded_month,funding_year,funding_month,average_funding,failed,has_website
0,/organization/-fame,#fame,http://livfame.com,Media,10000000.0,operating,IND,16,Mumbai,Mumbai,...,2015-01-05,NaT,0 days,,,2015.0,1.0,10000000.0,0,1
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000.0,operating,USA,DE,DE - Other,Delaware City,...,2014-10-14,-187 days,227 days,2014.0,9.0,2014.0,3.0,350000.0,0,1
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878.0,operating,,,,,...,2014-01-30,NaT,0 days,,,2014.0,1.0,3406878.0,0,1
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000.0,operating,CHN,22,Beijing,Beijing,...,2008-03-19,443 days,0 days,2007.0,1.0,2008.0,3.0,2000000.0,0,1
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,,operating,USA,IL,"Springfield, Illinois",Champaign,...,2014-07-24,1665 days,0 days,2010.0,1.0,2014.0,7.0,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66363,/organization/zznode-science-and-technology-co...,ZZNode Science and Technology,http://www.zznode.com,Enterprise Software,1587301.0,operating,CHN,22,Beijing,Beijing,...,2012-04-01,NaT,0 days,,,2012.0,4.0,1587301.0,0,1
66364,/organization/zzzzapp-com,Zzzzapp Wireless ltd.,http://www.zzzzapp.com,Advertising|Mobile|Web Development|Wireless,114304.0,operating,HRV,15,Split,Split,...,2014-03-01,-194 days,851 days,2012.0,5.0,2011.0,11.0,28576.0,0,1
66365,/organization/Áeron,ÁERON,http://www.aeron.hu/,,,operating,,,,,...,2014-08-01,1308 days,0 days,2011.0,1.0,2014.0,8.0,,0,1
66366,/organization/Ôasys-2,Ôasys,http://www.oasys.io/,Consumer Electronics|Internet of Things|Teleco...,18192.0,operating,USA,CA,SF Bay Area,San Francisco,...,2015-01-01,365 days,0 days,2014.0,1.0,2015.0,1.0,18192.0,0,1


In [7]:
df['funding_total_usd'].fillna(df['funding_total_usd'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['funding_total_usd'].fillna(df['funding_total_usd'].mean(), inplace=True)


In [8]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [9]:
df

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,...,last_funding_at,time_before_first_funding,time_between_first_last_funding,founded_year,founded_month,funding_year,funding_month,average_funding,failed,has_website
0,/organization/-fame,#fame,http://livfame.com,Media,1.000000e+07,operating,IND,16,Mumbai,Mumbai,...,2015-01-05,NaT,0 days,,,2015.0,1.0,10000000.0,0,1
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,7.000000e+05,operating,USA,DE,DE - Other,Delaware City,...,2014-10-14,-187 days,227 days,2014.0,9.0,2014.0,3.0,350000.0,0,1
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3.406878e+06,operating,,,,,...,2014-01-30,NaT,0 days,,,2014.0,1.0,3406878.0,0,1
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2.000000e+06,operating,CHN,22,Beijing,Beijing,...,2008-03-19,443 days,0 days,2007.0,1.0,2008.0,3.0,2000000.0,0,1
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,1.847860e+07,operating,USA,IL,"Springfield, Illinois",Champaign,...,2014-07-24,1665 days,0 days,2010.0,1.0,2014.0,7.0,,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66363,/organization/zznode-science-and-technology-co...,ZZNode Science and Technology,http://www.zznode.com,Enterprise Software,1.587301e+06,operating,CHN,22,Beijing,Beijing,...,2012-04-01,NaT,0 days,,,2012.0,4.0,1587301.0,0,1
66364,/organization/zzzzapp-com,Zzzzapp Wireless ltd.,http://www.zzzzapp.com,Advertising|Mobile|Web Development|Wireless,1.143040e+05,operating,HRV,15,Split,Split,...,2014-03-01,-194 days,851 days,2012.0,5.0,2011.0,11.0,28576.0,0,1
66365,/organization/Áeron,ÁERON,http://www.aeron.hu/,,1.847860e+07,operating,,,,,...,2014-08-01,1308 days,0 days,2011.0,1.0,2014.0,8.0,,0,1
66366,/organization/Ôasys-2,Ôasys,http://www.oasys.io/,Consumer Electronics|Internet of Things|Teleco...,1.819200e+04,operating,USA,CA,SF Bay Area,San Francisco,...,2015-01-01,365 days,0 days,2014.0,1.0,2015.0,1.0,18192.0,0,1


In [10]:
df = df[['category_list','funding_total_usd','country_code','state_code','region','city', 'funding_rounds', 'time_before_first_funding', 'time_between_first_last_funding', 'founded_year', 'founded_month', 'funding_year', 'funding_month', 'average_funding', 'has_website', 'failed']]

In [11]:
df['time_before_first_funding'] = df['time_before_first_funding'].dt.days
df['time_between_first_last_funding'] = df['time_between_first_last_funding'].dt.days
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_before_first_funding'] = df['time_before_first_funding'].dt.days
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_between_first_last_funding'] = df['time_between_first_last_funding'].dt.days


Unnamed: 0,category_list,funding_total_usd,country_code,state_code,region,city,funding_rounds,time_before_first_funding,time_between_first_last_funding,founded_year,founded_month,funding_year,funding_month,average_funding,has_website,failed
0,Media,1.000000e+07,IND,16,Mumbai,Mumbai,1,,0.0,,,2015.0,1.0,10000000.0,1,0
1,Application Platforms|Real Time|Social Network...,7.000000e+05,USA,DE,DE - Other,Delaware City,2,-187.0,227.0,2014.0,9.0,2014.0,3.0,350000.0,1,0
2,Apps|Games|Mobile,3.406878e+06,,,,,1,,0.0,,,2014.0,1.0,3406878.0,1,0
3,Curated Web,2.000000e+06,CHN,22,Beijing,Beijing,1,443.0,0.0,2007.0,1.0,2008.0,3.0,2000000.0,1,0
4,Software,1.847860e+07,USA,IL,"Springfield, Illinois",Champaign,1,1665.0,0.0,2010.0,1.0,2014.0,7.0,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66363,Enterprise Software,1.587301e+06,CHN,22,Beijing,Beijing,1,,0.0,,,2012.0,4.0,1587301.0,1,0
66364,Advertising|Mobile|Web Development|Wireless,1.143040e+05,HRV,15,Split,Split,4,-194.0,851.0,2012.0,5.0,2011.0,11.0,28576.0,1,0
66365,,1.847860e+07,,,,,1,1308.0,0.0,2011.0,1.0,2014.0,8.0,,1,0
66366,Consumer Electronics|Internet of Things|Teleco...,1.819200e+04,USA,CA,SF Bay Area,San Francisco,1,365.0,0.0,2014.0,1.0,2015.0,1.0,18192.0,1,0


In [12]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Assuming df is already loaded and preprocessed with the required columns

# Define the features (X) and the target (y)
X = df[['category_list', 'funding_total_usd', 'country_code', 'state_code', 'region', 'city', 
        'funding_rounds', 'time_before_first_funding', 'time_between_first_last_funding', 
        'founded_year', 'founded_month', 'funding_year', 'funding_month', 'average_funding', 
        'has_website']]
y = df['failed']


# Handle categorical columns by label encoding
categorical_columns = ['category_list', 'country_code', 'state_code', 'region', 'city']
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].fillna('Unknown'))  # Ensure there are no NaN values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
param_grid = {
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=xgb.XGBClassifier(eval_metric='logloss'),
                           param_grid=param_grid,
                           cv=3, scoring='accuracy', verbose=1)

grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

# Create the XGBoost model
model = xgb.XGBClassifier(scale_pos_weight=(len(y) - sum(y)) / sum(y), eval_metric='logloss')

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].fillna('Unknown'))  # Ensure there are no NaN values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].fillna('Unknown'))  # Ensure there are no NaN values
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = le.fit_transform(X[col].

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'subsample': 0.8}
Accuracy: 0.7799
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.80      0.87     11961
           1       0.25      0.60      0.35      1313

    accuracy                           0.78     13274
   macro avg       0.60      0.70      0.61     13274
weighted avg       0.88      0.78      0.82     13274



In [13]:
# Get predicted probabilities for the test set
y_pred_prob = model.predict_proba(X_test)[:, 1]  # Probability for the 'failed' class (class 1)

# Adjust the threshold for predicting 'failed=1'
y_pred_adjusted = (y_pred_prob > 0.2).astype(int)

# Evaluate the model using the adjusted predictions
print("Classification Report (with adjusted threshold):")
print(classification_report(y_test, y_pred_adjusted))

Classification Report (with adjusted threshold):
              precision    recall  f1-score   support

           0       0.97      0.53      0.68     11961
           1       0.16      0.85      0.27      1313

    accuracy                           0.56     13274
   macro avg       0.57      0.69      0.48     13274
weighted avg       0.89      0.56      0.64     13274

