In [140]:
import numpy as np
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from
# pandas / scikit-learn, so upcoming behaviour changes will not be reported.
warnings.filterwarnings('ignore')

In [141]:
# Read the training and test tables from CSV files in the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [142]:
# Preview the first ten rows of the raw training data.
train.head(10)

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,#fame,Media,10000000,operating,IND,16,Mumbai,Mumbai,1,,2015-01-05,2015-01-05
1,:Qounter,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2,2014-09-04,2014-03-01,2014-10-14
2,"(THE) ONE of THEM,Inc.",Apps|Games|Mobile,3406878,operating,,,,,1,,2014-01-30,2014-01-30
3,0-6.com,Curated Web,2000000,operating,CHN,22,Beijing,Beijing,1,2007-01-01,2008-03-19,2008-03-19
4,004 Technologies,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1,2010-01-01,2014-07-24,2014-07-24
5,01Games Technology,Games,41250,operating,HKG,,Hong Kong,Hong Kong,1,,2014-07-01,2014-07-01
6,Ondine Biomedical Inc.,Biotechnology,762851,operating,CAN,BC,Vancouver,Vancouver,2,1997-01-01,2009-09-11,2009-12-21
7,H2O.ai,Analytics,33600000,operating,USA,CA,SF Bay Area,Mountain View,4,2011-01-01,2013-01-03,2015-11-09
8,"1,2,3 Listo",E-Commerce,40000,operating,CHL,12,Santiago,Las Condes,1,2012-01-01,2013-02-18,2013-02-18
9,1-4 All,Entertainment|Games|Software,-,operating,USA,NC,NC - Other,Connellys Springs,1,,2013-04-21,2013-04-21


In [143]:
# Show column dtypes and non-null counts to see which columns have missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57616 entries, 0 to 57615
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               57615 non-null  object
 1   category_list      54468 non-null  object
 2   funding_total_usd  57616 non-null  object
 3   status             57616 non-null  object
 4   country_code       50658 non-null  object
 5   state_code         49069 non-null  object
 6   region             49586 non-null  object
 7   city               49588 non-null  object
 8   funding_rounds     57616 non-null  int64 
 9   founded_at         42395 non-null  object
 10  first_funding_at   57592 non-null  object
 11  last_funding_at    57616 non-null  object
dtypes: int64(1), object(11)
memory usage: 5.3+ MB


In [144]:
# Integer encoding of the target. Both "acquired" and "ipo" map to class 2.
# Series.map yields NaN for any status that is not a key of this mapping, so
# this cell is not idempotent: running it a second time turns the already
# encoded integers into NaN.
status_map = dict(closed=0, operating=1, acquired=2, ipo=2)
train["status"] = train["status"].map(status_map)

In [145]:
# The raw column is text: '-' stands for an unknown amount. Turn it into NaN
# so the column can be cast to float.
for df in [train, test]:
    df["funding_total_usd"] = (
        df["funding_total_usd"]
        .replace('-', np.nan)
        .astype(float)
    )

# Impute both frames with the median computed on the training data only, so no
# statistic from the test set enters the features.
median_funding = train["funding_total_usd"].median()
for df in [train, test]:
    # Assign the filled column back rather than calling
    # `df[col].fillna(..., inplace=True)`: that form operates on a temporary
    # Series, raises a FutureWarning in pandas 2.x and leaves `df` unchanged
    # once Copy-on-Write is active (the default from pandas 3.0).
    df["funding_total_usd"] = df["funding_total_usd"].fillna(median_funding)
    # log1p compresses the heavy right tail of funding amounts and is defined at 0.
    df["log_funding_total_usd"] = np.log1p(df["funding_total_usd"])


In [146]:
date_cols = ["founded_at", "first_funding_at", "last_funding_at"]

for df in [train, test]:
    # Parse the raw date strings; errors="coerce" turns anything unparseable
    # (and missing values) into NaT instead of raising.
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors="coerce")

    founded = df["founded_at"]
    first_round = df["first_funding_at"]
    last_round = df["last_funding_at"]

    # Calendar-year features.
    df["founded_year"] = founded.dt.year
    df["first_funding_year"] = first_round.dt.year
    df["last_funding_year"] = last_round.dt.year

    # Day-count features. Note that "age" is measured from founding up to the
    # last funding round, not up to a fixed reference date.
    df["age_days"] = (last_round - founded).dt.days
    df["funding_duration_days"] = (last_round - first_round).dt.days
    df["time_to_first_fund_days"] = (first_round - founded).dt.days

In [147]:
derived_cols = ["age_days", "funding_duration_days", "time_to_first_fund_days"]
for col in derived_cols:
    # Use the training median for both frames so the test set is imputed with
    # the same value the model sees during training.
    med = train[col].median()
    # Assign back instead of `frame[col].fillna(med, inplace=True)`: the chained
    # in-place call works on a temporary Series and does not update the frame
    # under pandas Copy-on-Write (default from pandas 3.0).
    train[col] = train[col].fillna(med)
    test[col] = test[col].fillna(med)

In [148]:
# Remove the raw datetime columns listed in date_cols from both frames.
train, test = (frame.drop(columns=date_cols) for frame in (train, test))

In [149]:
# Text columns whose missing values receive a placeholder in the next step.
cat_cols = ["category_list", "country_code", "state_code", "region", "city"]

In [150]:
# Replace missing text values with an explicit "Unknown" placeholder.
for df in [train, test]:
    for col in cat_cols:
        # Assign back instead of `df[col].fillna(..., inplace=True)`: the chained
        # in-place call acts on a temporary Series and leaves `df` untouched
        # under pandas Copy-on-Write (default from pandas 3.0).
        df[col] = df[col].fillna("Unknown")

In [151]:
from sklearn.preprocessing import MultiLabelBinarizer

def parse_categories(x):
    """Split a pipe-delimited category string into a list of clean labels.

    Parameters
    ----------
    x : str, list, or missing value
        Raw ``category_list`` cell, e.g. ``"Apps|Games|Mobile"``. A list is
        treated as already parsed.

    Returns
    -------
    list
        Stripped, non-empty category names. Missing values give ``[]``; a list
        input is returned unchanged.
    """
    # Test for a list BEFORE calling pd.isna(): pd.isna() on a list returns an
    # array, and using that array as an `if` condition raises ValueError for
    # lists with more than one element.
    if isinstance(x, list):
        return x
    if pd.isna(x):
        return []
    return [i.strip() for i in str(x).split("|") if i.strip()]

# Convert each pipe-delimited category string into a list of labels.
for df in [train, test]:
    df["category_list"] = df["category_list"].apply(parse_categories)

# Learn the label vocabulary from train and test together so both frames get
# exactly the same indicator columns.
mlb = MultiLabelBinarizer()
mlb.fit(pd.concat([train["category_list"], test["category_list"]], axis=0))
category_columns = [f"cat_{c}" for c in mlb.classes_]

# One 0/1 indicator column per category, aligned to each frame's index.
train_mlb = pd.DataFrame(
    mlb.transform(train["category_list"]),
    columns=category_columns,
    index=train.index,
)
test_mlb = pd.DataFrame(
    mlb.transform(test["category_list"]),
    columns=category_columns,
    index=test.index,
)

# Swap the list column for its indicator columns.
train = pd.concat([train.drop(columns=["category_list"]), train_mlb], axis=1)
test = pd.concat([test.drop(columns=["category_list"]), test_mlb], axis=1)


In [152]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['country_code', 'state_code', 'region', 'city']
label_encoders = {}

for col in cat_cols:
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)

    # Fit on the union of train and test values so transform() never meets a
    # label it has not seen.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], axis=0))

    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)
    # Keep the fitted encoder so the integer codes can be decoded later.
    label_encoders[col] = le

In [153]:
for df in [train, test]:
    # NOTE(review): the divisor is funding_rounds + 1 rather than funding_rounds,
    # so this is not a literal per-round average — confirm the +1 is intended.
    per_round = df["funding_total_usd"].div(df["funding_rounds"] + 1)
    df["funding_per_round"] = per_round
    df["log_funding_per_round"] = np.log1p(per_round)

In [154]:
for col in ["funding_per_round", "log_funding_per_round"]:
    # Impute both frames with the training median.
    med = train[col].median()
    # Assign back instead of `frame[col].fillna(med, inplace=True)`: the chained
    # in-place call works on a temporary Series and does not update the frame
    # under pandas Copy-on-Write (default from pandas 3.0).
    train[col] = train[col].fillna(med)
    test[col] = test[col].fillna(med)

In [155]:
# Remove the "name" column from both frames so it is not passed to the model.
train, test = (frame.drop(columns=["name"]) for frame in (train, test))

In [156]:
# Catch-all: any value still missing in either frame becomes 0.
# NOTE(review): this applies to every column of `train`, including "status",
# and to the *_year columns, where 0 is not a real year — confirm that is acceptable.
train, test = (frame.fillna(0) for frame in (train, test))

In [157]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
X = train.copy()
y = X.pop("status")

# Hold out 20% of the labelled rows for validation; the fixed seed makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Independent copy of the test features.
X_test = test.copy()

In [158]:
X_train.head()

Unnamed: 0,funding_total_usd,country_code,state_code,region,city,funding_rounds,log_funding_total_usd,founded_year,first_funding_year,last_funding_year,...,cat_Women,cat_Writers,cat_Young Adults,cat_iOS,cat_iPad,cat_iPhone,cat_iPod Touch,cat_mHealth,funding_per_round,log_funding_per_round
5121,2000000.0,31,7,96,367,1,14.508658,2011.0,2012.0,2012,...,0,0,0,0,0,0,0,0,1000000.0,13.815512
6017,980000.0,20,252,662,2931,2,13.795309,2013.0,2014.0,2015,...,0,0,0,0,0,0,0,0,326666.666667,12.696699
36809,2000000.0,42,239,1014,4633,1,14.508658,2008.0,2010.0,2010,...,0,0,0,0,0,0,0,0,1000000.0,13.815512
18453,2000000.0,92,44,682,3228,1,14.508658,1999.0,2004.0,2004,...,0,0,0,0,0,0,0,0,1000000.0,13.815512
1034,3750000.0,59,44,972,4463,3,15.137267,2006.0,2008.0,2011,...,0,0,0,0,0,0,0,0,937500.0,13.750973


In [159]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed for reproducibility; no other hyperparameter
# is set explicitly, so the library defaults apply.
model = RandomForestClassifier(random_state=42)

# Fit on the training split only; the validation split stays unseen.
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [160]:
from sklearn.metrics import f1_score

# scikit-learn metrics take (y_true, y_pred) in that order. Macro-averaged F1
# is symmetric in its two arguments, so the score is the same either way, but
# the documented order keeps this correct if the metric or averaging changes.
val_predictions = model.predict(X_val)
f1 = f1_score(y_val, val_predictions, average='macro')
f1

0.4564876571975351

In [165]:
# Predict the encoded status for every test row and wrap the predictions in a
# single-column frame.
test_predictions = model.predict(X_test)
submission = pd.DataFrame({'status': test_predictions})

In [166]:
# Display the submission frame as a quick visual check of the predictions.
submission

Unnamed: 0,status
0,1
1,1
2,1
3,1
4,1
...,...
8747,1
8748,1
8749,1
8750,1


In [167]:
import zipfile
import os

if not os.path.exists(os.path.join(os.getcwd(), 'startup.ipynb')):
    %notebook -e startup.ipynb

def compress(file_names):
    print("File Paths:")
    print(file_names)
    compression = zipfile.ZIP_DEFLATED
    with zipfile.ZipFile("result.zip", mode="w") as zf:
        for file_name in file_names:
            zf.write('./' + file_name, file_name, compress_type=compression)

submission.to_csv('submission.csv', index=False)

file_names = ['startup3.ipynb', 'submission.csv']
compress(file_names)

File Paths:
['startup3.ipynb', 'submission.csv']
