## Import Libraries

In [4]:

import pandas as pd
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import warnings
warnings.filterwarnings("ignore")

## Load Dataset

In [5]:
# Read the raw CSV and apply basic cleaning in one pass:
#   1. remove exact duplicate rows,
#   2. remove columns that contain no values at all,
#   3. strip stray whitespace from the column names (safety).
data = (
    pd.read_csv('startscan.csv')
    .drop_duplicates()
    .dropna(axis=1, how='all')
    .rename(columns=str.strip)
)

print("Dataset Shape:", data.shape)
data.head()

Dataset Shape: (923, 49)


Unnamed: 0.1,Unnamed: 0,state_code,latitude,longitude,zip_code,id,city,Unnamed: 6,name,labels,...,object_id,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,status
0,1005,CA,42.35888,-71.05682,92101,c:6669,San Diego,,Bandsintown,1,...,c:6669,0,1,0,0,0,0,1.0,0,acquired
1,204,CA,37.238916,-121.973718,95032,c:16283,Los Gatos,,TriCipher,1,...,c:16283,1,0,0,1,1,1,4.75,1,acquired
2,1001,CA,32.901049,-117.192656,92121,c:65620,San Diego,San Diego CA 92121,Plixi,1,...,c:65620,0,0,1,0,0,0,4.0,1,acquired
3,738,CA,37.320309,-122.05004,95014,c:42668,Cupertino,Cupertino CA 95014,Solidcore Systems,1,...,c:42668,0,0,0,1,1,1,3.3333,1,acquired
4,1002,CA,37.779281,-122.419236,94105,c:65806,San Francisco,San Francisco CA 94105,Inhale Digital,0,...,c:65806,1,1,0,0,0,0,1.0,1,closed


## Data Preparation: State Grouping

In [6]:
# `data` is already loaded and cleaned (duplicates dropped, empty columns
# removed, column names stripped) by the "Load Dataset" cell above, so the CSV
# is not read and cleaned a second time here.

# State grouping: keep the listed states as-is and bucket every other value
# (including a missing state code) under 'Other'.
top_states = ['CA', 'NY', 'MA', 'TX', 'WA']
data['state'] = data['state_code'].where(
    data['state_code'].isin(top_states), 'Other'
)

## Encoding the 'status' target variable

In [7]:
# Target variable encoding: 1 = acquired, 0 = closed.
STATUS_CODES = {'acquired': 1, 'closed': 0}

# Only map while the column still holds the raw text labels. Re-running this
# cell on an already-encoded column would map 0/1 to NaN, and the dropna below
# would then silently remove every row.
if not pd.api.types.is_numeric_dtype(data['status']):
    data['status'] = data['status'].map(STATUS_CODES)

# Rows whose status is missing (or was not one of the STATUS_CODES keys) have
# no usable label. .copy() yields an independent frame so the assignment below
# does not write into a slice of the previous one (SettingWithCopyWarning).
data = data.dropna(subset=['status']).copy()
data['status'] = data['status'].astype(int)

In [8]:
# Display the target column as integers to confirm the encoding.
status_as_int = data['status'].astype(int)
status_as_int

0      1
1      1
2      1
3      1
4      0
      ..
918    1
919    0
920    0
921    1
922    1
Name: status, Length: 923, dtype: int32

### Drop the 'labels' column

In [9]:
# Remove the 'labels' column when present; errors='ignore' makes this a no-op
# when the column does not exist.
data = data.drop(columns='labels', errors='ignore')

## Categorical Value Counting

### Handling Missing Values

In [10]:
# Per-column missing-value summary: absolute count and share of all rows.
missing_counts = data.isna().sum()
null = pd.DataFrame({
    "Null Values": missing_counts,
    "% Missing Values": missing_counts / len(data) * 100,
})

# Keep only the columns that actually have missing values, then render the
# table with a colour gradient.
null = null.loc[null["% Missing Values"] > 0]
null.style.background_gradient(cmap='viridis', low=0.2, high=0.1)

Unnamed: 0,Null Values,% Missing Values
Unnamed: 6,493,53.412784
closed_at,588,63.705309
age_first_milestone_year,152,16.468039
age_last_milestone_year,152,16.468039
state_code.1,1,0.108342


In [11]:
# Replace missing milestone ages with 0 in both milestone columns.
for milestone_col in ('age_first_milestone_year', 'age_last_milestone_year'):
    data[milestone_col] = data[milestone_col].fillna(0)

In [12]:
# Remove the duplicated state column when present; errors='ignore' makes this
# a no-op when the column does not exist.
data = data.drop(columns='state_code.1', errors='ignore')

## Clean the age columns

In [13]:
# Make sure both milestone age columns are stored as floats.
data = data.astype({
    'age_first_milestone_year': float,
    'age_last_milestone_year': float,
})

In [14]:
# Keep only the rows in which every age column is non-negative
# (a missing value in any of them also excludes the row).
age_columns = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year',
]
all_ages_non_negative = data[age_columns].ge(0).all(axis=1)
data = data[all_ages_non_negative]

## Create Model

In [15]:
# Columns that are not used by the model. Any name that is absent from `data`
# is skipped (errors='ignore'), so the cell is safe to run on a frame where
# some of them were already removed.
cols_to_drop = [
    # category indicators
    'category_code', 'is_software', 'is_web', 'is_mobile', 'is_enterprise',
    'is_advertising', 'is_gamesvideo', 'is_ecommerce', 'is_biotech',
    'is_consulting', 'is_othercategory',
    # location and identifiers
    'latitude', 'longitude', 'Unnamed: 0', 'state_code', 'zip_code', 'id',
    'city', 'Unnamed: 6', 'name',
    # dates and state indicators
    'founded_at', 'closed_at', 'is_CA', 'is_NY', 'is_MA', 'is_TX',
    'is_otherstate', 'object_id',
    # funding-round indicators
    'has_VC', 'has_angel', 'has_roundA', 'has_roundB', 'has_roundC',
    'has_roundD', 'is_top500',
    # funding dates
    'first_funding_at', 'last_funding_at',
]

data = data.drop(columns=cols_to_drop, errors='ignore')

In [24]:
from sklearn.model_selection import train_test_split

# Model inputs (numeric funding / milestone / relationship columns) and the
# encoded target column.
FEATURES = [
    'age_first_funding_year', 'age_last_funding_year',
    'age_first_milestone_year', 'age_last_milestone_year',
    'relationships', 'funding_rounds', 'funding_total_usd',
    'milestones', 'avg_participants',
]

X = data.loc[:, FEATURES]
y = data.loc[:, 'status']

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# NOTE: the Random Forest cell below performs its own 80/20 split and
# overwrites these four variables.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [18]:
# Metrics
from sklearn.metrics import accuracy_score, classification_report

## Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# FEATURES, X and y are defined once in the feature-definition cell above, and
# train_test_split / accuracy_score are imported by earlier cells. They are
# deliberately not repeated here so the two copies cannot drift apart.

# Split and model settings for this experiment.
RF_TEST_SIZE = 0.2
RF_SEED = 42
MODEL_PATH = 'random_forest_model.pkl'

# Fresh 80/20 split with a fixed seed (replaces the earlier 70/30 split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=RF_TEST_SIZE, random_state=RF_SEED
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    random_state=RF_SEED
)
rf.fit(X_train, y_train)

# Accuracy on the held-out test rows, reported as a percentage.
y_pred = rf.predict(X_test)
print("Final Accuracy:", round(accuracy_score(y_test, y_pred)*100, 2), "%")

# Persist the fitted model; joblib.dump returns the list of written file names,
# which is displayed as the cell output.
joblib.dump(rf, MODEL_PATH)

Final Accuracy: 79.76 %


['random_forest_model.pkl']