In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path


In [None]:
# Paths
# Input CSV plus the two CSV outputs written by the export cell.
# NOTE(review): the 'secsees' spelling is kept verbatim -- presumably it matches
# the file name on disk; confirm before "fixing" it.
data_path, clean_output, model_output = map(
    Path,
    (
        "big_startup_secsees_dataset.csv",
        "clean_df.csv",
        "preprocessed_data.csv",
    ),
)


In [None]:
# Load data
# Read the raw CSV located at `data_path` into the working DataFrame `df`.
df = pd.read_csv(filepath_or_buffer=data_path)


In [None]:
# Initial inspection
# Column dtypes / non-null counts (info() prints directly).
df.info()

# Summary statistics across every column, numeric and non-numeric alike.
summary_stats = df.describe(include='all')
print(summary_stats)

# First rows of the raw frame.
first_rows = df.head()
print(first_rows)


In [None]:
# Nulls, duplicates, and target creation
# Per-column count of missing values.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")

# Binary target: 0 when status is exactly 'closed', 1 for every other value
# (a missing status also compares "not equal" and therefore maps to 1).
df['success'] = df['status'].ne('closed').astype('int64')


In [None]:
# Drop unnecessary columns
# 'status' is removed here; the 'success' target derived from it is kept.
unused_columns = ['permalink', 'homepage_url', 'name', 'status', 'state_code']
df.drop(columns=unused_columns, inplace=True)


In [None]:
# Handle missing values
# Categorical / location columns: replace missing entries with the literal 'Unknown'.
fill_cols = ['category_list', 'country_code', 'region', 'city']
for col in fill_cols:
    df[col] = df[col].fillna('Unknown')

# Rows without a first-funding date are removed from the frame.
first_funding_required = ['first_funding_at']
df.dropna(subset=first_funding_required, inplace=True)


In [None]:
# Clean funding values
# '-' placeholders are turned into NaN first; any value that still cannot be
# parsed as a number is coerced to NaN as well.
raw_funding = df['funding_total_usd']
df['funding_total_usd'] = pd.to_numeric(
    raw_funding.replace('-', np.nan),
    errors='coerce',
)


In [None]:
# Convert date columns and extract date features
# Parse the three date columns; unparseable values become NaT instead of raising.
date_columns = ['founded_at', 'first_funding_at', 'last_funding_at']
for date_col in date_columns:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Calendar year of each milestone (new column -> source date column).
year_features = {
    'founded_year': 'founded_at',
    'first_funding_year': 'first_funding_at',
    'last_funding_year': 'last_funding_at',
}
for year_col, date_col in year_features.items():
    df[year_col] = df[date_col].dt.year

# Whole days from founding to first funding; negative when the first funding
# date precedes the founding date.
founding_to_funding = df['first_funding_at'] - df['founded_at']
df['days_to_first_funding'] = founding_to_funding.dt.days

# Whole days between the first and the last funding date.
funding_span = df['last_funding_at'] - df['first_funding_at']
df['funding_duration'] = funding_span.dt.days


In [None]:
# Remove funding outliers > $5B
# Keep rows whose total funding is at most $5B. NaN compares as False, so rows
# with missing funding are removed here as well.
funding_cap_usd = 5_000_000_000
within_cap = df['funding_total_usd'].le(funding_cap_usd)
df = df[within_cap]


In [None]:
# Remove implausible founding years
# Keep rows founded between 1990 and 2015 inclusive; rows with a missing
# founded_year fail the range test and are removed too.
plausible_year = df['founded_year'].between(1990, 2015)

# .copy() makes `df` an independent frame rather than a slice derived from the
# previous one, so the in-place dropna and the new-column assignments that
# follow operate on a real frame and do not raise SettingWithCopyWarning.
df = df[plausible_year].copy()


In [None]:
# Drop remaining nulls in engineered columns
# Date parsing with errors='coerce' can introduce NaT, which propagates into the
# derived year / day-difference columns; drop any row missing one of them.
required_columns = [
    'first_funding_at',
    'first_funding_year',
    'days_to_first_funding',
    'funding_duration',
]
df.dropna(axis=0, how='any', subset=required_columns, inplace=True)


In [None]:
# Simplify category_list into top N or "Other"
# The 10 most frequent category_list values are kept as-is; every other value
# is collapsed into the single label 'Other'.
category_counts = df['category_list'].value_counts()
top_categories = category_counts.nlargest(10).index

is_top_category = df['category_list'].isin(top_categories)
df['category_grouped'] = df['category_list'].where(is_top_category, 'Other')


In [None]:
# TF-IDF + KMeans clustering of category_list
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Vectorise each distinct category_list string once (unigrams + bigrams,
# English stop words removed, terms kept only if they occur in at least 2
# strings and in at most 95% of them).
unique_categories = pd.Series(df['category_list'].dropna().unique())
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.95, min_df=2)
X = vectorizer.fit_transform(unique_categories)

n_clusters = 20
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the clustering (and therefore
# any hand-written mapping from cluster id to name) depend on the installed
# version. random_state fixes the initialisation seed.
# NOTE(review): re-check the cluster id -> name mapping against the clusters
# produced with this setting.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X)

# Lookup table: category string -> cluster id, then applied row-wise to df.
clustered_df = pd.DataFrame({
    'original_category': unique_categories,
    'cluster': clusters
})
category_to_cluster = dict(zip(clustered_df['original_category'], clustered_df['cluster']))
df['category_cluster'] = df['category_list'].map(category_to_cluster)


In [None]:
# Manually map cluster numbers to names (optional but illustrative)
# Position in this list is the cluster id (0-19) it labels. The labels are
# hand-assigned, so they are only meaningful for the cluster ids produced by
# the KMeans cell that filled 'category_cluster'.
cluster_names = [
    'Social Apps',
    'Health & Wellness',
    'Social Networks',
    'Cloud & SaaS',
    'Mobile & Games',
    'E-commerce & Fashion',
    'Web Development',
    'General Tech',
    'Health IT',
    'Curated Media',
    'Consumer Hardware',
    'Education',
    'Big Data & Analytics',
    'Enterprise Software',
    'Project & Investment Mgmt',
    'Social Marketing',
    'Location & Finance',
    'Clean Tech & Info Services',
    '3D & Printing',
    'Marketing & Sales',
]
cluster_name_map = dict(enumerate(cluster_names))

# Cluster ids without an entry in the map become NaN.
df['category_cluster_name'] = df['category_cluster'].map(cluster_name_map)


In [None]:
# Drop non-essential columns for ML
# Raw text / datetime columns and the two human-readable category labels are
# removed; the numeric 'category_cluster' id stays in the modelling frame.
ml_excluded_columns = [
    'category_list', 'founded_at', 'first_funding_at', 'last_funding_at',
    'category_grouped', 'category_cluster_name'
]
df_model = df.drop(columns=ml_excluded_columns)

# Type conversion
# Explicit 64-bit integers: the bare `int` resolves to NumPy's platform default
# integer, which is 32-bit on Windows with NumPy < 2 and cannot represent
# funding totals above ~2.1e9 (values up to 5e9 are retained by the funding
# filter earlier in the notebook).
# NOTE(review): 'last_funding_year' is left as float here, as in the original.
df_model = df_model.astype({
    'founded_year': 'int64',
    'first_funding_year': 'int64',
    'funding_total_usd': 'int64'
})


In [None]:
# Export cleaned datasets
# Each (frame, destination) pair is written as CSV without the index column:
# the full cleaned frame first, then the modelling frame.
export_targets = [
    (df, clean_output),
    (df_model, model_output),
]
for frame, destination in export_targets:
    frame.to_csv(destination, index=False)
