In [1]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Load dataset (ensure it's already cleaned)
file_path = "startup_growth_investment_data.csv"
df = pd.read_csv(file_path)

# Log-transform the target with log1p (log(1 + x)) so the model is trained on
# a compressed valuation scale. Predictions must be mapped back with np.expm1.
df["Valuation (USD)"] = np.log1p(df["Valuation (USD)"])

# One-hot encode the categorical features. drop_first=True drops the first
# level of each column so the remaining dummies are not perfectly collinear.
df = pd.get_dummies(df, columns=["Industry", "Country"], drop_first=True)

# Separate the feature matrix from the target; the name column is dropped
# together with the target.
X = df.drop(columns=["Startup Name", "Valuation (USD)"])
y = df["Valuation (USD)"]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to scale the training rows,
# i.e. leak test-set information into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise the selected numerical features: statistics are learned from
# the training rows only, then the same fitted transform is applied to the
# test rows.
# NOTE: `df` and `X` keep these columns in their original units; use
# X_train / X_test (or `scaler.transform`) for model input.
features_to_scale = [
    "Investment Amount (USD)",
    "Funding Rounds",
    "Number of Investors",
    "Growth Rate (%)",
]
scaler = StandardScaler()
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test[features_to_scale] = scaler.transform(X_test[features_to_scale])

# Print dataset shapes
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (4000, 21), Test set: (1000, 21)
