In [4]:
# Import necessary libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression

# Data handling and visualization libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the startup dataset; the CSV is expected to sit next to this script.
file_path = "startup_growth_investment_data.csv"  # Ensure the file is in the same directory
df = pd.read_csv(file_path)

# The target is the startup valuation. The feature matrix is everything else
# except the identifier and categorical text columns listed below.
target_column = "Valuation (USD)"
excluded_columns = ["Startup Name", target_column, "Industry", "Country"]
y = df[target_column]
X = df.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: plain linear regression fitted on the training split only.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict valuations for the held-out test split.
y_pred = model.predict(X_test)

# Calculate performance metrics.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in later releases, so the explicit square root works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results
print(f"Root Mean Square Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R² Score: {r2:.4f}")

# 10-fold cross-validation over the full dataset; the folds are shuffled with
# a fixed seed so the partition is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# The scorer name requests the *negated* RMSE, yielding one score per fold.
cv_rmse = cross_val_score(
    model,
    X,
    y,
    scoring="neg_root_mean_squared_error",
    cv=kf,
)

# Flip the sign back so the reported average is a positive RMSE.
cv_rmse_mean = -cv_rmse.mean()

print(f"\n10-Fold Cross-Validation RMSE: {cv_rmse_mean:.2f}")


Root Mean Square Error (RMSE): 2909387636.38
Mean Absolute Error (MAE): 2181459266.52
R² Score: 0.7074

10-Fold Cross-Validation RMSE: 2917420238.65


