# Machine Learning Analysis: Unicorn Companies

This notebook performs comprehensive machine learning analysis on the Unicorn Companies dataset.

## Objectives
1. Data preprocessing for ML
2. Feature engineering
3. Regression models to predict Valuation
4. Classification models to predict Financial Stage
5. Model evaluation and comparison
6. Feature importance analysis


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, classification_report, confusion_matrix
from xgboost import XGBRegressor, XGBClassifier
import warnings
import os
import joblib

# Silence library warnings so the cell outputs stay readable
warnings.filterwarnings(action='ignore')

# Plot defaults shared by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Make sure the output folders for models and plots exist
for results_subdir in ('../../results/models', '../../results/plots'):
    os.makedirs(results_subdir, exist_ok=True)


In [None]:
# Read the cleaned dataset from disk
cleaned_csv_path = '../../data/Unicorn_Companies_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

# Report the dimensions, then preview the first rows as the cell output
print(f"Dataset shape: {df.shape}")
df.head()


## Data Preprocessing


In [None]:
# Work on a copy so the loaded frame `df` is left untouched
df_ml = df.copy()

# Integer-encode each categorical column into a new '<col>_encoded' column and
# keep the fitted encoder so codes can be mapped back to labels later.
# Values are cast to str before encoding, so missing entries presumably end up
# as their own 'nan' category -- TODO confirm against the data.
categorical_cols = ['Country', 'Industry', 'Financial Stage']
label_encoders = {}

for col in categorical_cols:
    col_as_text = df_ml[col].astype(str)
    le = LabelEncoder().fit(col_as_text)
    df_ml[f'{col}_encoded'] = le.transform(col_as_text)
    label_encoders[col] = le

# Model inputs: raw columns plus the encoded country and industry
feature_cols = [
    'Total_Raised_B', 'Investors Count', 'Deal Terms',
    'Portfolio Exits', 'Years_to_Unicorn', 'Founded_Year',
    'Country_encoded', 'Industry_encoded',
]

# Rows without a valuation cannot serve as regression targets
df_ml = df_ml.dropna(subset=['Valuation_B'])

# Impute the remaining gaps in the features with each column's median.
# NOTE(review): the medians are computed over all rows, i.e. before any
# train/test split happens further down.
feature_medians = df_ml[feature_cols].median()
df_ml[feature_cols] = df_ml[feature_cols].fillna(feature_medians)

print(f"Final dataset shape: {df_ml.shape}")
print(f"Features: {feature_cols}")


## 1. Regression: Predict Valuation


In [None]:
# Regression target (valuation) and inputs (the selected feature columns)
y_reg = df_ml['Valuation_B']
X_reg = df_ml[feature_cols]

# Hold out 20% of the rows for testing; the seed is fixed so the split repeats
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, random_state=42, test_size=0.2
)

# Standardise the features: fit on the training rows only, then apply the
# same transformation to both the training and the test rows
scaler_reg = StandardScaler().fit(X_train_reg)
X_train_reg_scaled = scaler_reg.transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)

print(f"Training set size: {X_train_reg.shape[0]}")
print(f"Test set size: {X_test_reg.shape[0]}")


In [None]:
# Fit a 100-tree random forest on the unscaled training features
rf_reg = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rf_reg.fit(X_train_reg, y_train_reg)

# Score the model on the held-out rows
y_pred_rf = rf_reg.predict(X_test_reg)
r2_rf = r2_score(y_test_reg, y_pred_rf)
mse_rf = mean_squared_error(y_test_reg, y_pred_rf)

print("Random Forest Regressor:")
print(f"  MSE: {mse_rf:.4f}")
print(f"  RMSE: {np.sqrt(mse_rf):.4f}")
print(f"  R² Score: {r2_rf:.4f}")


In [None]:
# Fit an XGBoost regressor (n_estimators=100) on the unscaled training features
xgb_reg = XGBRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
xgb_reg.fit(X_train_reg, y_train_reg)

# Score the model on the held-out rows
y_pred_xgb = xgb_reg.predict(X_test_reg)
r2_xgb = r2_score(y_test_reg, y_pred_xgb)
mse_xgb = mean_squared_error(y_test_reg, y_pred_xgb)

print("XGBoost Regressor:")
print(f"  MSE: {mse_xgb:.4f}")
print(f"  RMSE: {np.sqrt(mse_xgb):.4f}")
print(f"  R² Score: {r2_xgb:.4f}")


In [None]:
# Ordinary least squares, trained on the standardised features
lr_reg = LinearRegression()
lr_reg.fit(X_train_reg_scaled, y_train_reg)

# Score on the held-out rows (standardised with the same scaler)
y_pred_lr = lr_reg.predict(X_test_reg_scaled)
r2_lr = r2_score(y_test_reg, y_pred_lr)
mse_lr = mean_squared_error(y_test_reg, y_pred_lr)

print("Linear Regression:")
print(f"  MSE: {mse_lr:.4f}")
print(f"  RMSE: {np.sqrt(mse_lr):.4f}")
print(f"  R² Score: {r2_lr:.4f}")


In [None]:
# One panel per regressor: predicted vs. actual valuation on the test set
models = [('Random Forest', y_pred_rf), ('XGBoost', y_pred_xgb), ('Linear Regression', y_pred_lr)]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

# Endpoints of the dashed red diagonal on which predicted equals actual
actual_range = [y_test_reg.min(), y_test_reg.max()]

for idx, (name, y_pred) in enumerate(models):
    ax = axes[idx]
    ax.scatter(y_test_reg, y_pred, alpha=0.5)
    ax.plot(actual_range, actual_range, 'r--', lw=2)
    ax.set_xlabel('Actual Valuation ($B)')
    ax.set_ylabel('Predicted Valuation ($B)')
    ax.set_title(f'{name} - Predictions vs Actual')
    ax.grid(True, alpha=0.3)

# Write the figure to the plots folder before displaying it
plt.tight_layout()
plt.savefig('../../results/plots/ml_regression_predictions.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Rank the features by the random forest regressor's importance scores
feature_importance = (
    pd.DataFrame({'feature': feature_cols, 'importance': rf_reg.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the ranking, saved to the plots folder
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance - Random Forest Regressor')
plt.xlabel('Importance')
plt.tight_layout()
plt.savefig('../../results/plots/ml_feature_importance_regression.png', dpi=300, bbox_inches='tight')
plt.show()

# Also print the ranking as a table
print("\nFeature Importance:")
print(feature_importance)


## 2. Classification: Predict Financial Stage


In [None]:
# Prepare data for classification.
# Target: 'Financial Stage'; rows where the stage is missing are excluded.
df_class = df_ml[df_ml['Financial Stage'].notna()].copy()

# A stratified split raises ValueError when any class has fewer than two rows,
# so classes that rare are removed up front and reported.
MIN_ROWS_PER_CLASS = 2
stage_counts = df_class['Financial Stage'].value_counts()
rare_stages = stage_counts[stage_counts < MIN_ROWS_PER_CLASS].index.tolist()
if rare_stages:
    print(f"Dropping classes with fewer than {MIN_ROWS_PER_CLASS} rows: {rare_stages}")
    df_class = df_class[~df_class['Financial Stage'].isin(rare_stages)].copy()

# Re-encode the target on the rows actually used. The encoder fitted during
# preprocessing saw every row (missing stages included, cast to str), so the
# codes left after filtering may have gaps; XGBClassifier expects the labels
# to be exactly 0..n_classes-1. When nothing was filtered out of the label
# set, the codes produced here match the original ones.
stage_encoder = LabelEncoder()
df_class['Financial Stage_encoded'] = stage_encoder.fit_transform(
    df_class['Financial Stage'].astype(str)
)
# Keep the stored encoder in sync so classifier predictions decode correctly
label_encoders['Financial Stage'] = stage_encoder

X_class = df_class[feature_cols]
y_class = df_class['Financial Stage_encoded']

# Hold out 20% of the rows, preserving the class proportions in both parts
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Scale features: fit on the training rows only, then apply to both sets
scaler_class = StandardScaler()
X_train_class_scaled = scaler_class.fit_transform(X_train_class)
X_test_class_scaled = scaler_class.transform(X_test_class)

print(f"Training set size: {X_train_class.shape[0]}")
print(f"Test set size: {X_test_class.shape[0]}")
print(f"Number of classes: {len(np.unique(y_class))}")


In [None]:
# Fit a 100-tree random forest on the unscaled classification features
rf_clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
rf_clf.fit(X_train_class, y_train_class)

# Predict the held-out rows and summarise the results
y_pred_rf_clf = rf_clf.predict(X_test_class)
rf_clf_report = classification_report(y_test_class, y_pred_rf_clf)
rf_clf_confusion = confusion_matrix(y_test_class, y_pred_rf_clf)

print("Random Forest Classifier:")
print(rf_clf_report)
print("\nConfusion Matrix:")
print(rf_clf_confusion)


In [None]:
# Fit an XGBoost classifier (n_estimators=100) on the unscaled features
xgb_clf = XGBClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
xgb_clf.fit(X_train_class, y_train_class)

# Predict the held-out rows and summarise the results
y_pred_xgb_clf = xgb_clf.predict(X_test_class)
xgb_clf_report = classification_report(y_test_class, y_pred_xgb_clf)
xgb_clf_confusion = confusion_matrix(y_test_class, y_pred_xgb_clf)

print("XGBoost Classifier:")
print(xgb_clf_report)
print("\nConfusion Matrix:")
print(xgb_clf_confusion)


In [None]:
# Logistic regression trained on the standardised features; the iteration
# cap is set to 1000
lr_clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
)
lr_clf.fit(X_train_class_scaled, y_train_class)

# Predict the held-out rows (standardised with the same scaler) and summarise
y_pred_lr_clf = lr_clf.predict(X_test_class_scaled)
lr_clf_report = classification_report(y_test_class, y_pred_lr_clf)
lr_clf_confusion = confusion_matrix(y_test_class, y_pred_lr_clf)

print("Logistic Regression:")
print(lr_clf_report)
print("\nConfusion Matrix:")
print(lr_clf_confusion)


In [None]:
# Rank the features by the random forest classifier's importance scores
feature_importance_clf = (
    pd.DataFrame({'feature': feature_cols, 'importance': rf_clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the ranking, saved to the plots folder
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance_clf)
plt.title('Feature Importance - Random Forest Classifier')
plt.xlabel('Importance')
plt.tight_layout()
plt.savefig('../../results/plots/ml_feature_importance_classification.png', dpi=300, bbox_inches='tight')
plt.show()

# Also print the ranking as a table
print("\nFeature Importance:")
print(feature_importance_clf)


In [None]:
# Persist every fitted artifact needed to reproduce predictions later.
# The scalers are only applied to the inputs of the linear / logistic models,
# so those two models are saved alongside them; the feature column order used
# for training is saved as well. Each artifact is written to
# '../../results/models/<name>.pkl'; the previously written file names are
# unchanged.
artifacts = {
    'rf_regressor': rf_reg,
    'xgb_regressor': xgb_reg,
    'lr_regressor': lr_reg,
    'rf_classifier': rf_clf,
    'xgb_classifier': xgb_clf,
    'lr_classifier': lr_clf,
    'scaler_reg': scaler_reg,
    'scaler_class': scaler_class,
    'label_encoders': label_encoders,
    'feature_cols': feature_cols,
}

for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, f'../../results/models/{artifact_name}.pkl')

print("Models saved successfully!")
