# Descriptive, Inferential, and Exploratory Statistical Analysis

This notebook performs comprehensive statistical analysis on the Unicorn Companies dataset.

## Objectives
1. Descriptive Statistics
2. Inferential Statistics and Hypothesis Testing (two-sample t-test, one-way ANOVA, chi-square test)
3. Correlation Analysis
4. Confidence Intervals


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency, f_oneway, pearsonr, spearmanr
import warnings
import os

# Notebook-wide setup: suppress every warning, switch seaborn to the
# "whitegrid" style, and make 12x6 the default matplotlib figure size.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Load cleaned data (the path is relative to the notebook's working directory)
data_path = '../../data/Unicorn_Companies_cleaned.csv'
df = pd.read_csv(data_path)
print(f"Dataset shape: {df.shape}")


## 1. Descriptive Statistics


In [None]:
# Summary statistics for the numerical columns of the dataset.
numeric_cols = [
    'Valuation_B',
    'Total_Raised_B',
    'Investors Count',
    'Deal Terms',
    'Portfolio Exits',
    'Years_to_Unicorn',
]

# Standard describe() table (count, mean, std, min, quartiles, max).
print("Descriptive Statistics:")
desc_stats = df[numeric_cols].describe()
print(desc_stats)

# Shape and spread measures that describe() does not report, one column at a time.
print("\nAdditional Statistics:")
for col in numeric_cols:
    series = df[col]
    # Coefficient of variation = standard deviation relative to the mean.
    coeff_var = series.std() / series.mean()
    # Interquartile range = 75th percentile minus 25th percentile.
    iqr = series.quantile(0.75) - series.quantile(0.25)

    print(f"\n{col}:")
    print(f"  Skewness: {series.skew():.4f}")
    print(f"  Kurtosis: {series.kurtosis():.4f}")
    print(f"  Coefficient of Variation: {coeff_var:.4f}")
    print(f"  Median: {series.median():.4f}")
    print(f"  IQR: {iqr:.4f}")


## 2. Inferential Statistics - Hypothesis Testing


In [None]:
# Test 1: Compare mean valuations between US and China.
# Each group keeps only rows with a non-missing valuation.
us_val = df.loc[df['Country'] == 'United States', 'Valuation_B'].dropna()
china_val = df.loc[df['Country'] == 'China', 'Valuation_B'].dropna()

print("Hypothesis Test 1: US vs China Valuations")
print(f"US Mean: {us_val.mean():.2f}B, China Mean: {china_val.mean():.2f}B")

# Two-sample t-test.
# equal_var=False selects Welch's t-test, which does not assume the two
# groups share a common variance. The default (pooled-variance Student's t)
# can give misleading p-values when group sizes and variances differ, and
# nothing in this notebook checks that the variances are equal.
t_stat, p_value = ttest_ind(us_val, china_val, equal_var=False)
print("\nTwo-sample t-test:")
print(f"t-statistic: {t_stat:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Significant at α=0.05: {p_value < 0.05}")


In [None]:
# Test 2: one-way ANOVA - compare valuations across the five industries
# that appear most often in the dataset.
top_industries = df['Industry'].value_counts().index[:5]

# One valuation sample per industry, with missing valuations removed.
industry_groups = []
for industry in top_industries:
    in_industry = df['Industry'] == industry
    industry_groups.append(df.loc[in_industry, 'Valuation_B'].dropna())

print("Hypothesis Test 2: ANOVA - Valuations across Industries")
f_stat, p_value_anova = f_oneway(*industry_groups)
anova_significant = p_value_anova < 0.05
print(f"F-statistic: {f_stat:.4f}")
print(f"p-value: {p_value_anova:.4f}")
print(f"Significant at α=0.05: {anova_significant}")


In [None]:
# Test 3: Chi-square test of independence - Country vs Financial Stage.
# Observed counts for every (Country, Financial Stage) combination.
contingency_table = pd.crosstab(index=df['Country'], columns=df['Financial Stage'])

# `expected` holds the expected counts computed by chi2_contingency.
# NOTE(review): the chi-square approximation is usually considered unreliable
# when many expected counts are small - worth checking `expected` here.
chi2, p_value_chi2, dof, expected = chi2_contingency(contingency_table)

chi2_report = [
    "Hypothesis Test 3: Chi-square test - Country vs Financial Stage",
    f"Chi-square statistic: {chi2:.4f}",
    f"p-value: {p_value_chi2:.4f}",
    f"Degrees of freedom: {dof}",
    f"Significant at α=0.05: {p_value_chi2 < 0.05}",
]
print(*chi2_report, sep="\n")


## 3. Correlation Analysis


In [None]:
# Correlation matrix of the numerical columns (DataFrame.corr default method).
corr_cols = ['Valuation_B', 'Total_Raised_B', 'Investors Count', 'Deal Terms', 
             'Portfolio Exits', 'Years_to_Unicorn']
corr_matrix = df[corr_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Numerical Variables')
plt.tight_layout()
# Make sure the output directory exists; savefig fails if it is missing.
os.makedirs('../../results/plots', exist_ok=True)
plt.savefig('../../results/plots/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Pearson correlation tests
print("Pearson Correlation Tests:")
pearson_pairs = [
    ("Valuation vs Total Raised", 'Valuation_B', 'Total_Raised_B'),
    ("Valuation vs Investors Count", 'Valuation_B', 'Investors Count'),
    ("Total Raised vs Years to Unicorn", 'Total_Raised_B', 'Years_to_Unicorn'),
]
for pair_label, x_col, y_col in pearson_pairs:
    # Drop rows where EITHER value is missing so the two samples stay aligned
    # row by row. Dropping NaNs from each column separately can leave arrays of
    # different lengths (pearsonr raises) or pair values from different rows.
    paired = df[[x_col, y_col]].dropna()
    if len(paired) < 2:
        # pearsonr needs at least two observations.
        print(f"{pair_label}: not enough complete rows (n={len(paired)})")
        continue
    # Run each test once and reuse both the coefficient and the p-value.
    pair_r, pair_p = pearsonr(paired[x_col], paired[y_col])
    print(f"{pair_label}: r={pair_r:.4f}, p={pair_p:.4f}")


## 4. Confidence Intervals


In [None]:
# 95% confidence interval for the mean valuation, based on the t-distribution.
valuation = df['Valuation_B']
confidence_level = 0.95
alpha = 1 - confidence_level

# Sample mean, sample standard deviation, and number of non-missing values.
mean_val = valuation.mean()
std_val = valuation.std()
n = int(valuation.count())

# Standard error of the mean and the two-sided critical value with n-1
# degrees of freedom.
se = std_val / np.sqrt(n)
t_critical = stats.t.ppf(1 - alpha/2, n-1)

# The interval is the mean plus/minus the margin of error.
margin_of_error = t_critical * se
ci_lower = mean_val - margin_of_error
ci_upper = mean_val + margin_of_error

print("95% Confidence Interval for Mean Valuation:")
print(f"Mean: {mean_val:.2f}B")
print(f"Confidence Interval: [{ci_lower:.2f}B, {ci_upper:.2f}B]")
print(f"Margin of Error: {margin_of_error:.2f}B")
