# Statistical Analysis - Position Salaries Dataset

This notebook contains comprehensive statistical analysis including:
1. Descriptive Statistics
2. Inferential Statistics
3. Exploratory Statistical Analysis

## Objectives
- Perform descriptive statistical analysis
- Conduct inferential statistical tests
- Explore statistical relationships and patterns
- Test hypotheses about the data


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import ttest_1samp, pearsonr, spearmanr, normaltest, shapiro
from statsmodels.stats import weightstats as stests
from statsmodels.stats.proportion import proportions_ztest
import warnings
from pathlib import Path

# Plot appearance and warning policy.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})
# NOTE(review): this silences every warning for the rest of the session,
# including any emitted by the statistical tests further down.
warnings.filterwarnings(action='ignore')

# Locate and read the dataset.
# The project root is taken to be three directories above the current working
# directory, so the notebook must be started from its own folder.
project_root = Path().resolve().parent.parent.parent
data_path = project_root.joinpath("data", "raw", "Position_Salaries.csv")
df = pd.read_csv(data_path)

print("Data loaded successfully!", f"Shape: {df.shape}", sep="\n")


## 1. Descriptive Statistics


In [None]:
# Descriptive Statistics for Salary
# Prints location, spread, quartile and shape statistics for df['Salary'].
print("=" * 60)
print("DESCRIPTIVE STATISTICS - SALARY")
print("=" * 60)

salary_stats = df['Salary'].describe()
print("\nBasic Statistics:")
print(salary_stats)

# Series.mode() returns every value tied for the highest frequency, so it is
# never empty for a non-empty column. A unique mode exists only when exactly
# one value comes back; otherwise (e.g. all salaries distinct) there is none.
salary_modes = df['Salary'].mode()

print(f"\nAdditional Statistics:")
print(f"Mean: ${df['Salary'].mean():,.2f}")
print(f"Median: ${df['Salary'].median():,.2f}")
if len(salary_modes) == 1:
    print(f"Mode: ${salary_modes.iloc[0]:,.2f}")
else:
    print("No unique mode")
print(f"Standard Deviation: ${df['Salary'].std():,.2f}")
# Variance is expressed in squared units, so it is not labelled with a "$".
print(f"Variance: {df['Salary'].var():,.2f} (dollars squared)")
print(f"Range: ${df['Salary'].max() - df['Salary'].min():,.2f}")
print(f"Coefficient of Variation: {(df['Salary'].std() / df['Salary'].mean()) * 100:.2f}%")

# Quartiles and IQR
Q1 = df['Salary'].quantile(0.25)
Q2 = df['Salary'].quantile(0.50)
Q3 = df['Salary'].quantile(0.75)
IQR = Q3 - Q1

print(f"\nQuartiles:")
print(f"Q1 (25th percentile): ${Q1:,.2f}")
print(f"Q2 (50th percentile/Median): ${Q2:,.2f}")
print(f"Q3 (75th percentile): ${Q3:,.2f}")
print(f"IQR: ${IQR:,.2f}")

# Skewness and Kurtosis
print(f"\nShape Statistics:")
print(f"Skewness: {df['Salary'].skew():.4f}")
print(f"Kurtosis: {df['Salary'].kurtosis():.4f}")


In [None]:
# Descriptive Statistics for Level
# Same report as for Salary, without currency formatting.
print("=" * 60, "DESCRIPTIVE STATISTICS - LEVEL", "=" * 60, sep="\n")

level_stats = df['Level'].describe()
print("\nBasic Statistics:", level_stats, sep="\n")

# Location, spread and shape measures, one per output line.
print(
    "\nAdditional Statistics:",
    f"Mean: {df['Level'].mean():.2f}",
    f"Median: {df['Level'].median():.2f}",
    f"Standard Deviation: {df['Level'].std():.2f}",
    f"Variance: {df['Level'].var():.2f}",
    f"Range: {df['Level'].max() - df['Level'].min()}",
    f"Skewness: {df['Level'].skew():.4f}",
    f"Kurtosis: {df['Level'].kurtosis():.4f}",
    sep="\n",
)


## 2. Inferential Statistics


In [None]:
# Normality Tests
# Runs two normality tests on df['Salary'] and reports each at α=0.05.
print("=" * 60)
print("NORMALITY TESTS")
print("=" * 60)

# Shapiro-Wilk Test (for small samples)
stat_sw, p_value_sw = shapiro(df['Salary'])
print(f"\nShapiro-Wilk Test for Salary:")
print(f"  Test Statistic: {stat_sw:.4f}")
print(f"  p-value: {p_value_sw:.4f}")
print(f"  Result: {'Data appears to be normally distributed' if p_value_sw > 0.05 else 'Data does NOT appear to be normally distributed'} (α=0.05)")

# D'Agostino's Normality Test
# normaltest combines a skewness test (needs at least 8 observations) with a
# kurtosis test (only valid for about 20 or more). Warnings are suppressed
# globally in the setup cell, so the small-sample caveat is printed explicitly
# here instead of being silently lost.
n_salary = len(df['Salary'])
print(f"\nD'Agostino's Normality Test for Salary:")
if n_salary >= 8:
    stat_da, p_value_da = normaltest(df['Salary'])
    print(f"  Test Statistic: {stat_da:.4f}")
    print(f"  p-value: {p_value_da:.4f}")
    print(f"  Result: {'Data appears to be normally distributed' if p_value_da > 0.05 else 'Data does NOT appear to be normally distributed'} (α=0.05)")
    if n_salary < 20:
        print(f"  Caution: n={n_salary} is below 20, so this p-value is unreliable; prefer the Shapiro-Wilk result")
else:
    # Keep both names defined so later cells that read them do not fail.
    stat_da = p_value_da = float('nan')
    print(f"  Skipped: the test needs at least 8 observations (n={n_salary})")


In [None]:
# Correlation Tests
# Measures the Level/Salary association two ways and reports each at α=0.05.
print("=" * 60, "CORRELATION TESTS", "=" * 60, sep="\n")

# Pearson Correlation
pearson_corr, pearson_p = pearsonr(df['Level'], df['Salary'])
print(
    f"\nPearson Correlation (Level vs Salary):",
    f"  Correlation Coefficient: {pearson_corr:.4f}",
    f"  p-value: {pearson_p:.4f}",
    f"  Result: {'Significant correlation' if pearson_p < 0.05 else 'No significant correlation'} (α=0.05)",
    sep="\n",
)

# Spearman Correlation (non-parametric)
spearman_corr, spearman_p = spearmanr(df['Level'], df['Salary'])
print(
    f"\nSpearman Correlation (Level vs Salary):",
    f"  Correlation Coefficient: {spearman_corr:.4f}",
    f"  p-value: {spearman_p:.4f}",
    f"  Result: {'Significant correlation' if spearman_p < 0.05 else 'No significant correlation'} (α=0.05)",
    sep="\n",
)


In [None]:
# One-Sample T-Test
# H0: the mean of df['Salary'] equals the hypothesized value below.
hypothesized_mean = 200000
t_stat, p_value = ttest_1samp(df['Salary'], popmean=hypothesized_mean)

print("=" * 60, "ONE-SAMPLE T-TEST", "=" * 60, sep="\n")
print(
    f"\nHypothesis: Mean salary = ${hypothesized_mean:,}",
    f"Sample Mean: ${df['Salary'].mean():,.2f}",
    f"  t-statistic: {t_stat:.4f}",
    f"  p-value: {p_value:.4f}",
    f"  Result: {'Reject H0' if p_value < 0.05 else 'Fail to reject H0'} (α=0.05)",
    f"  Interpretation: Mean salary is {'significantly different' if p_value < 0.05 else 'not significantly different'} from ${hypothesized_mean:,}",
    sep="\n",
)


## 3. Exploratory Statistical Analysis


In [None]:
# Confidence Intervals
# Student-t confidence interval for the mean of df['Salary'].
# `stats` is scipy.stats, imported once in the notebook's first code cell, so
# the import is not repeated here.
confidence_level = 0.95
alpha = 1 - confidence_level
n = len(df['Salary'])
mean = df['Salary'].mean()
std_err = stats.sem(df['Salary'])

# Calculate confidence interval
# The level is passed positionally, with n-1 degrees of freedom.
ci = stats.t.interval(confidence_level, n-1, loc=mean, scale=std_err)

print("=" * 60)
print("CONFIDENCE INTERVALS")
print("=" * 60)
# The label is derived from confidence_level so it cannot drift from the
# interval that was actually computed.
print(f"\n{confidence_level * 100:g}% Confidence Interval for Mean Salary:")
print(f"  Lower bound: ${ci[0]:,.2f}")
print(f"  Upper bound: ${ci[1]:,.2f}")
print(f"  Mean: ${mean:,.2f}")
# The interval is symmetric about the mean, so half its width is the margin.
print(f"  Margin of Error: ${(ci[1] - ci[0])/2:,.2f}")


In [None]:
# Statistical Summary Visualization
# Draws a 2x2 figure (histogram, Q-Q plot, box plot, scatter with linear fit),
# saves it under <project_root>/results/figures and displays it.
# Reads `pearson_corr` from the correlation cell and `project_root` from the
# setup cell.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Distribution with mean and median
axes[0, 0].hist(df['Salary'], bins=10, edgecolor='black', alpha=0.7, color='skyblue')
axes[0, 0].axvline(df['Salary'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: ${df["Salary"].mean():,.0f}')
axes[0, 0].axvline(df['Salary'].median(), color='green', linestyle='--', linewidth=2, label=f'Median: ${df["Salary"].median():,.0f}')
axes[0, 0].set_title('Salary Distribution with Mean and Median', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Salary', fontsize=10)
axes[0, 0].set_ylabel('Frequency', fontsize=10)
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Q-Q Plot for normality check
stats.probplot(df['Salary'], dist="norm", plot=axes[0, 1])
axes[0, 1].set_title('Q-Q Plot for Salary (Normality Check)', fontsize=12, fontweight='bold')
axes[0, 1].grid(True, alpha=0.3)

# 3. Box plot with outliers
axes[1, 0].boxplot(df['Salary'], vert=True)
axes[1, 0].set_title('Box Plot of Salary (Outlier Detection)', fontsize=12, fontweight='bold')
axes[1, 0].set_ylabel('Salary', fontsize=10)
axes[1, 0].grid(True, alpha=0.3)

# 4. Level vs Salary scatter with a degree-1 least-squares fit
#    (no confidence band is drawn)
axes[1, 1].scatter(df['Level'], df['Salary'], s=100, alpha=0.7, color='coral', edgecolors='black')
z = np.polyfit(df['Level'], df['Salary'], 1)
p = np.poly1d(z)
axes[1, 1].plot(df['Level'], p(df['Level']), "r--", alpha=0.8, linewidth=2,
                label=f'Linear Fit (r={pearson_corr:.3f})')
axes[1, 1].set_title(f'Level vs Salary (Correlation: {pearson_corr:.4f})', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Level', fontsize=10)
axes[1, 1].set_ylabel('Salary', fontsize=10)
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()

# Create the output directory first: savefig does not create missing parent
# folders and would otherwise fail on a fresh checkout.
figure_path = project_root / 'results' / 'figures' / 'statistical_analysis.png'
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Summary of Statistical Analysis
# Recaps results computed in earlier cells: p_value_sw (normality),
# pearson_*/spearman_* (correlation), ci and confidence_level (interval).
print("=" * 60)
print("STATISTICAL ANALYSIS SUMMARY")
print("=" * 60)
print(f"\n1. Descriptive Statistics:")
print(f"   - Mean Salary: ${df['Salary'].mean():,.2f}")
print(f"   - Median Salary: ${df['Salary'].median():,.2f}")
print(f"   - Standard Deviation: ${df['Salary'].std():,.2f}")
print(f"   - Coefficient of Variation: {(df['Salary'].std() / df['Salary'].mean()) * 100:.2f}%")

print(f"\n2. Normality:")
print(f"   - Shapiro-Wilk p-value: {p_value_sw:.4f}")
print(f"   - Data is {'normally distributed' if p_value_sw > 0.05 else 'not normally distributed'}")

# Strength is judged on |r|, but the direction must come from the sign of r;
# otherwise a strong negative correlation would be reported as positive.
if abs(pearson_corr) > 0.7:
    relationship = f"Strong {'positive' if pearson_corr > 0 else 'negative'} correlation"
elif abs(pearson_corr) > 0.3:
    relationship = 'Moderate correlation'
else:
    relationship = 'Weak correlation'

print(f"\n3. Correlation:")
print(f"   - Pearson Correlation: {pearson_corr:.4f} (p={pearson_p:.4f})")
print(f"   - Spearman Correlation: {spearman_corr:.4f} (p={spearman_p:.4f})")
print(f"   - Relationship: {relationship}")

# The label follows confidence_level so it always matches the interval in `ci`.
print(f"\n4. Confidence Interval ({confidence_level * 100:g}%):")
print(f"   - Lower: ${ci[0]:,.2f}")
print(f"   - Upper: ${ci[1]:,.2f}")
