# Univariate, Bivariate, and Multivariate Analysis - Position Salaries Dataset

This notebook contains a comprehensive analysis in three parts:
1. **Univariate Analysis**: Analysis of individual variables
2. **Bivariate Analysis**: Analysis of relationships between two variables
3. **Multivariate Analysis**: Analysis of multiple variables together

## Objectives
- Perform univariate analysis on each variable
- Explore bivariate relationships
- Conduct multivariate analysis to understand complex patterns


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from pathlib import Path

# Plot appearance: seaborn grid theme plus a default figure size for any
# figure created without an explicit figsize. All warnings are silenced for
# the rest of the session.
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 8)})
warnings.filterwarnings('ignore')

# Resolve the dataset location. The project root is taken to be three
# directories above the current working directory.
# NOTE(review): this presumably expects the notebook to be launched from its
# own folder - confirm against the repository layout.
project_root = Path().resolve().parent.parent.parent
data_path = project_root.joinpath("data", "raw", "Position_Salaries.csv")
df = pd.read_csv(data_path)

# Report the load and preview the first rows (last expression is displayed).
print("Data loaded successfully!", f"Shape: {df.shape}", sep="\n")
df.head()


## 1. Univariate Analysis

Univariate analysis focuses on a single variable at a time.


In [None]:
# Univariate Analysis - Salary
# Builds a 2x3 panel figure describing the distribution of the Salary column
# (histogram, box plot, violin plot, density curve, normal Q-Q plot and a text
# panel of summary statistics), then saves it under results/figures.
print("=" * 60)
print("UNIVARIATE ANALYSIS - SALARY")
print("=" * 60)

salary = df['Salary']  # looked up once and reused by every panel below

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. Histogram
axes[0, 0].hist(salary, bins=10, edgecolor='black', alpha=0.7, color='skyblue')
axes[0, 0].set_title('Histogram of Salary', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Salary', fontsize=10)
axes[0, 0].set_ylabel('Frequency', fontsize=10)
axes[0, 0].grid(True, alpha=0.3)

# 2. Box Plot
axes[0, 1].boxplot(salary, vert=True)
axes[0, 1].set_title('Box Plot of Salary', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('Salary', fontsize=10)
axes[0, 1].grid(True, alpha=0.3)

# 3. Violin Plot
axes[0, 2].violinplot([salary], positions=[0], showmeans=True, showmedians=True)
axes[0, 2].set_title('Violin Plot of Salary', fontsize=12, fontweight='bold')
axes[0, 2].set_ylabel('Salary', fontsize=10)
axes[0, 2].grid(True, alpha=0.3)

# 4. Density Plot
salary.plot.density(ax=axes[1, 0], color='green', linewidth=2)
axes[1, 0].set_title('Density Plot of Salary', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Salary', fontsize=10)
axes[1, 0].set_ylabel('Density', fontsize=10)
axes[1, 0].grid(True, alpha=0.3)

# 5. Q-Q Plot against the normal distribution
stats.probplot(salary, dist="norm", plot=axes[1, 1])
axes[1, 1].set_title('Q-Q Plot of Salary', fontsize=12, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

# 6. Summary Statistics (text-only panel, so the axes frame is hidden)
axes[1, 2].axis('off')
stats_text = f"""
Salary Statistics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Mean: ${salary.mean():,.2f}
Median: ${salary.median():,.2f}
Std Dev: ${salary.std():,.2f}
Min: ${salary.min():,.2f}
Max: ${salary.max():,.2f}
Range: ${salary.max() - salary.min():,.2f}
Skewness: {salary.skew():.4f}
Kurtosis: {salary.kurtosis():.4f}
"""
axes[1, 2].text(0.1, 0.5, stats_text, fontsize=11, verticalalignment='center',
                family='monospace', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'univariate_salary.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Univariate Analysis - Level
# Builds a 2x3 panel figure describing the Level column (frequency bar chart,
# box plot, histogram, value-by-row line plot, pie chart and a text panel of
# summary statistics), then saves it under results/figures.
print("=" * 60)
print("UNIVARIATE ANALYSIS - LEVEL")
print("=" * 60)

level = df['Level']  # looked up once and reused by every panel below
# Occurrences of each distinct level, ordered by level value; shared by the
# bar chart and the pie chart.
level_counts = level.value_counts().sort_index()

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. Bar Chart
axes[0, 0].bar(level_counts.index, level_counts.values, color='steelblue', alpha=0.7, edgecolor='black')
axes[0, 0].set_title('Frequency of Each Level', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Level', fontsize=10)
axes[0, 0].set_ylabel('Frequency', fontsize=10)
axes[0, 0].grid(True, alpha=0.3, axis='y')

# 2. Box Plot
axes[0, 1].boxplot(level, vert=True)
axes[0, 1].set_title('Box Plot of Level', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('Level', fontsize=10)
axes[0, 1].grid(True, alpha=0.3)

# 3. Histogram
axes[0, 2].hist(level, bins=10, edgecolor='black', alpha=0.7, color='coral')
axes[0, 2].set_title('Histogram of Level', fontsize=12, fontweight='bold')
axes[0, 2].set_xlabel('Level', fontsize=10)
axes[0, 2].set_ylabel('Frequency', fontsize=10)
axes[0, 2].grid(True, alpha=0.3)

# 4. Line Plot (Level against the DataFrame index, i.e. in row order)
axes[1, 0].plot(level, marker='o', linewidth=2, markersize=8, color='green')
axes[1, 0].set_title('Level Progression', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Index', fontsize=10)
axes[1, 0].set_ylabel('Level', fontsize=10)
axes[1, 0].grid(True, alpha=0.3)

# 5. Pie Chart (share of rows per level)
axes[1, 1].pie(level_counts.values, labels=level_counts.index, autopct='%1.1f%%', startangle=90)
axes[1, 1].set_title('Distribution of Levels', fontsize=12, fontweight='bold')

# 6. Summary Statistics (text-only panel, so the axes frame is hidden)
axes[1, 2].axis('off')
stats_text = f"""
Level Statistics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Mean: {level.mean():.2f}
Median: {level.median():.2f}
Std Dev: {level.std():.2f}
Min: {level.min()}
Max: {level.max()}
Range: {level.max() - level.min()}
Skewness: {level.skew():.4f}
Kurtosis: {level.kurtosis():.4f}
"""
axes[1, 2].text(0.1, 0.5, stats_text, fontsize=11, verticalalignment='center',
                family='monospace', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'univariate_level.png', dpi=300, bbox_inches='tight')
plt.show()


## 2. Bivariate Analysis

Bivariate analysis explores the relationship between two variables.


In [None]:
# Bivariate Analysis - Level vs Salary
# Builds a 2x3 panel figure relating Level to Salary (scatter, line plot,
# scatter with least-squares line, bar chart, residual plot and a text panel of
# correlation statistics), then saves it under results/figures.
# The fit and correlation results (slope, intercept, r_value, pearson_corr,
# pearson_p, spearman_corr, spearman_p) stay defined for later cells.
from scipy.stats import linregress, pearsonr, spearmanr

print("=" * 60)
print("BIVARIATE ANALYSIS - LEVEL vs SALARY")
print("=" * 60)

level = df['Level']
salary = df['Salary']

# Ordinary least-squares fit of Salary on Level; the fitted values are reused
# for both the regression line and the residuals.
slope, intercept, r_value, p_value, std_err = linregress(level, salary)
line = slope * level + intercept
residuals = salary - line

pearson_corr, pearson_p = pearsonr(level, salary)
spearman_corr, spearman_p = spearmanr(level, salary)

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. Scatter Plot
axes[0, 0].scatter(level, salary, s=150, alpha=0.7, color='coral', edgecolors='black')
axes[0, 0].set_title('Scatter Plot: Level vs Salary', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Level', fontsize=10)
axes[0, 0].set_ylabel('Salary', fontsize=10)
axes[0, 0].grid(True, alpha=0.3)

# 2. Line Plot
# Points are connected in increasing Level order so the curve does not
# zigzag if the rows happen not to be sorted by Level.
ordered = df.sort_values('Level')
axes[0, 1].plot(ordered['Level'], ordered['Salary'], marker='o', linewidth=2, markersize=8, color='green')
axes[0, 1].set_title('Line Plot: Level vs Salary', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Level', fontsize=10)
axes[0, 1].set_ylabel('Salary', fontsize=10)
axes[0, 1].grid(True, alpha=0.3)

# 3. Scatter with Regression Line
axes[0, 2].scatter(level, salary, s=150, alpha=0.7, color='purple', edgecolors='black')
axes[0, 2].plot(level, line, 'r--', linewidth=2, label=f'Linear Fit (r²={r_value**2:.4f})')
axes[0, 2].set_title('Scatter Plot with Regression Line', fontsize=12, fontweight='bold')
axes[0, 2].set_xlabel('Level', fontsize=10)
axes[0, 2].set_ylabel('Salary', fontsize=10)
axes[0, 2].legend()
axes[0, 2].grid(True, alpha=0.3)

# 4. Bar Chart - Salary by Level
axes[1, 0].bar(level, salary, color='steelblue', alpha=0.7, edgecolor='black')
axes[1, 0].set_title('Bar Chart: Salary by Level', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Level', fontsize=10)
axes[1, 0].set_ylabel('Salary', fontsize=10)
axes[1, 0].grid(True, alpha=0.3, axis='y')

# 5. Residual Plot (observed Salary minus the linear fit)
axes[1, 1].scatter(level, residuals, s=100, alpha=0.7, color='orange', edgecolors='black')
axes[1, 1].axhline(y=0, color='r', linestyle='--', linewidth=2)
axes[1, 1].set_title('Residual Plot', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Level', fontsize=10)
axes[1, 1].set_ylabel('Residuals', fontsize=10)
axes[1, 1].grid(True, alpha=0.3)

# 6. Correlation Statistics (text-only panel, so the axes frame is hidden)
axes[1, 2].axis('off')
stats_text = f"""
Bivariate Statistics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
Pearson Correlation: {pearson_corr:.4f}
Pearson p-value: {pearson_p:.6f}
Spearman Correlation: {spearman_corr:.4f}
Spearman p-value: {spearman_p:.6f}
R² (Coefficient of Determination): {r_value**2:.4f}
Slope: {slope:.2f}
Intercept: ${intercept:,.2f}
"""
axes[1, 2].text(0.1, 0.5, stats_text, fontsize=11, verticalalignment='center',
                family='monospace', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists
# before writing (otherwise a fresh checkout fails with FileNotFoundError).
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'bivariate_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Derive the extra columns used by the multivariate analysis. `df` itself is
# left untouched; the result is a new DataFrame with five added columns.
salary_col = df['Salary']
level_col = df['Level']

df_analysis = df.assign(
    # Salary bucketed into four labelled bands (right edge inclusive).
    Salary_Category=pd.cut(
        salary_col,
        bins=[0, 80000, 150000, 300000, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
    ),
    # Level bucketed into three labelled seniority groups.
    Level_Group=pd.cut(
        level_col,
        bins=[0, 3, 6, 10],
        labels=['Junior', 'Mid', 'Senior'],
    ),
    # Percentage change in Salary from the previous row (first row is NaN).
    # NOTE(review): presumably the rows are ordered by Level - confirm.
    Salary_Growth=salary_col.pct_change().mul(100),
    # Natural-log transforms of both numeric columns.
    Log_Salary=np.log(salary_col),
    Log_Level=np.log(level_col),
)

print("Additional features created for multivariate analysis")
df_analysis.head()


In [None]:
# Multivariate Analysis Visualizations
# Builds a 2x3 panel figure from df_analysis (correlation heatmap, mean salary
# per level group, salary-category pie, Level vs log-salary with a linear fit,
# row-to-row salary growth) and saves it as multivariate_analysis.png.
# The pair plot is drawn afterwards in its OWN figure and saved as
# multivariate_pairplot.png: scatter_matrix needs one Axes per variable pair,
# and when handed a single Axes pandas clears the entire parent figure, which
# would erase the five panels drawn here.
from pandas.plotting import scatter_matrix

# savefig does not create missing directories, so make sure the target exists.
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# 1. Correlation Heatmap (Extended) - pairwise correlations of numeric columns
numeric_cols = df_analysis.select_dtypes(include=[np.number]).columns
correlation_matrix = df_analysis[numeric_cols].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=2, ax=axes[0, 0], cbar_kws={"shrink": 0.8})
axes[0, 0].set_title('Correlation Heatmap (Multivariate)', fontsize=12, fontweight='bold')

# 2. Salary by Level Group
# observed=False keeps all three categories so the bars line up with the
# three colours below, independent of the pandas default.
level_group_salary = df_analysis.groupby('Level_Group', observed=False)['Salary'].mean()
axes[0, 1].bar(level_group_salary.index, level_group_salary.values,
               color=['lightblue', 'steelblue', 'darkblue'], alpha=0.7, edgecolor='black')
axes[0, 1].set_title('Average Salary by Level Group', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Level Group', fontsize=10)
axes[0, 1].set_ylabel('Average Salary', fontsize=10)
axes[0, 1].grid(True, alpha=0.3, axis='y')

# 3. Salary Category Distribution
# Wedges follow the category order (not the count order) and each category
# keeps a fixed colour; empty categories are dropped so no zero-size wedge
# is labelled.
category_colors = {'Low': 'lightcoral', 'Medium': 'lightsalmon',
                   'High': 'orange', 'Very High': 'gold'}
salary_cat_counts = df_analysis['Salary_Category'].value_counts().sort_index()
salary_cat_counts = salary_cat_counts[salary_cat_counts > 0]
axes[0, 2].pie(salary_cat_counts.values, labels=salary_cat_counts.index, autopct='%1.1f%%',
               startangle=90, colors=[category_colors[cat] for cat in salary_cat_counts.index])
axes[0, 2].set_title('Salary Category Distribution', fontsize=12, fontweight='bold')

# 4. Level vs Log Salary, with a degree-1 polynomial (straight line) fit
axes[1, 0].scatter(df_analysis['Level'], df_analysis['Log_Salary'], s=150, alpha=0.7,
                   color='purple', edgecolors='black')
z = np.polyfit(df_analysis['Level'], df_analysis['Log_Salary'], 1)
p = np.poly1d(z)
axes[1, 0].plot(df_analysis['Level'], p(df_analysis['Level']), "r--", linewidth=2)
axes[1, 0].set_title('Level vs Log(Salary)', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Level', fontsize=10)
axes[1, 0].set_ylabel('Log(Salary)', fontsize=10)
axes[1, 0].grid(True, alpha=0.3)

# 5. Salary Growth Rate (first row skipped: it has no previous row, so NaN)
axes[1, 1].bar(df_analysis['Level'].iloc[1:], df_analysis['Salary_Growth'].iloc[1:],
               color='orange', alpha=0.7, edgecolor='black')
axes[1, 1].set_title('Salary Growth Rate by Level', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Level', fontsize=10)
axes[1, 1].set_ylabel('Growth Rate (%)', fontsize=10)
axes[1, 1].grid(True, alpha=0.3, axis='y')

# 6. Placeholder for the pair plot, which is rendered separately below
axes[1, 2].axis('off')
axes[1, 2].text(0.5, 0.5, 'Pair plot rendered as a separate figure\n(multivariate_pairplot.png)',
                ha='center', va='center', fontsize=10)
axes[1, 2].set_title('Pair Plot: Level, Salary, Log(Salary)', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig(figures_dir / 'multivariate_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Pair Plot (selected variables) in a dedicated figure
scatter_vars = df_analysis[['Level', 'Salary', 'Log_Salary']]
pair_axes = scatter_matrix(scatter_vars, alpha=0.7, figsize=(8, 8), diagonal='hist', s=100)
pair_fig = pair_axes[0, 0].get_figure()
pair_fig.suptitle('Pair Plot: Level, Salary, Log(Salary)', fontsize=10, fontweight='bold')
pair_fig.savefig(figures_dir / 'multivariate_pairplot.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Summary of Analysis
# Prints headline numbers from the three analyses and writes the enriched
# DataFrame to data/processed/analysis_data.csv.
# Relies on names created by earlier cells: df, df_analysis, project_root,
# pearson_corr, pearson_p, spearman_corr, spearman_p and r_value.
print("=" * 60)
print("ANALYSIS SUMMARY")
print("=" * 60)

print("\n1. UNIVARIATE ANALYSIS:")
print(f"   - Salary: Mean=${df['Salary'].mean():,.2f}, Std=${df['Salary'].std():,.2f}")
print(f"   - Level: Mean={df['Level'].mean():.2f}, Std={df['Level'].std():.2f}")

print("\n2. BIVARIATE ANALYSIS:")
print(f"   - Pearson Correlation: {pearson_corr:.4f} (p={pearson_p:.6f})")
print(f"   - Spearman Correlation: {spearman_corr:.4f} (p={spearman_p:.6f})")
print(f"   - R²: {r_value**2:.4f}")

print("\n3. MULTIVARIATE ANALYSIS:")
print(f"   - Level Groups: {df_analysis['Level_Group'].value_counts().to_dict()}")
print(f"   - Salary Categories: {df_analysis['Salary_Category'].value_counts().to_dict()}")
# The first row has no previous salary to compare with, so it is skipped.
print(f"   - Average Salary Growth Rate: {df_analysis['Salary_Growth'].iloc[1:].mean():.2f}%")

# Save analysis data
analysis_path = project_root / 'data' / 'processed' / 'analysis_data.csv'
# to_csv does not create missing directories; create the parent first so the
# export also works on a fresh checkout.
analysis_path.parent.mkdir(parents=True, exist_ok=True)
df_analysis.to_csv(analysis_path, index=False)
print(f"\nAnalysis data saved to: {analysis_path}")
