# Exploratory Data Analysis (EDA) - Position Salaries Dataset

This notebook contains a comprehensive exploratory data analysis of the Position Salaries dataset.

## Objectives
1. Load and inspect the dataset
2. Understand data structure and types
3. Identify missing values and outliers
4. Explore distributions and relationships
5. Generate visualizations
6. Prepare data for further analysis


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import os

# Plot appearance defaults; all warnings are silenced for the rest of the session
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
warnings.filterwarnings('ignore')

# Derive paths from the current working directory: the project root is taken
# to be its third ancestor, and the raw CSV lives under data/raw below that.
notebook_dir = Path().resolve()
project_root = notebook_dir.parent.parent.parent
data_path = project_root.joinpath("data", "raw", "Position_Salaries.csv")

# Echo the resolved locations so a wrong working directory is easy to spot
data_file_found = data_path.exists()
print(f"Project root: {project_root}")
print(f"Data path: {data_path}")
print(f"Data file exists: {data_file_found}")


In [None]:
# Load the dataset from the CSV path resolved in the setup cell
df = pd.read_csv(data_path)

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head(10))
print("\nDataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
print("\nDataset Description:")
print(df.describe())


In [None]:
# Missing-value audit: per-column null counts, then the same counts
# expressed as a percentage of the number of rows
print("Missing Values:")
null_counts = df.isnull().sum()
print(null_counts)
print("\nMissing Values Percentage:")
print((null_counts / len(df)) * 100)

# Number of rows that exactly repeat an earlier row
n_duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicates}")

# Column dtypes as held by the DataFrame
print("\nData Types:")
print(df.dtypes)


In [None]:
# Descriptive statistics over every column (include='all' covers
# non-numeric columns as well as numeric ones)
print("Statistical Summary:")
summary_table = df.describe(include='all')
print(summary_table)

# Cardinality of the Position and Level columns
print("\nUnique Positions:")
position_col = df['Position']
print(position_col.unique())
print(f"\nNumber of unique positions: {position_col.nunique()}")
level_col = df['Level']
print(f"Number of unique levels: {level_col.nunique()}")


In [None]:
# Distribution of Salary: histogram (left panel) and box plot (right panel)

# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)

fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(12, 6))

# Left: frequency histogram of salaries in 10 equal-width bins
ax_hist.hist(df['Salary'], bins=10, edgecolor='black', alpha=0.7, color='skyblue')
ax_hist.set_title('Distribution of Salary', fontsize=14, fontweight='bold')
ax_hist.set_xlabel('Salary', fontsize=12)
ax_hist.set_ylabel('Frequency', fontsize=12)
ax_hist.grid(True, alpha=0.3)

# Right: vertical box plot of the same column
ax_box.boxplot(df['Salary'], vert=True)
ax_box.set_title('Box Plot of Salary', fontsize=14, fontweight='bold')
ax_box.set_ylabel('Salary', fontsize=12)
ax_box.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(figures_dir / 'salary_distribution.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Relationship between Level and Salary: scatter (left) and line plot (right)

# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)

fig, (ax_scatter, ax_line) = plt.subplots(1, 2, figsize=(12, 6))

# Left: one marker per row
ax_scatter.scatter(df['Level'], df['Salary'], s=100, alpha=0.7, color='coral', edgecolors='black')
ax_scatter.set_title('Level vs Salary', fontsize=14, fontweight='bold')
ax_scatter.set_xlabel('Level', fontsize=12)
ax_scatter.set_ylabel('Salary', fontsize=12)
ax_scatter.grid(True, alpha=0.3)

# Right: a line plot joins points in the order they are given, so order the
# rows by Level first; otherwise unsorted input would draw a zigzag.
by_level = df.sort_values('Level')
ax_line.plot(by_level['Level'], by_level['Salary'], marker='o', linewidth=2, markersize=8, color='green')
ax_line.set_title('Level vs Salary (Line Plot)', fontsize=14, fontweight='bold')
ax_line.set_xlabel('Level', fontsize=12)
ax_line.set_ylabel('Salary', fontsize=12)
ax_line.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(figures_dir / 'level_vs_salary.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Salary by Position: horizontal bars ordered from lowest to highest salary

# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots(figsize=(14, 8))
df_sorted = df.sort_values('Salary')
ax.barh(df_sorted['Position'], df_sorted['Salary'], color='steelblue', alpha=0.8)
ax.set_title('Salary by Position', fontsize=16, fontweight='bold')
ax.set_xlabel('Salary ($)', fontsize=12)
ax.set_ylabel('Position', fontsize=12)
ax.grid(True, alpha=0.3, axis='x')

# Add value labels just to the right of each bar end.
# NOTE(review): labels are placed by row index, which lines up with the bars
# only if every Position value is unique - confirm for other datasets.
for i, v in enumerate(df_sorted['Salary']):
    ax.text(v + 10000, i, f'${v:,}', va='center', fontsize=10)

fig.tight_layout()
fig.savefig(figures_dir / 'salary_by_position.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Correlation analysis between the two numeric columns
correlation = df[['Level', 'Salary']].corr()
print("Correlation Matrix:")
print(correlation)

# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)

# Draw onto an explicit axes so the heatmap cannot land on a stale figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=2, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Correlation Heatmap: Level vs Salary', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig(figures_dir / 'correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Comprehensive visualization: four panels on a 2x2 grid

# savefig() does not create missing directories, so make sure the output
# folder exists before the figure is written.
figures_dir = project_root / 'results' / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Salary by level (bar chart)
axes[0, 0].bar(df['Level'], df['Salary'], color='lightblue', edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Salary by Level (Bar Chart)', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('Level', fontsize=10)
axes[0, 0].set_ylabel('Salary', fontsize=10)
axes[0, 0].grid(True, alpha=0.3)

# 2. Salary distribution (density-normalised histogram)
axes[0, 1].hist(df['Salary'], bins=10, density=True, alpha=0.7, color='lightcoral', edgecolor='black')
axes[0, 1].set_title('Salary Distribution (Density)', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('Salary', fontsize=10)
axes[0, 1].set_ylabel('Density', fontsize=10)
axes[0, 1].grid(True, alpha=0.3)

# 3. Level vs Salary with a degree-1 (straight line) least-squares fit
axes[1, 0].scatter(df['Level'], df['Salary'], s=150, alpha=0.6, color='purple', edgecolors='black')
z = np.polyfit(df['Level'], df['Salary'], 1)
p = np.poly1d(z)
axes[1, 0].plot(df['Level'], p(df['Level']), "r--", alpha=0.8, linewidth=2, label='Trend Line')
axes[1, 0].set_title('Level vs Salary with Trend Line', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('Level', fontsize=10)
axes[1, 0].set_ylabel('Salary', fontsize=10)
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 4. Salary growth rate: percentage change from the previous row.
# This adds a 'Salary_Growth' column to df itself, so later cells see it.
# NOTE(review): the change is row-to-row, so it reads as "per level" only if
# rows are ordered by Level - confirm against the input file.
df['Salary_Growth'] = df['Salary'].pct_change() * 100
# The first row has no predecessor (NaN growth); skip it by position.
axes[1, 1].bar(df['Level'].iloc[1:], df['Salary_Growth'].iloc[1:], color='orange', alpha=0.7, edgecolor='black')
axes[1, 1].set_title('Salary Growth Rate by Level', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('Level', fontsize=10)
axes[1, 1].set_ylabel('Growth Rate (%)', fontsize=10)
axes[1, 1].grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig(figures_dir / 'comprehensive_analysis.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Summary Statistics: headline numbers for Salary and Level, then persist df
salary = df['Salary']
level = df['Level']

print("=" * 50)
print("EDA SUMMARY")
print("=" * 50)
# NOTE(review): the column count reflects df as it is now, including any
# columns added by earlier cells.
print(f"\nDataset Size: {df.shape[0]} rows, {df.shape[1]} columns")
print("\nSalary Statistics:")
print(f"  Mean: ${salary.mean():,.2f}")
print(f"  Median: ${salary.median():,.2f}")
print(f"  Std Dev: ${salary.std():,.2f}")
print(f"  Min: ${salary.min():,.2f}")
print(f"  Max: ${salary.max():,.2f}")
print(f"  Range: ${salary.max() - salary.min():,.2f}")

print("\nLevel Statistics:")
print(f"  Mean: {level.mean():.2f}")
print(f"  Median: {level.median():.2f}")
print(f"  Min: {level.min()}")
print(f"  Max: {level.max()}")

print(f"\nCorrelation (Level vs Salary): {level.corr(salary):.4f}")

# Save processed data
processed_path = project_root / 'data' / 'processed' / 'processed_data.csv'
# to_csv() will not create the target directory, so create it if needed
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
print(f"\nProcessed data saved to: {processed_path}")
