# Employee Salary Prediction - Exploratory Data Analysis

This notebook performs a comprehensive exploratory data analysis of employee salary data to uncover the patterns and relationships that will guide our machine learning model development.

## Objectives:
- Understand the dataset structure and characteristics
- Identify patterns and trends in employee salaries
- Explore relationships between different features
- Detect potential issues (outliers, missing values, data quality)
- Generate insights to inform feature engineering and model selection

## 1. Install and Import Required Libraries

In [None]:
# Install required packages if not already installed
import sys
import subprocess

def install_package(package):
    """Run ``pip install <package>`` with the current interpreter and report the result.

    The command is launched as ``sys.executable -m pip`` so the package is
    installed into the same environment that is executing this code. A
    non-zero pip exit status is reported on stdout rather than raised.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError:
        print(f"✗ Failed to install {package}")
    else:
        print(f"✓ {package} installed successfully")

# List of required packages (pip distribution names).
# scipy is included because a later cell imports scipy.stats for the Q-Q plot.
import importlib

required_packages = [
    'pandas', 'numpy', 'matplotlib', 'seaborn', 'plotly',
    'scikit-learn', 'scipy', 'jupyter', 'joblib'
]

# Distribution names whose importable module name differs. Without this
# mapping, importing "scikit-learn" always fails (a hyphen is not valid in a
# module name), so pip would be re-run on every execution of this cell.
import_names = {'scikit-learn': 'sklearn'}

# Install only the packages that cannot be imported
for package in required_packages:
    module_name = import_names.get(package, package)
    try:
        importlib.import_module(module_name)
    except ImportError:
        install_package(package)
    else:
        print(f"✓ {package} already installed")

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import sys

# Make the project's src/ directory importable for custom modules
sys.path.append('../src')

# Pandas display: no limit on column count, output width, or cell width
for option_name in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

# Matplotlib defaults: seaborn-flavoured style, default figure size and font size
plt.style.use('seaborn-v0_8')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Seaborn default colour palette
sns.set_palette("husl")

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

for status_line in ("✓ All libraries imported successfully!", "✓ Configuration complete!"):
    print(status_line)

## 2. Set Up Project Directory Structure and Generate Sample Data

First, let's ensure our project structure is properly set up and generate sample data if it doesn't exist.

In [None]:
# Create project directory structure
import os

# Define project directories
directories = [
    '../data',
    '../models',
    '../src',
    '../notebooks'
]

# Create directories if they don't exist
for directory in directories:
    already_present = os.path.exists(directory)
    # exist_ok=True keeps this from crashing if the directory appears between
    # the existence check above and the creation call.
    os.makedirs(directory, exist_ok=True)
    if already_present:
        print(f"✓ Directory already exists: {directory}")
    else:
        print(f"✓ Created directory: {directory}")


def generate_sample_data(n_samples=1000, seed=42):
    """Build a synthetic employee salary dataset.

    Salary is a base amount scaled by department, job title, education and
    location multipliers and by experience, performance rating, overtime and
    completed projects, plus Gaussian noise, floored at 35,000.

    Args:
        n_samples: Number of employee rows to generate.
        seed: Passed to ``np.random.seed`` so repeated calls produce the same data.

    Returns:
        A DataFrame with the columns employee_id, age, years_experience,
        department, job_title, education_level, location, performance_rating,
        overtime_hours, projects_completed and salary.
    """
    # Set random seed for reproducibility
    np.random.seed(seed)

    # Possible values for categorical variables
    departments = ['Engineering', 'Sales', 'Marketing', 'HR', 'Finance', 'Operations', 'IT']
    job_titles = ['Junior', 'Mid-Level', 'Senior', 'Lead', 'Manager', 'Director']
    education_levels = ['Bachelor', 'Master', 'PhD', 'High School']
    locations = ['New York', 'San Francisco', 'Chicago', 'Austin', 'Boston', 'Seattle', 'Remote']

    age = np.random.randint(22, 65, n_samples)
    # Experience is drawn independently of age, so cap it at (age - 18);
    # otherwise the data contains impossible rows such as a 22-year-old
    # with 34 years of experience.
    years_experience = np.minimum(np.random.randint(0, 35, n_samples), age - 18)

    frame = pd.DataFrame({
        'employee_id': range(1, n_samples + 1),
        'age': age,
        'years_experience': years_experience,
        'department': np.random.choice(departments, n_samples),
        'job_title': np.random.choice(job_titles, n_samples),
        'education_level': np.random.choice(education_levels, n_samples),
        'location': np.random.choice(locations, n_samples),
        'performance_rating': np.random.uniform(1, 5, n_samples).round(2),
        'overtime_hours': np.random.randint(0, 20, n_samples),
        'projects_completed': np.random.randint(0, 15, n_samples),
    })

    # Salary model: base amount scaled by categorical and numerical factors
    base_salary = 40000

    dept_multiplier = {
        'Engineering': 1.4, 'IT': 1.3, 'Finance': 1.2,
        'Sales': 1.1, 'Marketing': 1.0, 'HR': 0.9, 'Operations': 0.8
    }
    title_multiplier = {
        'Junior': 1.0, 'Mid-Level': 1.3, 'Senior': 1.6,
        'Lead': 1.9, 'Manager': 2.2, 'Director': 2.8
    }
    edu_multiplier = {
        'High School': 1.0, 'Bachelor': 1.2, 'Master': 1.4, 'PhD': 1.6
    }
    loc_multiplier = {
        'New York': 1.3, 'San Francisco': 1.4, 'Seattle': 1.2,
        'Boston': 1.2, 'Chicago': 1.1, 'Austin': 1.0, 'Remote': 0.9
    }

    expected_salary = (
        base_salary *
        frame['department'].map(dept_multiplier) *
        frame['job_title'].map(title_multiplier) *
        frame['education_level'].map(edu_multiplier) *
        frame['location'].map(loc_multiplier) *
        (1 + frame['years_experience'] * 0.03) *
        (1 + frame['performance_rating'] * 0.1) *
        (1 + frame['overtime_hours'] * 0.02) *
        (1 + frame['projects_completed'] * 0.05)
    )
    frame['salary'] = expected_salary.round(0).astype(int)

    # Add some random noise to make it more realistic
    frame['salary'] += np.random.normal(0, 5000, n_samples).astype(int)

    # Ensure minimum salary
    frame['salary'] = np.maximum(frame['salary'], 35000)

    return frame


# Generate sample data if it doesn't exist
data_file = '../data/employee_salary_data.csv'

if not os.path.exists(data_file):
    print("\n📊 Generating sample employee salary data...")

    df = generate_sample_data()

    # Save to CSV
    df.to_csv(data_file, index=False)
    print(f"✓ Sample data generated and saved to {data_file}")
    print(f"✓ Dataset shape: {df.shape}")
else:
    print(f"✓ Data file already exists: {data_file}")

print("\n🎉 Project setup complete!")

## 3. Load and Explore the Dataset

Let's load the employee salary dataset and perform initial exploration to understand its structure and characteristics.

In [None]:
# Read the salary CSV into the working DataFrame
df = pd.read_csv('../data/employee_salary_data.csv')

section_rule = "=" * 50
row_count, column_count = df.shape
memory_kb = df.memory_usage(deep=True).sum() / 1024

# High-level size information
print("📊 DATASET OVERVIEW")
print(section_rule)
print(f"Dataset Shape: {df.shape}")
print(f"Number of Rows: {row_count:,}")
print(f"Number of Columns: {column_count}")
print(f"Memory Usage: {memory_kb:.2f} KB")

# One line per column: dtype, distinct values, null count
print("\n📋 COLUMN INFORMATION")
print(section_rule)
for position, column_name in enumerate(df.columns, start=1):
    column = df[column_name]
    print(
        f"{position:2d}. {column_name:<20} | {str(column.dtype):<10} | "
        f"Unique: {column.nunique():4d} | Nulls: {column.isnull().sum():3d}"
    )

print("\n📈 FIRST 5 ROWS")
print(section_rule)
# Last expression of the cell, so the notebook renders the preview table
df.head()

In [None]:
# Summary statistics for the numerical columns
wide_rule = "=" * 80
print("📊 STATISTICAL SUMMARY - NUMERICAL COLUMNS")
print(wide_rule)
numerical_summary = df.describe()
print(numerical_summary)

# For each object-typed column: distinct count and the three most frequent values
print("\n📊 STATISTICAL SUMMARY - CATEGORICAL COLUMNS")
print(wide_rule)
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    top_three = df[col].value_counts().head(3).to_dict()
    print(f"\n{col.upper()}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Most common: {top_three}")

# Data quality: missing values, duplicated rows, dtypes
print("\n🔍 DATA QUALITY CHECK")
print("=" * 50)
print("Missing Values:")
missing_values = df.isnull().sum()
if missing_values.sum() != 0:
    # Show only the columns that actually contain nulls
    print(missing_values[missing_values > 0])
else:
    print("✓ No missing values found!")

print("\nDuplicate Rows:")
duplicate_count = df.duplicated().sum()
if duplicate_count != 0:
    print(f"⚠️  {duplicate_count} duplicate rows found")
else:
    print("✓ No duplicate rows found!")

print("\nData Types:")
print(df.dtypes)

## 4. Data Visualization and Distribution Analysis

Let's create comprehensive visualizations to understand the distribution of our data and identify patterns.

In [None]:
# Salary Distribution Analysis: 2x2 grid of salary histogram, salary box plot,
# age histogram and experience histogram
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Salary Distribution Analysis', fontsize=16, fontweight='bold')
(ax_salary_hist, ax_salary_box), (ax_age, ax_experience) = axes

salary = df['salary']

# Top-left: salary histogram with mean and median markers
ax_salary_hist.hist(salary, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax_salary_hist.set(title='Salary Distribution', xlabel='Salary ($)', ylabel='Frequency')
ax_salary_hist.axvline(salary.mean(), color='red', linestyle='--', label=f'Mean: ${salary.mean():,.0f}')
ax_salary_hist.axvline(salary.median(), color='green', linestyle='--', label=f'Median: ${salary.median():,.0f}')
ax_salary_hist.legend()
ax_salary_hist.grid(True, alpha=0.3)

# Top-right: salary box plot
ax_salary_box.boxplot(salary)
ax_salary_box.set(title='Salary Box Plot', ylabel='Salary ($)')
ax_salary_box.grid(True, alpha=0.3)

# Bottom row: age and experience histograms, each with a mean marker
bottom_row = [
    (ax_age, 'age', 'Age Distribution', 'Age', 'lightcoral'),
    (ax_experience, 'years_experience', 'Years of Experience Distribution',
     'Years of Experience', 'lightgreen'),
]
for ax, column, title, xlabel, colour in bottom_row:
    values = df[column]
    ax.hist(values, bins=20, alpha=0.7, color=colour, edgecolor='black')
    ax.set(title=title, xlabel=xlabel, ylabel='Frequency')
    ax.axvline(values.mean(), color='red', linestyle='--', label=f'Mean: {values.mean():.1f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Salary statistics table; labels carry their own padding so values line up
lower_quartile = salary.quantile(0.25)
upper_quartile = salary.quantile(0.75)
salary_figures = [
    ("Mean Salary:     ", salary.mean()),
    ("Median Salary:   ", salary.median()),
    ("Min Salary:      ", salary.min()),
    ("Max Salary:      ", salary.max()),
    ("Std Deviation:   ", salary.std()),
    ("25th Percentile: ", lower_quartile),
    ("75th Percentile: ", upper_quartile),
    ("IQR:             ", upper_quartile - lower_quartile),
]
print("💰 SALARY STATISTICS")
print("=" * 40)
for label, amount in salary_figures:
    print(f"{label}${amount:,.2f}")

In [None]:
# Categorical Variables Analysis: one bar chart of category counts per column
categorical_cols = ['department', 'job_title', 'education_level', 'location']

fig, axes = plt.subplots(2, 2, figsize=(20, 15))
fig.suptitle('Categorical Variables Distribution', fontsize=16, fontweight='bold')

# axes.flat walks the grid row by row, matching the order of categorical_cols
for ax, col in zip(axes.flat, categorical_cols):
    counts = df[col].value_counts()

    ax.bar(counts.index, counts.values, alpha=0.7)
    ax.set_title(f'{col.replace("_", " ").title()} Distribution')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

    # Write each count just above its bar
    for bar_position, bar_height in enumerate(counts.values):
        ax.text(bar_position, bar_height + 5, str(bar_height), ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Text summary: distinct values, the mode, and the three largest categories
print("📊 CATEGORICAL VARIABLES SUMMARY")
print("=" * 60)
for col in categorical_cols:
    counts = df[col].value_counts()
    most_common = df[col].mode().iloc[0]
    print(f"\n{col.upper().replace('_', ' ')}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Most common: {most_common} ({counts.iloc[0]} occurrences)")
    print(f"  Distribution: {dict(counts.head(3))}")

## 5. Correlation Analysis and Feature Relationships

Let's analyze the relationships between different features and their correlation with salary.

In [None]:
# Correlation Matrix for Numerical Variables
# employee_id is a row identifier rather than a feature, so it is left out of
# the correlation analysis (errors='ignore' tolerates datasets without it).
numerical_cols = df.select_dtypes(include=[np.number]).columns.drop('employee_id', errors='ignore')
correlation_matrix = df[numerical_cols].corr()

# Create correlation heatmap; the upper triangle (and diagonal) is masked
# because the matrix is symmetric
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix,
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            mask=mask,
            fmt='.2f',
            cbar_kws={"shrink": .8})
plt.title('Correlation Matrix - Numerical Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Print strongest correlations with salary (by absolute value).
# The self-correlation is removed by label rather than by position so the
# result does not depend on sort order or ties.
print("🔗 STRONGEST CORRELATIONS WITH SALARY")
print("=" * 50)
salary_corr = correlation_matrix['salary'].drop('salary').abs().sort_values(ascending=False)
for feature, corr in salary_corr.items():
    print(f"{feature:<20}: {corr:.3f}")

# Scatter plots for key relationships
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Key Relationships with Salary', fontsize=16, fontweight='bold')

# (column, x-axis label, panel title, marker colour) for each panel, row by row
scatter_panels = [
    ('years_experience', 'Years of Experience', 'Experience vs Salary', 'blue'),
    ('age', 'Age', 'Age vs Salary', 'green'),
    ('performance_rating', 'Performance Rating', 'Performance Rating vs Salary', 'orange'),
    ('projects_completed', 'Projects Completed', 'Projects Completed vs Salary', 'purple'),
]

for ax, (column, xlabel, title, colour) in zip(axes.flat, scatter_panels):
    ax.scatter(df[column], df['salary'], alpha=0.6, color=colour)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Salary ($)')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

    # Linear trend line. It is drawn between the two x extremes only: plotting
    # it through every (unsorted) data point makes the dashed line retrace
    # itself many times, which renders as a smeared, near-solid line.
    trend = np.poly1d(np.polyfit(df[column], df['salary'], 1))
    x_extent = np.array([df[column].min(), df[column].max()])
    ax.plot(x_extent, trend(x_extent), "r--", alpha=0.8)

plt.tight_layout()
plt.show()

In [None]:
# Salary Analysis by Categorical Variables
fig, axes = plt.subplots(2, 2, figsize=(18, 15))

categorical_cols = ['department', 'job_title', 'education_level', 'location']

# One grouped box plot per categorical column, filling the grid row by row
for ax, col in zip(axes.flat, categorical_cols):
    display_name = col.replace("_", " ").title()

    df.boxplot(column='salary', by=col, ax=ax)
    ax.set_title(f'Salary by {display_name}')
    ax.set_xlabel(display_name)
    ax.set_ylabel('Salary ($)')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(True, alpha=0.3)

# Set the figure title after the loop: DataFrame.boxplot(by=...) replaces the
# figure suptitle with "Boxplot grouped by <column>" on every call, so a title
# set before plotting would be lost.
fig.suptitle('Salary Analysis by Categorical Variables', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

# Print detailed salary statistics by category
print("💰 SALARY ANALYSIS BY CATEGORICAL VARIABLES")
print("=" * 80)

for col in categorical_cols:
    print(f"\n{col.upper().replace('_', ' ')}:")
    print("-" * 50)

    # Per-category statistics, highest mean salary first
    salary_stats = df.groupby(col)['salary'].agg(['mean', 'median', 'std', 'count']).round(2)
    salary_stats = salary_stats.sort_values('mean', ascending=False)

    print(f"{'Category':<15} | {'Mean':<10} | {'Median':<10} | {'Std':<10} | {'Count':<8}")
    print("-" * 70)

    # The row variable is deliberately not called `stats`, so it cannot
    # overwrite the scipy.stats alias imported elsewhere in this notebook.
    for category, row in salary_stats.iterrows():
        print(f"{category:<15} | ${row['mean']:>8,.0f} | ${row['median']:>8,.0f} | ${row['std']:>8,.0f} | {row['count']:>5.0f}")

    # Spread between the best- and worst-paid category, absolute and relative
    highest_mean = salary_stats['mean'].max()
    lowest_mean = salary_stats['mean'].min()
    salary_range = highest_mean - lowest_mean
    print(f"\nSalary Range: ${salary_range:,.0f} (difference between highest and lowest mean)")

    pct_diff = (salary_range / lowest_mean) * 100
    print(f"Percentage Difference: {pct_diff:.1f}%")

## 6. Outlier Detection and Data Quality Assessment

Let's identify potential outliers and assess the overall data quality for our machine learning models.

In [None]:
# Comprehensive Salary Analysis with Visualizations
import scipy.stats as stats

# Create a figure with multiple subplots for salary analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

salary = df['salary']

# 1. Histogram with mean and median markers
axes[0,0].hist(salary, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].set_title('Salary Distribution')
axes[0,0].set_xlabel('Salary ($)')
axes[0,0].set_ylabel('Frequency')
axes[0,0].axvline(salary.mean(), color='red', linestyle='--', label=f'Mean: ${salary.mean():,.0f}')
axes[0,0].axvline(salary.median(), color='green', linestyle='--', label=f'Median: ${salary.median():,.0f}')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# 2. Box plot for outlier detection
axes[0,1].boxplot(salary)
axes[0,1].set_title('Salary Box Plot (Outlier Detection)')
axes[0,1].set_ylabel('Salary ($)')
axes[0,1].grid(True, alpha=0.3)

# 3. Q-Q plot against a normal distribution
stats.probplot(salary, dist="norm", plot=axes[1,0])
axes[1,0].set_title('Q-Q Plot (Normality Test)')
axes[1,0].grid(True, alpha=0.3)

# 4. Salary by Department
df.boxplot(column='salary', by='department', ax=axes[1,1])
axes[1,1].set_title('Salary by Department')
axes[1,1].set_xlabel('Department')
axes[1,1].set_ylabel('Salary ($)')
axes[1,1].tick_params(axis='x', rotation=45)

# Set the figure title last: DataFrame.boxplot(by=...) replaces the figure
# suptitle with "Boxplot grouped by department", so a title set before the
# plots are drawn would be overwritten.
fig.suptitle('🔍 Comprehensive Salary Distribution Analysis', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

# Print detailed salary statistics
print("📊 DETAILED SALARY STATISTICS")
print("=" * 60)
print(f"Mean Salary: ${salary.mean():,.2f}")
print(f"Median Salary: ${salary.median():,.2f}")
print(f"Standard Deviation: ${salary.std():,.2f}")
print(f"Minimum Salary: ${salary.min():,.2f}")
print(f"Maximum Salary: ${salary.max():,.2f}")
print(f"Range: ${salary.max() - salary.min():,.2f}")
print(f"Coefficient of Variation: {salary.std() / salary.mean():.3f}")
print(f"Skewness: {salary.skew():.3f}")
print(f"Kurtosis: {salary.kurtosis():.3f}")

# Percentile analysis
percentiles = [10, 25, 50, 75, 90, 95, 99]
print("\n📈 PERCENTILE ANALYSIS")
print("=" * 40)
for pct in percentiles:
    value = np.percentile(salary, pct)
    print(f"  {pct:2d}th percentile: ${value:8,.0f}")

# Outlier detection using IQR method: anything more than 1.5 * IQR outside
# the quartiles is flagged
Q1 = salary.quantile(0.25)
Q3 = salary.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = df[(salary < lower_bound) | (salary > upper_bound)]

print("\n🚨 OUTLIER DETECTION (IQR Method)")
print("=" * 40)
print(f"Lower Bound: ${lower_bound:,.2f}")
print(f"Upper Bound: ${upper_bound:,.2f}")
print(f"Number of Outliers: {len(outliers)}")
print(f"Percentage of Outliers: {len(outliers)/len(df)*100:.2f}%")

In [None]:
# Correlation Analysis
print("🔗 CORRELATION ANALYSIS")
print("=" * 60)

# Create correlation matrix for numerical features.
# employee_id is a row identifier rather than a feature, so it is excluded
# (errors='ignore' tolerates datasets without that column).
numerical_cols = df.select_dtypes(include=[np.number]).columns.drop('employee_id', errors='ignore')
correlation_matrix = df[numerical_cols].corr()

# Plot correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.2f', cbar_kws={"shrink": .8})
plt.title('🔥 Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()


def describe_correlation(corr_value):
    """Return a '<strength> <direction>' label for a correlation coefficient.

    Strength is based on the absolute value: above 0.8 "Very Strong", above
    0.6 "Strong", above 0.4 "Moderate", otherwise "Weak". Direction is
    "Positive" for values above zero and "Negative" otherwise.
    """
    magnitude = abs(corr_value)
    if magnitude > 0.8:
        strength = "Very Strong"
    elif magnitude > 0.6:
        strength = "Strong"
    elif magnitude > 0.4:
        strength = "Moderate"
    else:
        strength = "Weak"
    direction = "Positive" if corr_value > 0 else "Negative"
    return f"{strength} {direction}"


def salary_summary_by(column):
    """Return mean/median/count of salary per category of `column`, highest mean first."""
    grouped = df.groupby(column)['salary'].agg(['mean', 'median', 'count'])
    return grouped.sort_values('mean', ascending=False)


# Print correlation with salary
salary_corr = correlation_matrix['salary'].sort_values(ascending=False)
print("\n📊 CORRELATION WITH SALARY")
print("=" * 40)
for feature, corr_value in salary_corr.drop('salary').items():
    print(f"{feature:<20}: {corr_value:6.3f} ({describe_correlation(corr_value)})")

# Summary and Key Insights
print("\n" + "="*80)
print("🎯 KEY INSIGHTS FROM EXPLORATORY DATA ANALYSIS")
print("="*80)

# Per-category salary tables used by the insights below. These were referenced
# but never defined, so the cell previously stopped with a NameError.
dept_salary = salary_summary_by('department')
edu_salary = salary_summary_by('education_level')
loc_salary = salary_summary_by('location')

insights = []

# Salary insights
avg_salary = df['salary'].mean()
insights.append(f"• Average salary across all employees: ${avg_salary:,.0f}")

# Department insights (tables are sorted by mean salary, highest first)
highest_dept = dept_salary.index[0]
lowest_dept = dept_salary.index[-1]
insights.append(f"• Highest paying department: {highest_dept} (${dept_salary.loc[highest_dept, 'mean']:,.0f})")
insights.append(f"• Lowest paying department: {lowest_dept} (${dept_salary.loc[lowest_dept, 'mean']:,.0f})")

# Experience insights
exp_corr = correlation_matrix.loc['years_experience', 'salary']
insights.append(f"• Years of experience shows {exp_corr:.3f} correlation with salary")

# Performance insights
perf_corr = correlation_matrix.loc['performance_rating', 'salary']
insights.append(f"• Performance rating shows {perf_corr:.3f} correlation with salary")

# Age insights
age_corr = correlation_matrix.loc['age', 'salary']
insights.append(f"• Age shows {age_corr:.3f} correlation with salary")

# Education insights (skipped when either category is absent from the data)
phd_avg = edu_salary['mean'].get('PhD', 0)
hs_avg = edu_salary['mean'].get('High School', 0)
if phd_avg > 0 and hs_avg > 0:
    edu_premium = ((phd_avg - hs_avg) / hs_avg) * 100
    insights.append(f"• PhD holders earn {edu_premium:.1f}% more than High School graduates")

# Location insights (skipped when either category is absent from the data)
sf_avg = loc_salary['mean'].get('San Francisco', 0)
remote_avg = loc_salary['mean'].get('Remote', 0)
if sf_avg > 0 and remote_avg > 0:
    location_premium = ((sf_avg - remote_avg) / remote_avg) * 100
    insights.append(f"• San Francisco employees earn {location_premium:.1f}% more than remote workers")

# Print insights
for insight in insights:
    print(insight)

print("\n" + "="*80)
print("📝 RECOMMENDATIONS FOR MODEL DEVELOPMENT")
print("="*80)

# The distribution and outlier recommendations are derived from the data
# instead of being asserted. Thresholds are rules of thumb: |skewness| below
# 0.5 is treated as roughly symmetric, and fewer than 5% IQR outliers as low.
salary_skew = df['salary'].skew()
lower_quartile, upper_quartile = df['salary'].quantile([0.25, 0.75])
iqr = upper_quartile - lower_quartile
is_outlier = (
    (df['salary'] < lower_quartile - 1.5 * iqr)
    | (df['salary'] > upper_quartile + 1.5 * iqr)
)
outlier_pct = is_outlier.mean() * 100

if abs(salary_skew) < 0.5:
    distribution_rec = (f"• Salary distribution is roughly symmetric (skewness {salary_skew:.2f})"
                        " - linear models may work well")
else:
    distribution_rec = (f"• Salary distribution is skewed (skewness {salary_skew:.2f})"
                        " - consider log-transforming the target")

if outlier_pct < 5:
    outlier_rec = (f"• Only {outlier_pct:.1f}% of salaries are IQR outliers"
                   " - no special outlier handling appears necessary")
else:
    outlier_rec = (f"• {outlier_pct:.1f}% of salaries are IQR outliers"
                   " - consider robust models or capping extreme values")

# NOTE(review): the remaining recommendations are static guidance; read them
# against the correlations and group statistics printed above.
recommendations = [
    "• Include all categorical variables (department, job_title, education, location) as features",
    "• Years of experience is a strong predictor - consider feature engineering",
    "• Performance rating shows good correlation - keep as important feature",
    "• Consider creating interaction features between experience and education",
    "• Department and job title may benefit from target encoding",
    "• Location premium suggests geographical factors are important",
    "• Consider polynomial features for experience and age",
    distribution_rec,
    outlier_rec,
]

for rec in recommendations:
    print(rec)

print("\n🎉 EXPLORATORY DATA ANALYSIS COMPLETE!")