# Comprehensive Exploratory Data Analysis
## PES University Placement Data (2022-2026)

**Project:** Temporal and Statistical Data Driven Insights into Talent Acquisition

**Objective:** Perform comprehensive EDA to understand placement patterns, compensation trends, and identify key insights for students.

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import json

# Silence all warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Plot defaults applied to every figure created after this cell.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 7)
plt.rcParams['font.size'] = 10

# Create the output directory (no error if it already exists).
Path('analysis_outputs').mkdir(exist_ok=True)

# The status marker used to be mojibake ("U+2713 CHECK MARK" decoded with the
# wrong codec); it is restored to the intended character here.
print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## 1. Data Loading and Initial Inspection

In [None]:
# Read the consolidated placement CSV into the notebook-wide DataFrame `df`
df = pd.read_csv('processed_data/consolidated_placement_data.csv')

# One entry per output line; entries that start with "\n" open a new paragraph
overview_lines = (
    f"Dataset Shape: {df.shape}",
    f"\nTotal Records: {len(df):,}",
    f"Total Features: {len(df.columns)}",
    f"\nDate Range: {df['batch_year'].min()} - {df['batch_year'].max()}",
    f"Unique Companies: {df['company_name'].nunique():,}",
    f"Unique Colleges: {df['college'].nunique()}",
)
print(*overview_lines, sep="\n")

# Last expression of the cell: the notebook renders the first five rows
df.head(5)

In [None]:
# Print a banner, then let pandas print its own per-column summary
print("Dataset Information:", "=" * 80, sep="\n")
df.info()

In [None]:
# Descriptive statistics for every column, transposed so each feature is a row
df.describe(include='all').transpose()

## 2. Data Quality Assessment

In [None]:
# Per-column missing-value report, worst columns first
null_counts = df.isnull().sum()
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df) * 100).round(2),
    'Data_Type': df.dtypes,
})
missing_data = missing_data.sort_values('Missing_Percentage', ascending=False)

print("Missing Value Analysis:", "=" * 80, sep="\n")

# Last expression of the cell: show only the columns that have gaps
missing_data.loc[missing_data['Missing_Count'] > 0]

In [None]:
# Horizontal bar chart of (at most) the first 15 columns that have missing values
plt.figure(figsize=(14, 8))
has_missing = missing_data['Missing_Percentage'] > 0
missing_cols = missing_data.loc[has_missing].head(15)

# seaborn draws on the current figure and hands back its Axes
ax = sns.barplot(data=missing_cols, y='Column', x='Missing_Percentage', palette='viridis')
ax.set_title('Top 15 Columns with Missing Data', fontsize=14, fontweight='bold')
ax.set_xlabel('Missing Percentage (%)')
ax.set_ylabel('Column Name')

plt.tight_layout()
plt.savefig('analysis_outputs/missing_data_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Temporal Analysis: Year-wise Distribution

In [None]:
# Count placement records per batch year, ordered by year
year_dist = df['batch_year'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: bar chart with each count written just above its bar
bar_ax.bar(year_dist.index, year_dist.values, color='steelblue', edgecolor='black')
bar_ax.set_xlabel('Batch Year', fontsize=12)
bar_ax.set_ylabel('Number of Records', fontsize=12)
bar_ax.set_title('Placement Records by Year', fontsize=14, fontweight='bold')
bar_ax.grid(axis='y', alpha=0.3)
for bar_year, bar_height in zip(year_dist.index, year_dist.values):
    bar_ax.text(bar_year, bar_height + 20, str(bar_height), ha='center', fontweight='bold')

# Right panel: share of all records contributed by each year
colors = plt.cm.Set3(range(len(year_dist)))
pie_ax.pie(
    year_dist.values,
    labels=year_dist.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
    textprops={'fontsize': 11},
)
pie_ax.set_title('Year-wise Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('analysis_outputs/yearly_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Text summary of the same counts
print("\nYear-wise Statistics:")
total_rows = len(df)
for year, count in year_dist.items():
    pct = count / total_rows * 100
    print(f"  {year}: {count:4d} records ({pct:5.1f}%)")

## 4. Placement Tier Analysis

In [None]:
# Count records per placement tier (most frequent tier first)
tier_dist = df['placement_tier'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: horizontal bars of absolute counts
tier_dist.plot.barh(ax=bar_ax, color='coral', edgecolor='black')
bar_ax.set_xlabel('Number of Records', fontsize=12)
bar_ax.set_ylabel('Placement Tier', fontsize=12)
bar_ax.set_title('Placement Distribution by Tier', fontsize=14, fontweight='bold')
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: the same counts as proportions
pie_ax.pie(
    tier_dist.values,
    labels=tier_dist.index,
    autopct='%1.1f%%',
    startangle=45,
    textprops={'fontsize': 9},
)
pie_ax.set_title('Tier Distribution Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('analysis_outputs/tier_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Text summary of the same counts
print("\nTier-wise Statistics:")
total_rows = len(df)
for tier, count in tier_dist.items():
    pct = count / total_rows * 100
    print(f"  {tier:25s}: {count:4d} records ({pct:5.1f}%)")

## 5. Compensation Analysis

In [None]:
# Keep only records with a usable CTC: present and strictly positive.
# `df_ctc` is reused by the later compensation, company and role cells.
df_ctc = df[df['total_ctc'].notna() & (df['total_ctc'] > 0)].copy()
ctc_values = df_ctc['total_ctc']

print(f"Records with CTC data: {len(df_ctc):,} ({len(df_ctc)/len(df)*100:.1f}%)")
print("\nCTC Statistics (LPA):")
print("=" * 60)

# The currency symbol was mojibake in every line below (U+20B9 RUPEE SIGN
# decoded with the wrong codec); it is restored to the intended character.
print(f"Mean:        ₹{ctc_values.mean():8.2f}")
print(f"Median:      ₹{ctc_values.median():8.2f}")
print(f"Std Dev:     ₹{ctc_values.std():8.2f}")
print(f"Min:         ₹{ctc_values.min():8.2f}")
print(f"Max:         ₹{ctc_values.max():8.2f}")

# Blank separator line, then the selected percentiles
print()
for percentile in (25, 50, 75, 90, 95):
    print(f"{percentile}th %ile:   ₹{ctc_values.quantile(percentile / 100):8.2f}")

In [None]:
# CTC distribution visualizations: a 2x2 grid showing four views of the same column
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
ctc_values = df_ctc['total_ctc']
mean_ctc = ctc_values.mean()
median_ctc = ctc_values.median()

# Top-left: histogram over the full range, with mean/median reference lines.
# The legend labels carried a mojibake currency symbol; it is restored to
# U+20B9 RUPEE SIGN here.
full_hist_ax = axes[0, 0]
full_hist_ax.hist(ctc_values, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
full_hist_ax.set_xlabel('Total CTC (LPA)', fontsize=11)
full_hist_ax.set_ylabel('Frequency', fontsize=11)
full_hist_ax.set_title('CTC Distribution - All Values', fontsize=13, fontweight='bold')
full_hist_ax.axvline(mean_ctc, color='red', linestyle='--', linewidth=2,
                     label=f'Mean: ₹{mean_ctc:.2f}')
full_hist_ax.axvline(median_ctc, color='green', linestyle='--', linewidth=2,
                     label=f'Median: ₹{median_ctc:.2f}')
full_hist_ax.legend()
full_hist_ax.grid(alpha=0.3)

# Top-right: histogram restricted to values below 50 LPA for a clearer view
df_ctc_filtered = df_ctc[df_ctc['total_ctc'] < 50]
zoom_hist_ax = axes[0, 1]
zoom_hist_ax.hist(df_ctc_filtered['total_ctc'], bins=40, color='lightcoral',
                  edgecolor='black', alpha=0.7)
zoom_hist_ax.set_xlabel('Total CTC (LPA)', fontsize=11)
zoom_hist_ax.set_ylabel('Frequency', fontsize=11)
zoom_hist_ax.set_title('CTC Distribution - Below 50 LPA', fontsize=13, fontweight='bold')
zoom_hist_ax.grid(alpha=0.3)

# Bottom-left: box plot (patch_artist=True is required for the fill colour)
box_ax = axes[1, 0]
box_ax.boxplot(ctc_values, vert=True, patch_artist=True,
               boxprops=dict(facecolor='lightgreen', alpha=0.7),
               medianprops=dict(color='red', linewidth=2))
box_ax.set_ylabel('Total CTC (LPA)', fontsize=11)
box_ax.set_title('CTC Box Plot - Identifying Outliers', fontsize=13, fontweight='bold')
box_ax.grid(alpha=0.3)

# Bottom-right: violin plot with mean and median markers
violin_ax = axes[1, 1]
parts = violin_ax.violinplot([ctc_values.values], vert=True, showmeans=True, showmedians=True)
for body in parts['bodies']:
    body.set_facecolor('plum')
    body.set_alpha(0.7)
violin_ax.set_ylabel('Total CTC (LPA)', fontsize=11)
violin_ax.set_title('CTC Violin Plot - Distribution Shape', fontsize=13, fontweight='bold')
violin_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('analysis_outputs/ctc_distribution_detailed.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Per-year CTC summary (mean, median, std, record count), rounded to 2 decimals
yearly_ctc = df_ctc.groupby('batch_year')['total_ctc'].agg(['mean', 'median', 'std', 'count']).round(2)

print("\nYear-wise CTC Statistics:", "=" * 80, yearly_ctc, sep="\n")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
trend_ax, spread_ax = axes

# Left panel: mean and median lines, with the area under the mean shaded
trend_lines = (
    ('mean', 'o', 'Mean CTC', 'blue'),
    ('median', 's', 'Median CTC', 'green'),
)
for stat_column, marker_shape, legend_label, line_color in trend_lines:
    trend_ax.plot(yearly_ctc.index, yearly_ctc[stat_column], marker=marker_shape,
                  linewidth=2, markersize=8, label=legend_label, color=line_color)
trend_ax.fill_between(yearly_ctc.index, yearly_ctc['mean'], alpha=0.2, color='blue')
trend_ax.set_xlabel('Batch Year', fontsize=12)
trend_ax.set_ylabel('CTC (LPA)', fontsize=12)
trend_ax.set_title('Average CTC Trend Over Years', fontsize=14, fontweight='bold')
trend_ax.legend(fontsize=11)
trend_ax.grid(alpha=0.3)

# Right panel: mean per year as bars, standard deviation as error bars.
# Bars sit at positions 0..n-1 and are labelled with the years afterwards.
positions = range(len(yearly_ctc))
spread_ax.bar(positions, yearly_ctc['mean'], yerr=yearly_ctc['std'],
              color='teal', alpha=0.7, capsize=5, edgecolor='black')
spread_ax.set_xticks(positions)
spread_ax.set_xticklabels(yearly_ctc.index)
spread_ax.set_xlabel('Batch Year', fontsize=12)
spread_ax.set_ylabel('Average CTC (LPA)', fontsize=12)
spread_ax.set_title('Mean CTC with Std Deviation', fontsize=14, fontweight='bold')
spread_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('analysis_outputs/yearly_ctc_trends.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Per-tier CTC summary, highest mean first
tier_stats = df_ctc.groupby('placement_tier')['total_ctc'].agg(['mean', 'median', 'count'])
tier_ctc = tier_stats.sort_values('mean', ascending=False)

print("\nTier-wise CTC Statistics:", "=" * 80, tier_ctc, sep="\n")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
mean_ax, box_ax = axes

# Left panel: average CTC per tier
tier_ctc['mean'].plot.barh(ax=mean_ax, color='purple', edgecolor='black')
mean_ax.set_xlabel('Average CTC (LPA)', fontsize=12)
mean_ax.set_ylabel('Placement Tier', fontsize=12)
mean_ax.set_title('Average CTC by Placement Tier', fontsize=14, fontweight='bold')
mean_ax.grid(axis='x', alpha=0.3)

# Right panel: full CTC distribution per tier, in the same tier order as the table
tier_order = list(tier_ctc.index)
in_known_tier = df_ctc['placement_tier'].isin(tier_order)
df_ctc_tier = df_ctc[in_known_tier]
sns.boxplot(
    data=df_ctc_tier,
    y='placement_tier',
    x='total_ctc',
    order=tier_order,
    ax=box_ax,
    palette='Set2',
)
box_ax.set_xlabel('Total CTC (LPA)', fontsize=12)
box_ax.set_ylabel('Placement Tier', fontsize=12)
box_ax.set_title('CTC Distribution by Tier', fontsize=14, fontweight='bold')
box_ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('analysis_outputs/tier_wise_ctc.png', dpi=300, bbox_inches='tight')
plt.show()

## 6. Company Analysis

In [None]:
# Rank companies by total offers, using only rows with a positive offer count
has_offers = df['num_offers_total'].notna() & (df['num_offers_total'] > 0)
df_offers = df[has_offers]
offers_per_company = df_offers.groupby('company_name')['num_offers_total'].sum()

# Stored in ascending order so the largest recruiter ends up at the top of the
# horizontal bar chart (and is the last element of the Series).
top_recruiters = offers_per_company.nlargest(20).sort_values()

print("Top 20 Recruiters by Number of Offers:")
print("=" * 80)
ranked_recruiters = top_recruiters.sort_values(ascending=False)
for idx, (company, offers) in enumerate(ranked_recruiters.items(), start=1):
    print(f"{idx:2d}. {company:40s}: {offers:5.0f} offers")

plt.figure(figsize=(14, 10))
ax = top_recruiters.plot.barh(color='teal', edgecolor='black')
ax.set_xlabel('Total Number of Offers', fontsize=12)
ax.set_ylabel('Company Name', fontsize=12)
ax.set_title('Top 20 Recruiting Companies by Offer Count', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('analysis_outputs/top_recruiters.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Top companies by average CTC (minimum 2 placements for reliability)
# Named aggregation yields flat, already-named columns in a single step.
company_ctc = (
    df_ctc.groupby('company_name')['total_ctc']
    .agg(avg_ctc='mean', median_ctc='median', count='count')
    .reset_index()
)
company_ctc = company_ctc[company_ctc['count'] >= 2]  # At least 2 placements

# Stored in ascending order so the best-paying company ends up at the top of
# the horizontal bar chart (and is the last row of the frame).
top_paying = company_ctc.nlargest(20, 'avg_ctc').sort_values('avg_ctc')

print("\nTop 20 Highest Paying Companies (min 2 placements):")
print("=" * 80)
# Number the rows by their position in the ranking. The row label cannot be
# used for this: after reset_index() it is the company's position in the
# grouped (name-sorted) table, not its rank by pay.
ranked_paying = top_paying.sort_values('avg_ctc', ascending=False)
for rank, (_, row) in enumerate(ranked_paying.iterrows(), start=1):
    # Currency symbol restored from mojibake to U+20B9 RUPEE SIGN.
    print(f"{rank:2d}. {row['company_name']:40s}: ₹{row['avg_ctc']:7.2f} LPA (n={int(row['count'])})")

plt.figure(figsize=(14, 10))
bar_positions = range(len(top_paying))
plt.barh(bar_positions, top_paying['avg_ctc'], color='gold', edgecolor='black')
plt.yticks(bar_positions, top_paying['company_name'])
plt.xlabel('Average CTC (LPA)', fontsize=12)
plt.ylabel('Company Name', fontsize=12)
plt.title('Top 20 Highest Paying Companies', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('analysis_outputs/top_paying_companies.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Role Type Analysis

In [None]:
# Count records per role type (most frequent role first)
role_dist = df['role_type'].value_counts()

print("Role Type Distribution:")
print("=" * 80)
total_rows = len(df)
for role, count in role_dist.items():
    pct = count / total_rows * 100
    print(f"{role:30s}: {count:4d} ({pct:5.1f}%)")

fig, axes = plt.subplots(1, 2, figsize=(16, 8))
bar_ax, pie_ax = axes

# Left panel: every role type as a horizontal bar
role_dist.plot.barh(ax=bar_ax, color='mediumpurple', edgecolor='black')
bar_ax.set_xlabel('Number of Records', fontsize=12)
bar_ax.set_ylabel('Role Type', fontsize=12)
bar_ax.set_title('Job Role Distribution', fontsize=14, fontweight='bold')
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: proportions among the ten most frequent roles only
top_roles = role_dist.head(10)
pie_ax.pie(
    top_roles.values,
    labels=top_roles.index,
    autopct='%1.1f%%',
    startangle=45,
    textprops={'fontsize': 9},
)
pie_ax.set_title('Top 10 Role Types Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('analysis_outputs/role_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Per-role CTC summary, highest mean first
role_stats = df_ctc.groupby('role_type')['total_ctc'].agg(['mean', 'median', 'count'])
role_ctc = role_stats.sort_values('mean', ascending=False)

print("\nRole-wise CTC Statistics:", "=" * 80, role_ctc, sep="\n")

plt.figure(figsize=(14, 8))
ax = role_ctc['mean'].plot.barh(color='orange', edgecolor='black')
ax.set_xlabel('Average CTC (LPA)', fontsize=12)
ax.set_ylabel('Role Type', fontsize=12)
ax.set_title('Average CTC by Role Type', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig('analysis_outputs/role_wise_ctc.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. CGPA Cutoff Analysis

In [None]:
# CGPA analysis: work only with records that state a CGPA cutoff
df_cgpa = df.dropna(subset=['cgpa_cutoff'])

print(f"Records with CGPA cutoff: {len(df_cgpa):,} ({len(df_cgpa)/len(df)*100:.1f}%)")
print("\nCGPA Cutoff Statistics:")
print("=" * 60)
print(f"Mean:    {df_cgpa['cgpa_cutoff'].mean():.2f}")
print(f"Median:  {df_cgpa['cgpa_cutoff'].median():.2f}")
print(f"Std Dev: {df_cgpa['cgpa_cutoff'].std():.2f}")
print(f"Min:     {df_cgpa['cgpa_cutoff'].min():.2f}")
print(f"Max:     {df_cgpa['cgpa_cutoff'].max():.2f}")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
hist_ax, tier_ax = axes[0]
scatter_ax, year_ax = axes[1]

# Top-left: cutoff histogram with the mean marked
hist_ax.hist(df_cgpa['cgpa_cutoff'], bins=20, color='lightblue', edgecolor='black')
hist_ax.set_xlabel('CGPA Cutoff', fontsize=11)
hist_ax.set_ylabel('Frequency', fontsize=11)
hist_ax.set_title('CGPA Cutoff Distribution', fontsize=13, fontweight='bold')
hist_ax.axvline(
    df_cgpa['cgpa_cutoff'].mean(),
    color='red',
    linestyle='--',
    linewidth=2,
    label=f'Mean: {df_cgpa["cgpa_cutoff"].mean():.2f}',
)
hist_ax.legend()
hist_ax.grid(alpha=0.3)

# Top-right: mean cutoff per placement tier, highest first
tier_cgpa = df_cgpa.groupby('placement_tier')['cgpa_cutoff'].mean().sort_values(ascending=False)
tier_cgpa.plot.barh(ax=tier_ax, color='salmon', edgecolor='black')
tier_ax.set_xlabel('Average CGPA Cutoff', fontsize=11)
tier_ax.set_ylabel('Placement Tier', fontsize=11)
tier_ax.set_title('Average CGPA Cutoff by Tier', fontsize=13, fontweight='bold')
tier_ax.grid(axis='x', alpha=0.3)

# Bottom-left: cutoff vs CTC, drawn only when some rows carry both values
has_cutoff = df['cgpa_cutoff'].notna()
has_positive_ctc = df['total_ctc'].notna() & (df['total_ctc'] > 0)
df_both = df[has_cutoff & has_positive_ctc]
if not df_both.empty:
    scatter_ax.scatter(df_both['cgpa_cutoff'], df_both['total_ctc'], alpha=0.5, color='green')
    scatter_ax.set_xlabel('CGPA Cutoff', fontsize=11)
    scatter_ax.set_ylabel('Total CTC (LPA)', fontsize=11)
    scatter_ax.set_title('CGPA Cutoff vs CTC', fontsize=13, fontweight='bold')
    scatter_ax.grid(alpha=0.3)

    # Annotate the panel with the correlation coefficient (axes-relative position)
    corr = df_both['cgpa_cutoff'].corr(df_both['total_ctc'])
    scatter_ax.text(
        0.05,
        0.95,
        f'Correlation: {corr:.3f}',
        transform=scatter_ax.transAxes,
        fontsize=11,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
    )

# Bottom-right: cutoff distribution per batch year, limited to 2022-2026
study_years = list(range(2022, 2027))
df_cgpa_valid = df_cgpa[df_cgpa['batch_year'].isin(study_years)]
sns.boxplot(data=df_cgpa_valid, x='batch_year', y='cgpa_cutoff', ax=year_ax, palette='Set3')
year_ax.set_xlabel('Batch Year', fontsize=11)
year_ax.set_ylabel('CGPA Cutoff', fontsize=11)
year_ax.set_title('CGPA Cutoff Distribution by Year', fontsize=13, fontweight='bold')
year_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('analysis_outputs/cgpa_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Additional Benefits Analysis

In [None]:
# Benefits analysis: sum each benefit indicator column over all records
benefit_columns = {
    'Internship Program': 'has_internship',
    'Stocks/ESOPs': 'has_stocks',
    'Joining Bonus': 'has_joining_bonus',
}
benefits = {label: df[column].sum() for label, column in benefit_columns.items()}

print("Companies Offering Additional Benefits:")
print("=" * 60)
total_rows = len(df)
for benefit, count in benefits.items():
    pct = count / total_rows * 100
    print(f"{benefit:20s}: {count:4d} ({pct:5.1f}%)")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
count_ax, share_ax = axes
benefit_names = list(benefits.keys())
benefit_counts = list(benefits.values())
bar_colors = ['skyblue', 'lightgreen', 'lightcoral']

# Left panel: absolute counts, with the value written above each bar
count_ax.bar(benefit_names, benefit_counts, color=bar_colors, edgecolor='black')
count_ax.set_ylabel('Number of Companies', fontsize=12)
count_ax.set_title('Additional Benefits Offered', fontsize=14, fontweight='bold')
count_ax.grid(axis='y', alpha=0.3)
for i, v in enumerate(benefit_counts):
    count_ax.text(i, v + 10, str(v), ha='center', fontweight='bold')

# Right panel: the same counts as a percentage of all records
pcts = [v / total_rows * 100 for v in benefit_counts]
share_ax.barh(benefit_names, pcts, color=bar_colors, edgecolor='black')
share_ax.set_xlabel('Percentage of Total Records (%)', fontsize=12)
share_ax.set_title('Benefits Coverage Percentage', fontsize=14, fontweight='bold')
share_ax.grid(axis='x', alpha=0.3)
for i, v in enumerate(pcts):
    share_ax.text(v + 0.5, i, f'{v:.1f}%', va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('analysis_outputs/additional_benefits.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Key Insights Summary

In [None]:
# Generate comprehensive insights
# Collects headline numbers from the earlier cells (df, df_ctc, df_cgpa,
# top_recruiters, top_paying, role_dist), prints a report and saves it as JSON.
#
# Numeric values are converted to built-in int/float: pandas reductions can
# return NumPy integer scalars (e.g. max()/sum() on an integer column), which
# json.dump() rejects with "Object of type int64 is not JSON serializable".
total_records = len(df)
ctc_rows = len(df_ctc)
cgpa_rows = len(df_cgpa)

insights = {
    'total_records': total_records,
    'years_covered': f"{df['batch_year'].min()}-{df['batch_year'].max()}",
    'unique_companies': int(df['company_name'].nunique()),
    'unique_colleges': int(df['college'].nunique()),
    'avg_ctc': float(df_ctc['total_ctc'].mean()) if ctc_rows > 0 else 0,
    'median_ctc': float(df_ctc['total_ctc'].median()) if ctc_rows > 0 else 0,
    'max_ctc': float(df_ctc['total_ctc'].max()) if ctc_rows > 0 else 0,
    # top_recruiters and top_paying are sorted ascending where they are built,
    # so their last entry is the largest.
    'top_recruiter': top_recruiters.index[-1] if len(top_recruiters) > 0 else 'N/A',
    'top_recruiter_offers': float(top_recruiters.iloc[-1]) if len(top_recruiters) > 0 else 0,
    'highest_paying': top_paying.iloc[-1]['company_name'] if len(top_paying) > 0 else 'N/A',
    'highest_avg_ctc': float(top_paying.iloc[-1]['avg_ctc']) if len(top_paying) > 0 else 0,
    # Guarded like the entries above: index[0] raises IndexError on an empty Series.
    'most_common_role': role_dist.index[0] if len(role_dist) > 0 else 'N/A',
    'avg_cgpa': float(df_cgpa['cgpa_cutoff'].mean()) if cgpa_rows > 0 else 0,
    'data_quality_ctc': ctc_rows / total_records * 100 if total_records else 0.0,
    'data_quality_cgpa': cgpa_rows / total_records * 100 if total_records else 0.0,
}

# The bullets, section emoji, currency symbols and check mark in this report
# were mojibake (UTF-8 decoded with the wrong codec); they are restored below.
print("=" * 80)
print(" " * 25 + "KEY INSIGHTS SUMMARY")
print("=" * 80)
print("\n📊 DATASET OVERVIEW:")
print(f"   • Total Records: {insights['total_records']:,}")
print(f"   • Coverage Period: {insights['years_covered']}")
print(f"   • Unique Companies: {insights['unique_companies']:,}")
print(f"   • Unique Colleges: {insights['unique_colleges']}")

print("\n💰 COMPENSATION INSIGHTS:")
print(f"   • Average CTC: ₹{insights['avg_ctc']:.2f} LPA")
print(f"   • Median CTC: ₹{insights['median_ctc']:.2f} LPA")
print(f"   • Highest Package: ₹{insights['max_ctc']:.2f} LPA")

print("\n🏆 TOP PERFORMERS:")
print(f"   • Top Recruiter: {insights['top_recruiter']} ({insights['top_recruiter_offers']:.0f} offers)")
print(f"   • Highest Paying: {insights['highest_paying']} (₹{insights['highest_avg_ctc']:.2f} LPA avg)")

print("\n📈 PLACEMENT TRENDS:")
print(f"   • Most Common Role: {insights['most_common_role']}")
print(f"   • Average CGPA Cutoff: {insights['avg_cgpa']:.2f}")

print("\n✅ DATA QUALITY:")
print(f"   • CTC Data Completeness: {insights['data_quality_ctc']:.1f}%")
print(f"   • CGPA Data Completeness: {insights['data_quality_cgpa']:.1f}%")
print("=" * 80)

# Save insights to JSON (explicit encoding so the file does not depend on the
# platform default)
with open('analysis_outputs/eda_insights.json', 'w', encoding='utf-8') as f:
    json.dump(insights, f, indent=4)
print("\n✓ Insights saved to: analysis_outputs/eda_insights.json")

## 11. Correlation Analysis

In [None]:
# Select numeric columns for correlation
numeric_cols = ['batch_year', 'total_ctc', 'base_salary', 'internship_stipend',
                'stocks_esops', 'joining_bonus', 'cgpa_cutoff', 'num_offers_total',
                'has_internship', 'has_stocks', 'has_joining_bonus']

# Keep only the candidate columns that are present in the dataset
available_cols = [col for col in numeric_cols if col in df.columns]
df_corr = df[available_cols].corr()

# Annotated heatmap of the full (symmetric) correlation matrix
plt.figure(figsize=(14, 12))
sns.heatmap(df_corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix - Numeric Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('analysis_outputs/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nTop Positive Correlations (excluding self-correlation):")
print("="*60)
# Keep only the strict upper triangle of the matrix. This drops the diagonal
# (self-correlation) and the mirrored copy of every pair, so each pair is
# listed once instead of twice. Filtering on `value < 1` would instead remove
# perfectly correlated pairs of distinct columns and keep both mirrored copies.
upper_triangle = np.triu(np.ones(df_corr.shape, dtype=bool), k=1)
corr_pairs = df_corr.where(upper_triangle).unstack().dropna()
print(corr_pairs.sort_values(ascending=False).head(10))

## Conclusion

This comprehensive EDA has provided valuable insights into:
- **Temporal patterns** in placement data across 2022-2026
- **Compensation trends** and distribution across tiers and roles
- **Top recruiting companies** and their offering patterns
- **Role-wise analysis** showing demand and compensation
- **CGPA requirements** and their relationship with compensation
- **Additional benefits** provided by companies

**Next Steps:**
1. Proceed to Temporal Trend Analysis (Notebook 02)
2. Conduct Cross-College Comparative Analysis (Notebook 03)
3. Perform Statistical Testing (Notebook 04)
4. Build Predictive Models (Notebook 05)