# 🎓 Project 2: Student Performance Analysis

Welcome to your advanced data analysis project! In this notebook, you'll analyze student academic performance using statistical methods and advanced visualizations.

## 🎯 Project Objectives
- Analyze student performance across subjects
- Perform statistical significance testing
- Identify top performers and at-risk students
- Generate educational insights and recommendations
- Create comprehensive performance reports

## 📋 Advanced Skills You'll Practice
- Statistical hypothesis testing
- Correlation analysis
- Performance benchmarking
- Educational data mining
- Advanced data visualization

Let's begin this comprehensive analysis! 🚀

In [None]:
# Import required libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy import stats

warnings.filterwarnings('ignore')

# Set visualization style: Matplotlib's default style sheet plus seaborn's
# "husl" colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Report the library versions in use.
print("üìö Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
# The version string lives on the top-level `scipy` package. `scipy.stats`
# has no `__version__` attribute, so probing it always fell back to the
# placeholder text instead of showing a version.
print(f"SciPy version: {scipy.__version__}")

## 📥 Step 1: Load and Prepare Student Data

Let's load our student performance dataset and prepare it for analysis.

In [None]:
# Load student data, falling back to seeded synthetic records when the CSV
# file is not present.
try:
    students = pd.read_csv('../datasets/student_grades.csv')
except FileNotFoundError:
    print("‚ùå Dataset not found. Creating sample data for demonstration...")
    # Build a reproducible 100-student sample. The random draws below must
    # stay in this order (Math, Science, English, Grade_Level, Gender) so the
    # seeded values do not change.
    np.random.seed(42)
    sample_size = 100
    roster_numbers = range(1, sample_size + 1)
    sample_columns = {
        'Student_ID': roster_numbers,
        'Name': [f'Student_{number}' for number in roster_numbers],
    }
    score_bounds = {'Math': (60, 100), 'Science': (55, 95), 'English': (65, 100)}
    for column_name, (low, high) in score_bounds.items():
        sample_columns[column_name] = np.random.randint(low, high, sample_size)
    sample_columns['Grade_Level'] = np.random.choice(['9th', '10th', '11th', '12th'], sample_size)
    sample_columns['Gender'] = np.random.choice(['Male', 'Female'], sample_size)
    students = pd.DataFrame(sample_columns)
    print(f"üìä Created sample dataset with {len(students)} students")
else:
    print(f"‚úÖ Successfully loaded data for {len(students)} students")

# Subject columns used by every later analysis step
subjects = ['Math', 'Science', 'English']

# Per-student total and mean across the subject columns
subject_frame = students[subjects]
students['Total_Score'] = subject_frame.sum(axis=1)
students['Average_Score'] = subject_frame.mean(axis=1)

# Grade classification function
def assign_letter_grade(score):
    """Map a numeric score to a letter grade.

    The cutoffs are inclusive lower bounds, checked from highest to lowest:
    90 -> 'A', 80 -> 'B', 70 -> 'C', 60 -> 'D'. A score that meets none of
    them yields 'F'.
    """
    cutoffs = ((90, 'A'), (80, 'B'), (70, 'C'), (60, 'D'))
    for minimum, letter in cutoffs:
        if score >= minimum:
            return letter
    return 'F'

# Derive each student's letter grade from their average score.
average_column = students['Average_Score']
students['Letter_Grade'] = average_column.map(assign_letter_grade)

# Quick look at the prepared frame: dimensions, column names, first rows.
frame_shape = students.shape
column_names = list(students.columns)
preview_rows = students.head()

print("\nüìã Dataset Overview:")
print(f"Shape: {frame_shape}")
print(f"Columns: {column_names}")
print("\nüîç First 5 students:")
print(preview_rows)

In [None]:
# Data quality: per-column missing counts and category frequencies
missing_counts = students.isnull().sum()
grade_level_counts = students['Grade_Level'].value_counts()

print("üîç Data Quality Check:")
print(f"Missing values:\n{missing_counts}")
print(f"\nGrade level distribution:\n{grade_level_counts}")
# The gender column is optional, so only report it when present.
if 'Gender' in students.columns:
    gender_counts = students['Gender'].value_counts()
    print(f"\nGender distribution:\n{gender_counts}")

# Basic statistics of the per-student average score
overall_scores = students['Average_Score']
print("\nüìä Performance Overview:")
print(f"Average score across all subjects: {overall_scores.mean():.2f}")
print(f"Highest average score: {overall_scores.max():.2f}")
print(f"Lowest average score: {overall_scores.min():.2f}")
print(f"Standard deviation: {overall_scores.std():.2f}")

## 📚 Step 2: Subject Performance Analysis

Let's analyze performance across different subjects and identify patterns.

In [None]:
# Subject analysis: descriptive statistics, best/worst subject, and the
# pairwise correlations between subject scores.
print("üìö SUBJECT PERFORMANCE ANALYSIS")
print("=" * 50)

subject_stats = students[subjects].describe().round(2)
print("Subject Statistics:")
print(subject_stats)

# Find best and worst subjects by mean score
subject_means = students[subjects].mean()
best_subject = subject_means.idxmax()
worst_subject = subject_means.idxmin()

print(f"\nüèÜ Best performing subject: {best_subject} (avg: {subject_means[best_subject]:.2f})")
print(f"üìâ Most challenging subject: {worst_subject} (avg: {subject_means[worst_subject]:.2f})")
print(f"Performance gap: {subject_means[best_subject] - subject_means[worst_subject]:.2f} points")

# Subject correlations
correlations = students[subjects].corr()
print("\nüîó Subject Correlations:")
print(correlations.round(3))

# Find strongest correlation.
# Visit each unordered subject pair exactly once instead of de-duplicating
# the flattened matrix by value: value-based de-duplication depends on exact
# float equality and on filtering out the diagonal with `< 1.0`.
pair_correlations = {}
for first_idx, first_subject in enumerate(subjects):
    for second_subject in subjects[first_idx + 1:]:
        r_value = correlations.loc[first_subject, second_subject]
        # A constant column produces NaN correlations; skip those pairs.
        if pd.notna(r_value):
            pair_correlations[(first_subject, second_subject)] = r_value

if pair_correlations:
    # Strength is the magnitude of r, so a strong negative relationship
    # outranks a weak positive one.
    strongest_pair = max(pair_correlations, key=lambda pair: abs(pair_correlations[pair]))
    strongest_corr = pair_correlations[strongest_pair]
    print(f"\nStrongest correlation: {strongest_pair[0]} & {strongest_pair[1]} (r = {strongest_corr:.3f})")
else:
    strongest_pair = None
    strongest_corr = float('nan')
    print("\nStrongest correlation: not available (fewer than two comparable subjects)")

In [None]:
# Subject performance visualizations: a 2x2 figure with mean scores, box
# plots, a correlation heatmap, and overlaid histograms.
# Relies on `students`, `subjects`, and `subject_means` from earlier cells.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Subject Performance Analysis', fontsize=16, fontweight='bold')

# 1. Subject means comparison (top-left)
subject_means.plot(kind='bar', ax=axes[0, 0], color='skyblue')
axes[0, 0].set_title('Average Scores by Subject')
axes[0, 0].set_ylabel('Average Score')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(True, alpha=0.3)

# 2. Subject distribution box plots (top-right)
students[subjects].boxplot(ax=axes[0, 1])
axes[0, 1].set_title('Score Distribution by Subject')
axes[0, 1].set_ylabel('Scores')
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Correlation heatmap (bottom-left)
# This matrix also includes 'Average_Score', so it has one more row/column
# than the subject-only `correlations` table.
correlation_matrix = students[subjects + ['Average_Score']].corr()
# The colour scale is pinned to the full [-1, 1] correlation range.
im = axes[1, 0].imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
axes[1, 0].set_xticks(range(len(correlation_matrix.columns)))
axes[1, 0].set_yticks(range(len(correlation_matrix.columns)))
axes[1, 0].set_xticklabels(correlation_matrix.columns, rotation=45)
axes[1, 0].set_yticklabels(correlation_matrix.columns)
axes[1, 0].set_title('Subject Correlation Matrix')

# Add correlation values: one text label per cell (x = column j, y = row i)
for i in range(len(correlation_matrix.columns)):
    for j in range(len(correlation_matrix.columns)):
        axes[1, 0].text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}', 
                       ha='center', va='center', color='white', fontweight='bold')

# 4. Subject performance histogram (bottom-right)
# All subjects are drawn on the same axes; alpha keeps overlaps visible.
for subject in subjects:
    axes[1, 1].hist(students[subject], alpha=0.7, label=subject, bins=15)
axes[1, 1].set_title('Score Distribution by Subject')
axes[1, 1].set_xlabel('Score')
axes[1, 1].set_ylabel('Number of Students')
axes[1, 1].legend()

plt.tight_layout()
plt.show()

print("üìö Subject analysis completed!")

## 🎓 Step 3: Grade Level Analysis

Let's analyze performance differences across grade levels and test for statistical significance.

In [None]:
# Grade level analysis: per-grade summary statistics, a one-way ANOVA on the
# average score, optional pairwise follow-up tests, and a ranking.
print("üéì GRADE LEVEL ANALYSIS")
print("=" * 50)

grade_level_stats = students.groupby('Grade_Level').agg({
    'Average_Score': ['mean', 'std', 'count'],
    'Math': 'mean',
    'Science': 'mean',
    'English': 'mean'
}).round(2)

print("Performance by Grade Level:")
print(grade_level_stats)

# Statistical significance test (ANOVA): one array of average scores per
# grade level.
grade_groups = [group['Average_Score'].values for name, group in students.groupby('Grade_Level')]
f_stat, p_value = stats.f_oneway(*grade_groups)

print(f"\nüìä ANOVA Test Results:")
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")
print(f"Significant difference between grade levels: {'Yes' if p_value < 0.05 else 'No'}")

if p_value < 0.05:
    print("\nüîç Post-hoc analysis (pairwise comparisons):")
    grade_levels = students['Grade_Level'].unique()
    # Every pair of grade levels is tested, so the per-test threshold is
    # divided by the number of pairs (Bonferroni correction). Comparing each
    # pairwise p-value against an uncorrected 0.05 inflates the chance of at
    # least one false positive across the whole family of tests.
    n_comparisons = max(len(grade_levels) * (len(grade_levels) - 1) // 2, 1)
    corrected_alpha = 0.05 / n_comparisons
    print(f"   Bonferroni-corrected threshold for {n_comparisons} comparisons: p < {corrected_alpha:.4f}")
    for i, grade1 in enumerate(grade_levels):
        for grade2 in grade_levels[i+1:]:
            group1 = students[students['Grade_Level'] == grade1]['Average_Score']
            group2 = students[students['Grade_Level'] == grade2]['Average_Score']
            t_stat, t_p = stats.ttest_ind(group1, group2)
            if t_p < corrected_alpha:
                print(f"   {grade1} vs {grade2}: Significant difference (p = {t_p:.4f})")

# Best and worst performing grade levels, highest mean first
grade_means = students.groupby('Grade_Level')['Average_Score'].mean().sort_values(ascending=False)
print(f"\nüèÜ Grade level rankings:")
for i, (grade, avg) in enumerate(grade_means.items(), 1):
    print(f"{i}. {grade}: {avg:.2f} average")

In [None]:
# Grade level visualizations: a 2x2 figure with mean score per grade level,
# score spread per grade level, per-subject means grouped by grade level, and
# the letter-grade mix within each grade level.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Grade Level Performance Analysis', fontsize=16, fontweight='bold')

# 1. Average performance by grade level (sorted highest first)
grade_level_avg = students.groupby('Grade_Level')['Average_Score'].mean().sort_values(ascending=False)
axes[0, 0].bar(grade_level_avg.index, grade_level_avg.values, color='lightgreen')
axes[0, 0].set_title('Average Performance by Grade Level')
axes[0, 0].set_ylabel('Average Score')
axes[0, 0].tick_params(axis='x', rotation=45)

# Add value labels just above each bar
for i, v in enumerate(grade_level_avg.values):
    axes[0, 0].text(i, v + 1, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

# 2. Performance distribution by grade level
grade_level_names = students['Grade_Level'].unique()
grade_data = [students[students['Grade_Level'] == grade]['Average_Score'].values
              for grade in grade_level_names]
# Label the boxes through the tick API rather than boxplot's `labels=`
# keyword, which newer Matplotlib releases deprecate. Boxes are drawn at
# positions 1..N by default.
axes[0, 1].boxplot(grade_data)
axes[0, 1].set_xticks(range(1, len(grade_level_names) + 1))
axes[0, 1].set_xticklabels(grade_level_names)
axes[0, 1].set_title('Score Distribution by Grade Level')
axes[0, 1].set_ylabel('Average Score')
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Subject performance by grade level (grouped bars)
grade_subject_means = students.groupby('Grade_Level')[subjects].mean()
x = np.arange(len(subjects))
n_grade_groups = len(grade_subject_means)
# Share 0.8 of each subject slot among the groups so bars never overlap,
# whatever the number of grade levels (four groups gives a width of 0.2).
width = 0.8 / max(n_grade_groups, 1)
colors = ['red', 'blue', 'green', 'orange']

for i, (grade, scores) in enumerate(grade_subject_means.iterrows()):
    axes[1, 0].bar(x + i*width, scores.values, width, label=grade,
                   color=colors[i % len(colors)], alpha=0.8)

axes[1, 0].set_title('Subject Performance by Grade Level')
axes[1, 0].set_xlabel('Subjects')
axes[1, 0].set_ylabel('Average Score')
# Centre each tick under its cluster of bars.
axes[1, 0].set_xticks(x + width * (n_grade_groups - 1) / 2)
axes[1, 0].set_xticklabels(subjects)
axes[1, 0].legend()

# 4. Grade distribution by grade level (row percentages)
grade_distribution = pd.crosstab(students['Grade_Level'], students['Letter_Grade'], normalize='index') * 100
# Colour by letter, not by column position: the crosstab columns are sorted
# A..F and omit any grade nobody earned, so a positional colour list painted
# 'A' red and shifted colours whenever a grade was missing.
letter_grade_colors = {'A': 'green', 'B': 'lightgreen', 'C': 'yellow', 'D': 'orange', 'F': 'red'}
grade_distribution.plot(kind='bar', stacked=True, ax=axes[1, 1],
                        color=[letter_grade_colors[letter] for letter in grade_distribution.columns])
axes[1, 1].set_title('Letter Grade Distribution by Grade Level')
axes[1, 1].set_xlabel('Grade Level')
axes[1, 1].set_ylabel('Percentage of Students')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].legend(title='Letter Grade', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

print("üéì Grade level analysis completed!")

## 👥 Step 4: Gender Analysis (if available)

Let's analyze performance differences by gender and test for statistical significance.

In [None]:
# Gender analysis (if gender data is available): summary statistics, an
# independent-samples t-test on the average score, Cohen's d, and per-subject
# t-tests.
if 'Gender' in students.columns:
    print("üë• GENDER ANALYSIS")
    print("=" * 50)

    gender_stats = students.groupby('Gender').agg({
        'Average_Score': ['mean', 'std', 'count'],
        'Math': 'mean',
        'Science': 'mean',
        'English': 'mean'
    }).round(2)

    print("Performance by Gender:")
    print(gender_stats)

    # T-test for gender differences
    male_scores = students[students['Gender'] == 'Male']['Average_Score']
    female_scores = students[students['Gender'] == 'Female']['Average_Score']

    # Store the result under gender-specific names. Binding it to `p_value`
    # overwrote the grade-level ANOVA result, which the insights section
    # later reports as the grade-level p-value.
    gender_t_stat, gender_p_value = stats.ttest_ind(male_scores, female_scores)

    print(f"\nüìä T-test Results:")
    print(f"T-statistic: {gender_t_stat:.4f}")
    print(f"P-value: {gender_p_value:.4f}")
    print(f"Significant gender difference: {'Yes' if gender_p_value < 0.05 else 'No'}")

    # Effect size (Cohen's d): mean difference divided by the pooled standard
    # deviation, which weights each group's variance by its degrees of freedom.
    pooled_std = np.sqrt(((len(male_scores) - 1) * male_scores.var() +
                         (len(female_scores) - 1) * female_scores.var()) /
                        (len(male_scores) + len(female_scores) - 2))
    cohens_d = (male_scores.mean() - female_scores.mean()) / pooled_std

    print(f"Effect size (Cohen's d): {cohens_d:.4f}")
    # Cohen's conventional benchmarks are 0.2 (small), 0.5 (medium) and
    # 0.8 (large); anything below 0.2 is negligible. The earlier labels were
    # shifted one step, e.g. calling |d| < 0.2 "Small".
    d_magnitude = abs(cohens_d)
    if d_magnitude < 0.2:
        effect_size = "Negligible"
    elif d_magnitude < 0.5:
        effect_size = "Small"
    elif d_magnitude < 0.8:
        effect_size = "Medium"
    else:
        effect_size = "Large"
    print(f"Effect size interpretation: {effect_size}")

    # Subject-specific gender analysis
    print(f"\nüìö Subject-specific gender differences:")
    for subject in subjects:
        male_subject = students[students['Gender'] == 'Male'][subject]
        female_subject = students[students['Gender'] == 'Female'][subject]
        t_stat_subj, p_val_subj = stats.ttest_ind(male_subject, female_subject)

        print(f"{subject}: Male avg = {male_subject.mean():.2f}, Female avg = {female_subject.mean():.2f}")
        print(f"   Significant difference: {'Yes' if p_val_subj < 0.05 else 'No'} (p = {p_val_subj:.4f})")
else:
    print("üë• Gender data not available in this dataset")

In [None]:
# Gender visualization (if available): a 2x2 figure with the mean score per
# gender, per-subject means by gender, score histograms, and the letter-grade
# mix per gender.
if 'Gender' in students.columns:
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Gender Performance Analysis', fontsize=16, fontweight='bold')

    # One colour per gender, shared by every panel. The grouped index is
    # sorted alphabetically, so a positional colour list drew Female in
    # lightblue here while the other panels use lightblue for Male.
    gender_colors = {'Male': 'lightblue', 'Female': 'lightpink'}

    # 1. Average performance by gender
    gender_avg = students.groupby('Gender')['Average_Score'].mean()
    axes[0, 0].bar(gender_avg.index, gender_avg.values,
                   color=[gender_colors.get(label, 'lightgray') for label in gender_avg.index])
    axes[0, 0].set_title('Average Performance by Gender')
    axes[0, 0].set_ylabel('Average Score')

    # Add value labels just above each bar
    for i, v in enumerate(gender_avg.values):
        axes[0, 0].text(i, v + 1, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

    # 2. Subject performance by gender (side-by-side bars per subject)
    gender_subject_means = students.groupby('Gender')[subjects].mean()
    x = np.arange(len(subjects))
    width = 0.35

    axes[0, 1].bar(x - width/2, gender_subject_means.loc['Male'], width,
                   label='Male', alpha=0.8, color=gender_colors['Male'])
    axes[0, 1].bar(x + width/2, gender_subject_means.loc['Female'], width,
                   label='Female', alpha=0.8, color=gender_colors['Female'])
    axes[0, 1].set_title('Subject Performance by Gender')
    axes[0, 1].set_xlabel('Subjects')
    axes[0, 1].set_ylabel('Average Score')
    axes[0, 1].set_xticks(x)
    axes[0, 1].set_xticklabels(subjects)
    axes[0, 1].legend()

    # 3. Score distribution by gender (overlaid histograms)
    male_scores = students[students['Gender'] == 'Male']['Average_Score']
    female_scores = students[students['Gender'] == 'Female']['Average_Score']

    axes[1, 0].hist(male_scores, alpha=0.7, label='Male', bins=15, color=gender_colors['Male'])
    axes[1, 0].hist(female_scores, alpha=0.7, label='Female', bins=15, color=gender_colors['Female'])
    axes[1, 0].set_title('Score Distribution by Gender')
    axes[1, 0].set_xlabel('Average Score')
    axes[1, 0].set_ylabel('Number of Students')
    axes[1, 0].legend()

    # 4. Grade distribution by gender (row percentages)
    gender_grade_dist = pd.crosstab(students['Gender'], students['Letter_Grade'], normalize='index') * 100
    # Colour by letter, not by column position: the columns are sorted A..F
    # and omit any grade nobody earned, so a positional list painted 'A' red
    # and shifted colours whenever a grade was missing.
    letter_grade_colors = {'A': 'green', 'B': 'lightgreen', 'C': 'yellow', 'D': 'orange', 'F': 'red'}
    gender_grade_dist.plot(kind='bar', ax=axes[1, 1],
                           color=[letter_grade_colors[letter] for letter in gender_grade_dist.columns])
    axes[1, 1].set_title('Letter Grade Distribution by Gender')
    axes[1, 1].set_xlabel('Gender')
    axes[1, 1].set_ylabel('Percentage of Students')
    axes[1, 1].tick_params(axis='x', rotation=0)
    axes[1, 1].legend(title='Letter Grade', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

    print("üë• Gender analysis completed!")
else:
    print("Skipping gender visualization - data not available")

## 🏆 Step 5: Identify Top Performers and At-Risk Students

Let's identify students who excel and those who need additional support.

In [None]:
# Top performers: overall top ten, top three per subject, and A-grade stats
print("üèÜ TOP PERFORMERS ANALYSIS")
print("=" * 50)

# Ten highest average scores, with identifying and per-subject columns
overall_columns = ['Name', 'Average_Score', 'Grade_Level', 'Letter_Grade'] + subjects
ranked_overall = students.nlargest(10, 'Average_Score')
top_students = ranked_overall[overall_columns]
print("Top 10 Students Overall:")
print(top_students.to_string(index=False))

# Three highest scorers in each subject
print("\nüìö Top Performers by Subject:")
for subject in subjects:
    subject_columns = ['Name', subject, 'Grade_Level']
    top_in_subject = students.nlargest(3, subject)[subject_columns]
    print(f"\n{subject} Top 3:")
    rank = 0
    for _, student in top_in_subject.iterrows():
        rank += 1
        print(f"  {rank}. {student['Name']}: {student[subject]} ({student['Grade_Level']})")

# Students whose letter grade is 'A'
is_a_grade = students['Letter_Grade'] == 'A'
excellent_students = students[is_a_grade]
a_count = len(excellent_students)
print("\n‚≠ê Excellence Statistics:")
print(f"Students with A grade: {a_count} ({a_count/len(students)*100:.1f}%)")
if a_count > 0:
    print(f"Average score of A students: {excellent_students['Average_Score'].mean():.2f}")
    print("Grade level distribution of A students:")
    print(excellent_students['Grade_Level'].value_counts())

In [None]:
# At-risk students: everyone averaging below 70, plus per-subject and
# per-grade-level breakdowns and the students with the most uneven scores.
print("üö® AT-RISK STUDENTS ANALYSIS")
print("=" * 50)

# Students with average below 70 (C grade threshold)
report_columns = ['Name', 'Average_Score', 'Grade_Level'] + subjects
below_threshold = students['Average_Score'] < 70
at_risk = students[below_threshold][report_columns]
class_size = len(students)

if at_risk.empty:
    print("üéâ Great news! No students are currently at risk (all above 70 average).")
else:
    at_risk_count = len(at_risk)
    print(f"Students needing support: {at_risk_count} ({at_risk_count/class_size*100:.1f}%)")
    print("\nAt-risk students details:")
    print(at_risk.to_string(index=False))

    # Per subject: how many students score below 70 in that subject alone
    print("\nüìö Subject-specific support needed:")
    for subject in subjects:
        weak_in_subject = students[students[subject] < 70]
        weak_count = len(weak_in_subject)
        if weak_count == 0:
            print(f"{subject}: No students below 70")
            continue
        weak_scores = weak_in_subject[subject]
        print(f"{subject}: {weak_count} students ({weak_count/class_size*100:.1f}%)")
        print(f"   Average score: {weak_scores.mean():.2f}")
        print(f"   Lowest score: {weak_scores.min():.2f}")

    # Share of each grade level's students that is at risk
    print("\nüéì At-risk students by grade level:")
    at_risk_by_grade = at_risk['Grade_Level'].value_counts()
    for grade, count in at_risk_by_grade.items():
        same_grade = students['Grade_Level'] == grade
        total_in_grade = len(students[same_grade])
        percentage = count / total_in_grade * 100
        print(f"{grade}: {count}/{total_in_grade} students ({percentage:.1f}%)")

# Identify students with high variance (inconsistent performance):
# variance is taken across each student's subject scores.
students['Score_Variance'] = students[subjects].var(axis=1)
variance_columns = ['Name', 'Score_Variance'] + subjects + ['Average_Score']
most_variable = students.nlargest(5, 'Score_Variance')
inconsistent_students = most_variable[variance_columns]

print("\nüìä Students with inconsistent performance (high variance):")
print(inconsistent_students.to_string(index=False))
print("\nNote: These students show large differences between subjects and may benefit from targeted support.")

## 📊 Step 6: Grade Distribution Analysis

Let's analyze the overall grade distribution and performance patterns.

In [None]:
# Grade distribution: letter-grade counts and shares, the breakdown by grade
# level, threshold benchmarks, and class-wide summary metrics.
print("üìä GRADE DISTRIBUTION ANALYSIS")
print("=" * 50)

# Counts per letter grade in alphabetical order, and their percentage share
grade_dist = students['Letter_Grade'].value_counts().sort_index()
grade_percentages = (grade_dist / len(students) * 100).round(2)

print("Overall Grade Distribution:")
for grade in grade_dist.index:
    count = grade_dist[grade]
    percentage = grade_percentages[grade]
    print(f"Grade {grade}: {count} students ({percentage}%)")

# Grade distribution by grade level (only meaningful with several levels)
distinct_levels = students['Grade_Level'].unique()
if len(distinct_levels) > 1:
    print("\nüéì Grade Distribution by Grade Level:")
    level_by_letter = pd.crosstab(students['Grade_Level'], students['Letter_Grade'], normalize='index')
    grade_level_dist = level_by_letter * 100
    print(grade_level_dist.round(2))

# Performance benchmarks: number and share of students at or above each cutoff
print("\nüìà Performance Benchmarks:")
benchmarks = (
    ("Students meeting/exceeding expectations (C+ or better)", 70),
    ("Students exceeding expectations (B+ or better)", 80),
    ("Students showing excellence (A grade)", 90),
)
for description, cutoff in benchmarks:
    qualifying = len(students[students['Average_Score'] >= cutoff])
    print(f"{description}: {qualifying} ({qualifying/len(students)*100:.1f}%)")

# Calculate class performance metrics
average_scores = students['Average_Score']
class_average = average_scores.mean()
class_median = average_scores.median()
class_std = average_scores.std()

# A smaller spread is reported as higher consistency.
if class_std < 10:
    consistency_label = 'High'
elif class_std < 15:
    consistency_label = 'Moderate'
else:
    consistency_label = 'Low'

print("\nüìä Class Performance Metrics:")
print(f"Class average: {class_average:.2f}")
print(f"Class median: {class_median:.2f}")
print(f"Standard deviation: {class_std:.2f}")
print(f"Performance consistency: {consistency_label}")

In [None]:
# Comprehensive grade distribution visualization: a 2x3 figure.
# Relies on `grade_dist`, `grade_percentages`, `class_average`,
# `class_median`, and `subject_means` computed in earlier cells.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Comprehensive Grade Distribution Analysis', fontsize=16, fontweight='bold')

# 1. Overall grade distribution
grade_dist.plot(kind='bar', ax=axes[0, 0], color='skyblue')
axes[0, 0].set_title('Overall Grade Distribution')
axes[0, 0].set_xlabel('Letter Grade')
axes[0, 0].set_ylabel('Number of Students')
axes[0, 0].tick_params(axis='x', rotation=0)

# Add percentage labels just above each bar
for i, (grade, count) in enumerate(grade_dist.items()):
    percentage = grade_percentages[grade]
    axes[0, 0].text(i, count + 0.5, f'{percentage}%', ha='center', va='bottom', fontweight='bold')

# 2. Score distribution histogram with mean and median markers
axes[0, 1].hist(students['Average_Score'], bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
axes[0, 1].axvline(class_average, color='red', linestyle='--', label=f'Mean: {class_average:.1f}')
axes[0, 1].axvline(class_median, color='blue', linestyle='--', label=f'Median: {class_median:.1f}')
axes[0, 1].set_title('Score Distribution')
axes[0, 1].set_xlabel('Average Score')
axes[0, 1].set_ylabel('Number of Students')
axes[0, 1].legend()

# 3. Performance by grade level (sorted highest first)
grade_level_avg = students.groupby('Grade_Level')['Average_Score'].mean().sort_values(ascending=False)
axes[0, 2].bar(grade_level_avg.index, grade_level_avg.values, color='orange')
axes[0, 2].set_title('Average Performance by Grade Level')
axes[0, 2].set_ylabel('Average Score')
axes[0, 2].tick_params(axis='x', rotation=45)

# 4. Subject performance comparison
subject_means.plot(kind='bar', ax=axes[1, 0], color='lightcoral')
axes[1, 0].set_title('Average Performance by Subject')
axes[1, 0].set_ylabel('Average Score')
axes[1, 0].tick_params(axis='x', rotation=45)

# 5. Top vs Bottom performers
# Rank every student by average score first. Slicing the unsorted frame by
# row position selects students by file order, which is not the middle 80%
# by performance.
ranked_scores = students['Average_Score'].sort_values(ascending=False)
# Keep at least one student in each tail so small classes do not yield an
# empty group (and a NaN mean).
decile_size = max(1, int(len(ranked_scores) * 0.1))
top_10_pct = ranked_scores.iloc[:decile_size].mean()
bottom_10_pct = ranked_scores.iloc[-decile_size:].mean()
# Everything between the two tails, so the three groups partition the class.
middle_80_pct = ranked_scores.iloc[decile_size:len(ranked_scores) - decile_size].mean()

performance_groups = ['Top 10%', 'Middle 80%', 'Bottom 10%']
performance_scores = [top_10_pct, middle_80_pct, bottom_10_pct]
colors = ['green', 'yellow', 'red']

axes[1, 1].bar(performance_groups, performance_scores, color=colors, alpha=0.7)
axes[1, 1].set_title('Performance by Student Groups')
axes[1, 1].set_ylabel('Average Score')

# Add value labels just above each bar
for i, v in enumerate(performance_scores):
    axes[1, 1].text(i, v + 1, f'{v:.1f}', ha='center', va='bottom', fontweight='bold')

# 6. Grade distribution pie chart
axes[1, 2].pie(grade_dist.values, labels=grade_dist.index, autopct='%1.1f%%', startangle=90)
axes[1, 2].set_title('Grade Distribution (Pie Chart)')

plt.tight_layout()
plt.show()

print("üìä Grade distribution analysis completed!")

## 💡 Step 7: Generate Educational Insights & Recommendations

Based on our comprehensive analysis, let's generate actionable educational insights.

In [None]:
# Generate comprehensive educational insights.
# Relies on `best_subject`, `worst_subject`, `subject_means`, and
# `correlations` computed in the subject analysis cell.
print("üí° EDUCATIONAL INSIGHTS & RECOMMENDATIONS")
print("=" * 60)

# Subject-based insights
print("üìö SUBJECT INSIGHTS:")
print(f"1. {best_subject} is the strongest subject with an average of {subject_means[best_subject]:.2f}")
print(f"2. {worst_subject} needs attention with an average of {subject_means[worst_subject]:.2f}")
print(f"3. Subject performance gap: {subject_means[best_subject] - subject_means[worst_subject]:.2f} points")

# Find the most correlated subjects.
# Compare magnitudes so that negative relationships are considered too;
# starting from 0 and requiring a larger signed value skipped every pair when
# all correlations were negative. NaN values never win the comparison.
max_corr = 0
corr_subjects = None
for i in range(len(subjects)):
    for j in range(i+1, len(subjects)):
        corr_val = correlations.loc[subjects[i], subjects[j]]
        if abs(corr_val) > abs(max_corr):
            max_corr = corr_val
            corr_subjects = (subjects[i], subjects[j])

if corr_subjects:
    # Only describe the pair as highly correlated when |r| reaches 0.5, the
    # conventional benchmark for a large correlation. The previous wording
    # made that claim for any positive r, however close to zero.
    if abs(max_corr) >= 0.5:
        print(f"4. {corr_subjects[0]} and {corr_subjects[1]} are highly correlated (r = {max_corr:.3f})")
        print(f"   ‚Üí Integrated teaching approach recommended")
    else:
        print(f"4. {corr_subjects[0]} and {corr_subjects[1]} have the strongest relationship, but it is weak (r = {max_corr:.3f})")
        print(f"   ‚Üí Subject results are largely independent; plan support per subject")

# Grade level insights.
# `grade_level_avg` is sorted highest-first where it is computed, so the
# first and last index entries are the best and worst grade levels.
print(f"\nüéì GRADE LEVEL INSIGHTS:")
best_grade_level = grade_level_avg.index[0]
worst_grade_level = grade_level_avg.index[-1]
print(f"1. {best_grade_level} grade shows highest performance ({grade_level_avg[best_grade_level]:.2f} average)")
print(f"2. {worst_grade_level} grade needs additional support ({grade_level_avg[worst_grade_level]:.2f} average)")

# Recompute the grade-level ANOVA here instead of reading the notebook-level
# `p_value`: that name is shared, and any cell run in between may have
# rebound it to the result of a different test.
anova_groups = [group['Average_Score'].values for _, group in students.groupby('Grade_Level')]
_, anova_p_value = stats.f_oneway(*anova_groups)

if anova_p_value < 0.05:
    print(f"3. Statistically significant differences exist between grade levels (p = {anova_p_value:.4f})")
    print(f"   ‚Üí Grade-specific interventions recommended")
else:
    print(f"3. No significant differences between grade levels (p = {anova_p_value:.4f})")
    print(f"   ‚Üí Consistent teaching standards across grades")

# Performance distribution insights: shares of A-grade, proficient (70+),
# and at-risk (<70) students, plus a spread-based consistency label.
print("\nüìä PERFORMANCE DISTRIBUTION INSIGHTS:")
student_total = len(students)
has_a_grade = students['Letter_Grade'] == 'A'
is_proficient = students['Average_Score'] >= 70
is_below_70 = students['Average_Score'] < 70

excellence_rate = len(students[has_a_grade]) / student_total * 100
proficiency_rate = len(students[is_proficient]) / student_total * 100
at_risk_rate = len(students[is_below_70]) / student_total * 100

print(f"1. Excellence rate (A grades): {excellence_rate:.1f}%")
print(f"2. Proficiency rate (70+ average): {proficiency_rate:.1f}%")
print(f"3. At-risk rate (<70 average): {at_risk_rate:.1f}%")

# Performance consistency, judged from the class standard deviation
consistency = (
    "High - students perform similarly" if class_std < 10
    else "Moderate - some performance variation" if class_std < 15
    else "Low - significant performance gaps exist"
)

print(f"4. Class performance consistency: {consistency} (œÉ = {class_std:.2f})")

# Strategic recommendations, printed as three numbered lists.
# Relies on `worst_subject`, `best_subject`, `at_risk_rate`, and `at_risk`
# from earlier cells.
print("\nüéØ STRATEGIC RECOMMENDATIONS:")

print("\nüìà ACADEMIC IMPROVEMENT:")
academic_actions = [
    f"1. Focus remediation efforts on {worst_subject} - consider additional practice sessions",
    f"2. Leverage {best_subject} success strategies for other subjects",
    "3. Implement peer tutoring programs pairing strong and struggling students",
]
# The fourth item's urgency depends on the share of at-risk students.
if at_risk_rate > 20:
    academic_actions.append(f"4. HIGH PRIORITY: {at_risk_rate:.1f}% of students need immediate intervention")
elif at_risk_rate > 10:
    academic_actions.append(f"4. MODERATE PRIORITY: {at_risk_rate:.1f}% of students need targeted support")
else:
    academic_actions.append(f"4. LOW PRIORITY: Only {at_risk_rate:.1f}% of students need additional support")
for action_line in academic_actions:
    print(action_line)

print("\nüë• INDIVIDUALIZED SUPPORT:")
support_actions = []
if len(at_risk) > 0:
    support_actions.append(f"1. Create individualized learning plans for {len(at_risk)} at-risk students")
    support_actions.append("2. Implement weekly progress monitoring for struggling students")
else:
    support_actions.append("1. Maintain current support systems - no students currently at risk")
support_actions.append("3. Establish mentorship programs for top performers to help peers")
support_actions.append("4. Consider advanced placement opportunities for high achievers")
for action_line in support_actions:
    print(action_line)

print("\nüìä MONITORING & ASSESSMENT:")
monitoring_actions = (
    "1. Implement monthly progress assessments to track improvement",
    "2. Create subject-specific intervention programs based on correlation analysis",
    "3. Develop early warning systems for identifying at-risk students",
    "4. Regular parent-teacher conferences for students below proficiency",
)
for action_line in monitoring_actions:
    print(action_line)

# Calculate overall educational health score on a 0-100 scale.
# Weights: proficiency 40, excellence 30, low at-risk share 20, consistency 10.
# Relies on `proficiency_rate`, `excellence_rate`, `at_risk_rate`, and
# `class_std` from earlier cells.
# The spread is capped at 20 points and then inverted, so a standard
# deviation of 0 earns the full 10 points and 20 or more earns none. Without
# the inversion a wider spread of scores raised the health score.
consistency_points = (1 - min(class_std, 20) / 20) * 10
health_score = (
    (proficiency_rate / 100 * 40) +  # Proficiency rate weight
    (excellence_rate / 100 * 30) +   # Excellence rate weight
    ((100 - at_risk_rate) / 100 * 20) +  # Low at-risk rate weight
    consistency_points               # Consistency weight (inverted spread)
)

print(f"\nüìä OVERALL EDUCATIONAL HEALTH SCORE: {health_score:.1f}/100")
if health_score >= 80:
    print("üü¢ Excellent educational outcomes - maintain current practices")
elif health_score >= 60:
    print("üü° Good educational outcomes - focus on identified improvement areas")
else:
    print("üî¥ Educational outcomes need improvement - implement recommendations urgently")

print(f"\nüéâ Analysis complete! Use these insights to improve student outcomes.")

## 📋 Step 8: Executive Education Dashboard

Let's create a final comprehensive dashboard for educational stakeholders.

In [None]:
# Create comprehensive educational dashboard: one figure on a 4x4 grid with
# a row of key-metric tiles followed by six charts.
# Relies on values computed in earlier cells: `class_average`,
# `excellence_rate`, `at_risk_rate`, `grade_dist`, `subject_means`,
# `grade_level_avg`, `top_10_pct`, `middle_80_pct`, and `bottom_10_pct`.
fig = plt.figure(figsize=(20, 16))
fig.suptitle('üìä COMPREHENSIVE STUDENT PERFORMANCE DASHBOARD', fontsize=20, fontweight='bold', y=0.98)

# Create a grid layout
gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

# Key metrics (top row): one tile per entry, in dictionary order
metrics = {
    'Total Students': len(students),
    'Class Average': f"{class_average:.1f}",
    'Excellence Rate': f"{excellence_rate:.1f}%",
    'At-Risk Rate': f"{at_risk_rate:.1f}%"
}

# One background colour per tile, matched to `metrics` by position
colors = ['lightblue', 'lightgreen', 'gold', 'lightcoral']
for i, (metric, value) in enumerate(metrics.items()):
    ax = fig.add_subplot(gs[0, i])
    # The tile is a centred text box on an otherwise hidden axes.
    ax.text(0.5, 0.5, f"{metric}\n{value}", ha='center', va='center', 
            fontsize=14, fontweight='bold',
            bbox=dict(boxstyle='round,pad=0.5', facecolor=colors[i], alpha=0.8))
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.axis('off')

# Grade distribution (second row, left)
ax1 = fig.add_subplot(gs[1, :2])
grade_dist.plot(kind='bar', ax=ax1, color='skyblue')
ax1.set_title('Grade Distribution', fontweight='bold', fontsize=12)
ax1.set_xlabel('Letter Grade')
ax1.set_ylabel('Number of Students')
ax1.tick_params(axis='x', rotation=0)

# Subject performance (second row, right)
ax2 = fig.add_subplot(gs[1, 2:])
subject_means.plot(kind='bar', ax=ax2, color='lightgreen')
ax2.set_title('Subject Performance', fontweight='bold', fontsize=12)
ax2.set_ylabel('Average Score')
ax2.tick_params(axis='x', rotation=45)

# Score distribution (third row, left), with the class mean marked
ax3 = fig.add_subplot(gs[2, :2])
ax3.hist(students['Average_Score'], bins=20, alpha=0.7, color='orange', edgecolor='black')
ax3.axvline(class_average, color='red', linestyle='--', linewidth=2, label=f'Mean: {class_average:.1f}')
ax3.set_title('Score Distribution', fontweight='bold', fontsize=12)
ax3.set_xlabel('Average Score')
ax3.set_ylabel('Number of Students')
ax3.legend()

# Performance by grade level (third row, right)
ax4 = fig.add_subplot(gs[2, 2:])
grade_level_avg.plot(kind='bar', ax=ax4, color='lightcoral')
ax4.set_title('Performance by Grade Level', fontweight='bold', fontsize=12)
ax4.set_ylabel('Average Score')
ax4.tick_params(axis='x', rotation=45)

# Subject correlation heatmap (fourth row, left)
ax5 = fig.add_subplot(gs[3, :2])
# Subject-only matrix, recomputed here; colour scale pinned to [-1, 1]
correlation_matrix = students[subjects].corr()
im = ax5.imshow(correlation_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
ax5.set_xticks(range(len(subjects)))
ax5.set_yticks(range(len(subjects)))
ax5.set_xticklabels(subjects)
ax5.set_yticklabels(subjects)
ax5.set_title('Subject Correlations', fontweight='bold', fontsize=12)

# Add correlation values: one text label per cell (x = column j, y = row i)
for i in range(len(subjects)):
    for j in range(len(subjects)):
        ax5.text(j, i, f'{correlation_matrix.iloc[i, j]:.2f}', 
                ha='center', va='center', color='white', fontweight='bold')

# Performance groups comparison (fourth row, right)
ax6 = fig.add_subplot(gs[3, 2:])
performance_groups = ['Top 10%', 'Middle 80%', 'Bottom 10%']
performance_scores = [top_10_pct, middle_80_pct, bottom_10_pct]
colors_perf = ['green', 'yellow', 'red']

bars = ax6.bar(performance_groups, performance_scores, color=colors_perf, alpha=0.7)
ax6.set_title('Performance by Student Groups', fontweight='bold', fontsize=12)
ax6.set_ylabel('Average Score')

# Add value labels, centred horizontally on each bar and placed 1 unit above it
for bar, score in zip(bars, performance_scores):
    ax6.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, 
            f'{score:.1f}', ha='center', va='bottom', fontweight='bold')

plt.show()

print("üìä Executive education dashboard created successfully!")
print("\n" + "="*70)
print("üéâ STUDENT PERFORMANCE ANALYSIS PROJECT COMPLETED!")
print("="*70)
print("\n‚úÖ What you accomplished:")
print("   ‚Ä¢ Comprehensive statistical analysis of student performance")
print("   ‚Ä¢ Subject-wise performance evaluation")
print("   ‚Ä¢ Grade level and demographic analysis")
print("   ‚Ä¢ Statistical significance testing (ANOVA, t-tests)")
print("   ‚Ä¢ Identification of top performers and at-risk students")
print("   ‚Ä¢ Correlation analysis between subjects")
print("   ‚Ä¢ Educational insights and recommendations")
print("   ‚Ä¢ Executive dashboard for stakeholders")
print("\nüöÄ Advanced skills mastered:")
print("   ‚Ä¢ Hypothesis testing and statistical inference")
print("   ‚Ä¢ Educational data mining techniques")
print("   ‚Ä¢ Performance benchmarking and analysis")
print("   ‚Ä¢ Advanced data visualization")
print("   ‚Ä¢ Business intelligence for education")
print("\nüéØ Next challenges:")
print("   ‚Ä¢ Try analyzing your own datasets")
print("   ‚Ä¢ Explore machine learning for predictive analytics")
print("   ‚Ä¢ Build interactive dashboards with Plotly")
print("   ‚Ä¢ Create automated reporting systems")
print("\nCongratulations! You're now ready for advanced data science projects! üéì‚ú®")