# In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.io as pio

# Read the aggregated school quality ratings into the DataFrame `df`.
csv_path = "data/aggregated/school_quality_ratings.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# --- Visualization 1: Bubble Plot of ZIP Codes by Average Quality Score (or Enhanced Heatmap Fallback) ---
# Mean Quality Score per ZIP code. ZIPs are cast to strings so they can be
# looked up against the string keys of `zip_coords` below.
zip_scores = df.groupby('ZIP Code')['Quality Score'].mean().reset_index()
zip_scores['ZIP Code'] = zip_scores['ZIP Code'].astype(str)

# Approximate coordinates for Atlanta ZIP codes (manually estimated for key ZIPs in the CSV)
# These are rough [lat, lon] centroids for visualization, not precise
zip_coords = {
    '30288': [33.645, -84.302], '30344': [33.688, -84.444], '30319': [33.885, -84.336], '30305': [33.837, -84.382],
    '30331': [33.722, -84.511], '30315': [33.703, -84.383], '30316': [33.722, -84.341], '30311': [33.724, -84.451],
    '30337': [33.652, -84.444], '30002': [33.774, -84.260], '30318': [33.786, -84.425], '30313': [33.765, -84.395],
    '30034': [33.698, -84.247], '30310': [33.727, -84.418], '30317': [33.749, -84.316], '30030': [33.771, -84.295],
    '30033': [33.812, -84.281], '30324': [33.816, -84.356], '30314': [33.754, -84.415], '30354': [33.672, -84.391],
    '30312': [33.746, -84.377], '30307': [33.771, -84.336], '30339': [33.875, -84.467], '30327': [33.862, -84.419],
    '30308': [33.771, -84.381], '30342': [33.884, -84.378], '30306': [33.786, -84.351], '30329': [33.823, -84.325],
    '30345': [33.850, -84.286], '30032': [33.736, -84.260], '30309': [33.794, -84.389], '30079': [33.810, -84.219],
    '30303': [33.752, -84.391]
}
# Attach coordinates; ZIPs that are not in `zip_coords` get missing values
# and are removed, so only ZIPs with a known position remain.
# NOTE(review): the heatmap fallback below also uses this filtered frame, so
# ZIPs without coordinates are absent from the fallback too - confirm intended.
zip_scores['lat'] = zip_scores['ZIP Code'].map(lambda x: zip_coords.get(x, [None, None])[0])
zip_scores['lon'] = zip_scores['ZIP Code'].map(lambda x: zip_coords.get(x, [None, None])[1])
zip_scores = zip_scores.dropna(subset=['lat', 'lon'])  # Drop any ZIPs without coords

# Attempt bubble plot with Plotly
try:
    fig = px.scatter(
        zip_scores,
        x='lon', y='lat',
        size='Quality Score',
        color='Quality Score',
        color_continuous_scale='YlOrRd',
        hover_data=['ZIP Code', 'Quality Score'],
        size_max=20,
        opacity=0.7,
        title='Average School Quality Score by ZIP Code in Atlanta Area'
    )
    fig.update_layout(
        xaxis_title='Longitude', yaxis_title='Latitude',
        margin={'r': 0, 't': 50, 'l': 0, 'b': 0},
        showlegend=True,
        width=800, height=600
    )
    # Fix the axis ranges to a lon/lat window so the scatter reads like a map
    fig.update_xaxes(range=[-84.6, -84.1])
    fig.update_yaxes(range=[33.6, 34.0])
    # Save as PNG
    pio.write_image(fig, file='visualizations/zip_code_quality_score_bubble.png', format='png')
except Exception as e:
    # Deliberate best-effort: any failure while building or exporting the
    # Plotly figure is reported and replaced by a static seaborn heatmap.
    print(f"Bubble plot failed: {e}. Using enhanced heatmap fallback.")
    # Enhanced Heatmap: ZIP codes ordered by score (highest first), improved aesthetics
    zip_scores_sorted = zip_scores.sort_values('Quality Score', ascending=False)
    # Build the single-column matrix with set_index rather than pivot_table:
    # pivot_table sorts its result by the index keys by default, which would
    # discard the score ordering established above. ZIP codes are already
    # unique after the groupby, so no aggregation is needed.
    heatmap_data = zip_scores_sorted.set_index('ZIP Code')[['Quality Score']]
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.heatmap(
        heatmap_data,
        cmap='YlOrRd', annot=True, fmt='.1f', cbar_kws={'label': 'Average Quality Score'},
        annot_kws={'size': 12}, linewidths=0.5, ax=ax
    )
    ax.set_title('Average School Quality Score by ZIP Code', fontsize=16, pad=20)
    # ZIP codes are the heatmap rows, so the label belongs on the y-axis; the
    # single column is already identified by its 'Quality Score' tick label.
    ax.set_xlabel('', fontsize=12)
    ax.set_ylabel('ZIP Code', fontsize=12)
    plt.tight_layout()
    plt.savefig('visualizations/zip_code_quality_score_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

# --- Visualization 2: Bar Plot of Letter Grade Distribution ---
# Number of schools per letter grade, ordered by grade label.
grade_counts = df['Letter Grade'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=grade_counts.index,
    y=grade_counts.values,
    hue=grade_counts.index,  # hue mirrors x so every bar takes its own palette color
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Distribution of School Letter Grades', fontsize=14)
ax.set_xlabel('Letter Grade', fontsize=12)
ax.set_ylabel('Number of Schools', fontsize=12)
fig.tight_layout()
fig.savefig('visualizations/letter_grade_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# --- Visualization 3: Scatter Plot of Academic Score vs. Outcomes Score ---
# One point per row of `df`: color encodes the letter grade, marker size the
# quality score.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='Academic Score',
    y='Outcomes Score',
    hue='Letter Grade',
    size='Quality Score',
    sizes=(20, 200),
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Academic Score vs. Outcomes Score by Letter Grade', fontsize=14)
ax.set_xlabel('Academic Score', fontsize=12)
ax.set_ylabel('Outcomes Score', fontsize=12)
# Anchor the legend outside the axes, to the right of the plot area.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('visualizations/academic_vs_outcomes_scatter.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# --- Visualization 4: Box Plot of Quality Scores by ZIP Code ---
# "Top 10" here means the ten ZIP codes with the most rows in `df`
# (value_counts orders by descending frequency).
top_zips = df['ZIP Code'].value_counts().index[:10]
df_top_zips = df.loc[df['ZIP Code'].isin(top_zips)]
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    data=df_top_zips,
    x='ZIP Code',
    y='Quality Score',
    hue='ZIP Code',  # hue mirrors x so every box takes its own palette color
    palette='Set2',
    legend=False,
    ax=ax,
)
ax.set_title('Quality Score Distribution for Top 10 ZIP Codes', fontsize=14)
ax.set_xlabel('ZIP Code', fontsize=12)
ax.set_ylabel('Quality Score', fontsize=12)
plt.xticks(rotation=45)
fig.tight_layout()
fig.savefig('visualizations/quality_score_by_zip_boxplot.png', dpi=300, bbox_inches='tight')
plt.close(fig)