In [None]:


# ================================================
# CELL 1: Setup and Imports
# ================================================

import sys
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Environment diagnostics: interpreter version and working directory,
# followed by a divider. Useful when the notebook moves between Jupyter and Colab.
print(
    f"Python version: {sys.version}",
    f"Current directory: {os.getcwd()}",
    "-" * 50,
    sep="\n",
)

In [None]:


# ================================================
# CELL 2: Data Loading Function
# ================================================

def load_and_validate_data(
    csv_file="gender_identity_by_area.csv",
    github_url="https://raw.githubusercontent.com/LDolanLDolan/job-insight-live/master/gender_identity_by_area.csv",
    required_columns=(
        "Gender identity (8 categories)",
        "Lower Tier Local Authorities",
        "Observation",
    ),
):
    """Load the dataset from a local CSV, falling back to a remote copy.

    Args:
        csv_file: Path of the local CSV. It is used when it exists on disk.
        github_url: Location read with ``pd.read_csv`` when ``csv_file`` does
            not exist. Defaults to the copy hosted on GitHub.
        required_columns: Column names the rest of the notebook reads. Any
            that are absent are reported as a warning; the data is still
            returned so the caller can inspect it.

    Returns:
        The loaded DataFrame, or None if reading failed for any reason
        (the error is printed rather than raised).
    """
    try:
        if Path(csv_file).exists():
            print("📁 Found local file.")
            df = pd.read_csv(csv_file)
        else:
            print("🌐 Local file not found. Trying GitHub...")
            df = pd.read_csv(github_url)

        print("✅ Loaded CSV successfully")
        print(f"📊 Data shape: {df.shape}")
        print(f"📋 Columns: {list(df.columns)}")

        # Non-fatal validation: surface schema problems at load time instead
        # of as a KeyError inside a later summary or plotting cell.
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            print(f"⚠️ Missing expected columns: {missing}")

        return df

    except Exception as e:
        # Best-effort loader: callers test the result against None.
        print(f"❌ Error loading CSV: {e}")
        return None

In [None]:

# ================================================
# CELL 3: Data Summary Function
# ================================================

def display_data_summary(df):
    """Print an overview of the dataset to stdout.

    Lists every distinct value of 'Gender identity (8 categories)', the first
    five distinct values of 'Lower Tier Local Authorities' (plus a count of
    the rest), and the sum of the 'Observation' column. Any exception is
    caught and reported on stdout; nothing is returned.
    """
    preview_limit = 5
    try:
        print("\n📈 DATA SUMMARY:")
        print("=" * 40)

        # Distinct gender identity categories, numbered from 1
        categories = df['Gender identity (8 categories)'].unique()
        print(f"Gender categories ({len(categories)}):")
        for number, label in enumerate(categories, start=1):
            print(f"  {number}. {label}")

        # Distinct local authorities: preview only the first few
        areas = df['Lower Tier Local Authorities'].unique()
        print(f"\nLocal authorities ({len(areas)}):")
        for number, area in enumerate(areas[:preview_limit], start=1):
            print(f"  {number}. {area}")
        hidden = len(areas) - preview_limit
        if hidden > 0:
            print(f"  ... and {hidden} more")

        # Grand total across all rows
        print(f"\nTotal observations: {df['Observation'].sum():,}")

    except Exception as e:
        print(f"❌ Error displaying summary: {e}")


In [None]:
# ================================================
# CELL 4: Histogram Visualization Function
# ================================================

def create_histogram(df, save_file="gender_identity_histogram_log.png"):
    """Plot observations per gender identity and local authority, then save.

    Despite the name, the chart is a grouped bar chart on a logarithmic
    y-axis: one group per gender identity category, one bar per local
    authority. Rows whose category is 'Does not apply' are excluded.

    Args:
        df: DataFrame with the columns 'Gender identity (8 categories)',
            'Lower Tier Local Authorities' and 'Observation'.
        save_file: Path the PNG is written to.

    Returns:
        True if the chart was drawn and saved; False if any step raised
        (the error is printed rather than propagated).
    """
    fig = None
    try:
        # Filter out 'Does not apply'
        df_filtered = df[df['Gender identity (8 categories)'] != 'Does not apply']
        print(f"📊 Filtered data shape: {df_filtered.shape}")

        # Pivot data: gender identity (rows) vs authority (columns)
        pivot_df = df_filtered.pivot(
            index='Gender identity (8 categories)',
            columns='Lower Tier Local Authorities',
            values='Observation'
        )

        # Create exactly one figure and draw into it. Calling plt.figure()
        # and then DataFrame.plot() without ax= opens a second figure and
        # leaves the first one blank (and rendered by the notebook).
        fig, ax = plt.subplots(figsize=(12, 6))
        pivot_df.plot(kind='bar', ax=ax, log=True)

        ax.set_title("Gender Identity by Local Authority (log scale)", fontsize=14, fontweight='bold')
        ax.set_ylabel("Number of People (log scale)")
        ax.set_xlabel("Gender Identity Category")
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(title="Local Authority", bbox_to_anchor=(1.05, 1), loc='upper left')
        fig.tight_layout()

        # Save the plot
        fig.savefig(save_file, dpi=300, bbox_inches='tight')
        print(f"✅ Histogram saved as {save_file}")

        # Show in Jupyter
        plt.show()

        return True

    except Exception as e:
        # Do not leave a half-drawn figure open for the notebook to render.
        if fig is not None:
            plt.close(fig)
        print(f"❌ Error creating histogram: {e}")
        return False

In [None]:
# ================================================
# CELL 5: Pie Chart Visualization Function
# ================================================

def create_pie_chart(df, save_file="gender_identity_pie_chart.png"):
    """Draw the overall share of each gender identity category and save it.

    Rows whose category is 'Does not apply' are dropped, 'Observation' is
    summed per category across all areas, and the totals are drawn as a pie
    (largest first, clockwise from the top). The image is written to
    ``save_file`` and then shown.

    Returns True on success, False if any step raised (the error is printed).
    """
    try:
        category_col = 'Gender identity (8 categories)'

        # Exclude the 'Does not apply' rows
        applicable = df[df[category_col] != 'Does not apply']

        # Per-category totals, largest first
        totals = (
            applicable
            .groupby(category_col)['Observation']
            .sum()
            .sort_values(ascending=False)
        )

        # One colour per slice from the Set3 palette
        fig = plt.figure(figsize=(10, 8))
        palette = plt.cm.Set3(range(len(totals)))
        ax = fig.gca()

        _, _, pct_labels = ax.pie(
            totals,
            labels=totals.index,
            autopct='%1.1f%%',
            startangle=90,
            counterclock=False,
            colors=palette,
        )

        # Style the percentage labels drawn inside the slices
        for pct_label in pct_labels:
            pct_label.set_color('white')
            pct_label.set_fontweight('bold')

        ax.set_title(
            "Overall Gender Identity Distribution (All Areas)",
            fontsize=14,
            fontweight='bold',
            pad=20,
        )
        ax.axis('equal')

        # Write the image to disk, then display it
        fig.savefig(save_file, dpi=300, bbox_inches='tight')
        print(f"✅ Pie chart saved as {save_file}")
        plt.show()

        return True

    except Exception as e:
        print(f"❌ Error creating pie chart: {e}")
        return False

In [None]:
# ================================================
# CELL 6: Load Data and Display Summary
# ================================================

# Announce the run
print("🚀 Starting Gender Identity Analysis", "=" * 50, sep="\n")

# Load the dataset; `df` is None when loading failed, and the later
# cells check for that before plotting.
df = load_and_validate_data()

if df is None:
    print("❌ Cannot proceed without data")
else:
    display_data_summary(df)

In [None]:
# ================================================
# CELL 7: Create Histogram
# ================================================

# Draw the bar chart only when the data was loaded; the success flag is
# read by the final summary cell.
if df is None:
    print("❌ No data available for histogram")
else:
    print("\n📊 Creating histogram...")
    histogram_success = create_histogram(df)

In [None]:
# ================================================
# CELL 8: Create Pie Chart
# ================================================

# Draw the pie chart only when the data was loaded; the success flag is
# read by the final summary cell.
if df is None:
    print("❌ No data available for pie chart")
else:
    print("\n🥧 Creating pie chart...")
    pie_success = create_pie_chart(df)

In [None]:
# ================================================
# CELL 9: Final Results Summary
# ================================================

if df is None:
    print("❌ Analysis could not be completed - no data loaded")
else:
    print("\n🎉 Analysis completed!")
    print("📁 Files created:")
    # The success flags exist only if the corresponding chart cell ran
    # with data, so look them up by name instead of referencing them directly.
    if locals().get('histogram_success'):
        print("  ✅ gender_identity_histogram_log.png")
    if locals().get('pie_success'):
        print("  ✅ gender_identity_pie_chart.png")

    # Headline figures for the loaded dataset
    print("\n📊 Final data info:")
    print(f"  • Total rows: {len(df):,}")
    print(f"  • Total observations: {df['Observation'].sum():,}")
    print(f"  • Gender categories: {df['Gender identity (8 categories)'].nunique()}")
    print(f"  • Local authorities: {df['Lower Tier Local Authorities'].nunique()}")