## Hierarchical Language Relationships

We'll create several hierarchical visualizations to explore relationships between languages:
1. Sunburst chart showing language categories and metrics
2. Treemap visualization of repository distributions
3. Hierarchical clustering dendrogram based on language similarities

In [2]:
# Create hierarchical category structure for languages
def create_language_hierarchy(df):
    """Build the node table (id / parent / value) used by the sunburst chart.

    Repositories are aggregated per language, and each language is placed
    under one of a fixed set of categories. One row is emitted per category
    (with ``parent == ''``), followed by one row per language in that category.

    Languages from the category map that do not occur in ``df`` are skipped,
    and a category with no languages present is omitted entirely. Category
    totals are computed from the same languages that are emitted as children,
    so each parent's value equals the sum of its children.

    Args:
        df: DataFrame with ``language``, ``stars``, ``forks``, ``watchers``
            and ``repository_id`` columns, one row per repository.

    Returns:
        DataFrame with columns ``id``, ``parent``, ``value`` (repository
        count), ``stars`` and ``forks``. The columns are present even when
        no rows are produced.
    """
    # Calculate metrics for each language (indexed by language for lookups)
    hierarchy_data = df.groupby('language').agg({
        'stars': 'sum',
        'forks': 'sum',
        'watchers': 'sum',
        'repository_id': 'count'  # Count of repositories
    })

    # Categorize languages based on paradigms/features
    language_categories = {
        'Systems Programming': ['C++', 'Rust', 'Go'],
        'Web Development': ['JavaScript', 'TypeScript'],
        'Enterprise': ['Java', 'C#'],
        'Mobile & Modern': ['Kotlin', 'Swift'],
        'Scripting': ['Python', 'Ruby', 'PHP']
    }

    columns = ['id', 'parent', 'value', 'stars', 'forks']

    # Create data for sunburst chart
    sunburst_data = []
    for category, langs in language_categories.items():
        # Only keep languages that actually occur in the data; looking up a
        # missing one would otherwise fail.
        present = [lang for lang in langs if lang in hierarchy_data.index]
        if not present:
            continue

        # Add category level
        category_metrics = hierarchy_data.loc[present].sum()
        sunburst_data.append({
            'id': category,
            'parent': '',
            'value': category_metrics['repository_id'],
            'stars': category_metrics['stars'],
            'forks': category_metrics['forks']
        })

        # Add language level
        for lang in present:
            lang_metrics = hierarchy_data.loc[lang]
            sunburst_data.append({
                'id': lang,
                'parent': category,
                'value': lang_metrics['repository_id'],
                'stars': lang_metrics['stars'],
                'forks': lang_metrics['forks']
            })

    # Passing the columns keeps the schema stable when no rows were produced
    return pd.DataFrame(sunburst_data, columns=columns)

# Create and display sunburst chart
hierarchy_df = create_language_hierarchy(df)

fig = go.Figure(go.Sunburst(
    ids=hierarchy_df['id'],
    # `labels` supplies the sector text and the %{label} placeholder used in
    # the hover template; `ids` on their own only identify the nodes.
    labels=hierarchy_df['id'],
    parents=hierarchy_df['parent'],
    values=hierarchy_df['value'],
    # 'total': each category value already includes its languages
    branchvalues='total',
    hovertemplate='<b>%{label}</b><br>' +
                  'Repositories: %{value}<br>' +
                  'Stars: %{customdata[0]:,.0f}<br>' +
                  'Forks: %{customdata[1]:,.0f}<extra></extra>',
    # customdata[0] = stars, customdata[1] = forks (referenced above)
    customdata=hierarchy_df[['stars', 'forks']].values
))

fig.update_layout(
    title={
        'text': 'Programming Language Hierarchy by Repository Count',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    width=800,
    height=800
)

# Save the figure to HTML for interactivity
fig.write_html('language_hierarchy_sunburst.html')
fig.show()

NameError: name 'df' is not defined

In [None]:
# Create treemap visualization
def create_repository_treemap(df):
    """Return a treemap figure of repository counts per language.

    Tile size is the number of repositories for the language and tile colour
    is the language's total stars. The figure is also written to
    'treemap_top_repos.html' before it is returned.
    """
    # Per-language aggregates
    aggregations = {
        'repository_id': 'count',
        'stars': 'sum',
        'forks': 'sum',
        'watchers': 'sum',
        'commits_30d': 'mean',
    }
    per_language = df.groupby('language').agg(aggregations).reset_index()

    # Share of all repositories, shown in the hover text
    repo_total = per_language['repository_id'].sum()
    per_language['percentage'] = per_language['repository_id'] / repo_total * 100

    # Largest languages first
    per_language = per_language.sort_values('repository_id', ascending=False)

    hover_lines = [
        '<b>%{label}</b><br>',
        'Repositories: %{value:,.0f}<br>',
        'Percentage: %{customdata[0]:.1f}%<br>',
        'Total Stars: %{customdata[1]:,.0f}<br>',
        'Total Forks: %{customdata[2]:,.0f}<br>',
        'Avg Monthly Commits: %{customdata[3]:.1f}<extra></extra>',
    ]

    fig = px.treemap(
        per_language,
        path=['language'],
        values='repository_id',
        color='stars',
        color_continuous_scale='Viridis',
        # Order matters: indices are referenced by the hover template
        custom_data=['percentage', 'stars', 'forks', 'commits_30d'],
    )
    fig.update_traces(hovertemplate=''.join(hover_lines))

    title_settings = dict(
        text='Repository Distribution Across Languages',
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )
    fig.update_layout(title=title_settings, width=1000, height=600)

    # Save interactive version
    fig.write_html('treemap_top_repos.html')
    return fig

# Create and display treemap
# Uses the notebook-level `df`; create_repository_treemap also writes
# 'treemap_top_repos.html' before returning the figure.
treemap_fig = create_repository_treemap(df)
treemap_fig.show()

## Interactive Language Explorer

In this section, we'll create interactive visualizations that allow for dynamic exploration of language relationships and patterns:
1. Parallel coordinates plot for multi-dimensional analysis
2. 3D scatter plot for exploring relationships between key metrics
3. Interactive gauge charts for comparing language performance

In [None]:
# Create parallel coordinates plot for top repositories
def create_parallel_coordinates(df, top_n=100):
    """Return a parallel-coordinates figure for the most-starred repositories.

    The ``top_n`` rows of ``df`` with the largest ``stars`` values are plotted
    across six metrics, with lines coloured by star count. The figure is also
    written to ``parallel_coordinates_top{top_n}.html`` so that the file name
    matches the number requested (and the figure title) instead of always
    claiming "top100".

    Args:
        df: DataFrame with one row per repository containing the metric
            columns listed in ``metrics`` below.
        top_n: Number of repositories to select by stars.

    Returns:
        The plotly figure.
    """
    # Select metrics for parallel coordinates
    metrics = ['stars', 'forks', 'watchers', 'open_issues', 'commits_30d', 'contributors_count']

    # Get top N repositories by stars
    top_repos = df.nlargest(top_n, 'stars')

    # Create parallel coordinates plot
    fig = go.Figure(data=
        go.Parcoords(
            line=dict(
                color=top_repos['stars'],
                colorscale='Viridis',
                showscale=True,
                cmin=top_repos['stars'].min(),
                cmax=top_repos['stars'].max()
            ),
            # One axis per metric, scaled to that metric's observed range
            dimensions=[
                dict(range=[top_repos[col].min(), top_repos[col].max()],
                     label=col.replace('_', ' ').title(),
                     values=top_repos[col])
                for col in metrics
            ]
        )
    )

    fig.update_layout(
        title={
            'text': f'Multi-dimensional Analysis of Top {top_n} Repositories',
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        width=1000,
        height=600
    )

    # Save interactive version; the default top_n=100 keeps the original
    # 'parallel_coordinates_top100.html' name.
    fig.write_html(f'parallel_coordinates_top{top_n}.html')
    return fig

# Create and display parallel coordinates plot
# Called with the default top_n=100, so the function also writes
# 'parallel_coordinates_top100.html' before returning the figure.
parallel_fig = create_parallel_coordinates(df)
parallel_fig.show()

In [None]:
# Create 3D scatter plot
def create_3d_language_analysis(df):
    """Return a 3D scatter figure with one marker per language.

    Axes are the per-language means of stars, forks and contributors. Marker
    size and colour both come from the language's repository count. The
    figure is also written to '3d_language_analysis.html'.
    """
    # Per-language averages plus the repository count
    aggregations = {
        'stars': 'mean',
        'forks': 'mean',
        'contributors_count': 'mean',
        'repository_id': 'count',
    }
    per_language = df.groupby('language').agg(aggregations).reset_index()
    repo_counts = per_language['repository_id']

    marker_style = dict(
        size=repo_counts / 10,  # Size based on repository count
        color=repo_counts,
        colorscale='Viridis',
        colorbar=dict(title='Repository Count'),
        opacity=0.8,
    )

    hover_text = ''.join([
        '<b>%{text}</b><br>',
        'Avg Stars: %{x:.0f}<br>',
        'Avg Forks: %{y:.0f}<br>',
        'Avg Contributors: %{z:.1f}<br>',
        '<extra></extra>',
    ])

    points = go.Scatter3d(
        x=per_language['stars'],
        y=per_language['forks'],
        z=per_language['contributors_count'],
        text=per_language['language'],
        mode='markers+text',
        marker=marker_style,
        hovertemplate=hover_text,
    )
    fig = go.Figure(data=[points])

    fig.update_layout(
        title=dict(
            text='3D Language Analysis: Stars, Forks, and Contributors',
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top',
        ),
        scene=dict(
            xaxis_title='Average Stars',
            yaxis_title='Average Forks',
            zaxis_title='Average Contributors',
        ),
        width=1000,
        height=800,
    )

    # Save interactive version
    fig.write_html('3d_language_analysis.html')
    return fig

# Create and display 3D scatter plot
# Uses the notebook-level `df`; create_3d_language_analysis also writes
# '3d_language_analysis.html' before returning the figure.
scatter_3d_fig = create_3d_language_analysis(df)
scatter_3d_fig.show()

In [None]:
# Create gauge charts for top languages
def create_gauge_charts(df, metric='stars', n_languages=4):
    """Return a grid of gauge indicators for the top languages by a metric.

    Languages are ranked by the mean of ``metric``; the top ``n_languages``
    each get one gauge whose axis runs up to the highest language mean.

    The grid is two columns wide and grows by rows so that every requested
    language gets a gauge (at least 2x2, which is the layout used for the
    default of four languages). The figure is also written to
    ``gauge_charts_{metric}_top{n_languages}.html``; including the metric in
    the name keeps calls for different metrics from overwriting each other's
    output.

    Args:
        df: DataFrame with a ``language`` column and a numeric ``metric``
            column, one row per repository.
        metric: Name of the column to average per language.
        n_languages: Number of top languages to show.

    Returns:
        The plotly figure.
    """
    # Calculate average metrics by language
    language_metrics = df.groupby('language')[metric].mean().sort_values(ascending=False)
    top_languages = language_metrics.head(n_languages)

    # Size the grid from the number of gauges actually drawn
    n_cols = 2
    n_rows = max(2, (len(top_languages) + n_cols - 1) // n_cols)

    # Create subplot grid; titles are passed as a plain list of strings
    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        specs=[[{'type': 'indicator'} for _ in range(n_cols)]
               for _ in range(n_rows)],
        subplot_titles=[str(name) for name in top_languages.index]
    )

    # Calculate the maximum value for gauge range
    max_value = language_metrics.max()

    # Add gauge charts for each language, filling the grid row by row
    positions = [(row, col)
                 for row in range(1, n_rows + 1)
                 for col in range(1, n_cols + 1)]
    for (language, value), (row, col) in zip(top_languages.items(), positions):
        fig.add_trace(
            go.Indicator(
                mode="gauge+number",
                value=value,
                title={'text': language},
                gauge={
                    'axis': {'range': [None, max_value]},
                    # Three equal background bands across the gauge range
                    'steps': [
                        {'range': [0, max_value/3], 'color': "lightgray"},
                        {'range': [max_value/3, max_value*2/3], 'color': "gray"},
                        {'range': [max_value*2/3, max_value], 'color': "darkgray"}
                    ],
                    'bar': {'color': "darkblue"}
                }
            ),
            row=row, col=col
        )

    fig.update_layout(
        title={
            'text': f'Top {n_languages} Languages by Average {metric.replace("_", " ").title()}',
            'y': 0.95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        width=1000,
        # 400px per row keeps the original 800px height for the 2-row grid
        height=400 * n_rows
    )

    # Save interactive version, one file per metric / language count
    fig.write_html(f'gauge_charts_{metric}_top{n_languages}.html')
    return fig

# Create and display gauge charts for stars
gauge_fig = create_gauge_charts(df, metric='stars')
gauge_fig.show()

# Also create gauge charts for other metrics
# The returned figures are discarded here (not shown); these calls are made
# for the HTML export that create_gauge_charts performs internally.
metrics = ['forks', 'contributors_count', 'commits_30d']
for metric in metrics:
    create_gauge_charts(df, metric=metric)

# Advanced Visualizations for GitHub Language Analysis

This notebook focuses on creating advanced, interactive visualizations to better understand programming language relationships and patterns in our GitHub data.

## Table of Contents
1. [Setup and Data Loading](#Setup-and-Data-Loading)
2. [Hierarchical Language Relationships](#Hierarchical-Language-Relationships)
3. [Interactive Language Explorer](#Interactive-Language-Explorer)
4. [Repository Success Patterns](#Repository-Success-Patterns)
5. [Temporal Analysis Visualizations](#Temporal-Analysis-Visualizations)
6. [Export and Integration](#Export-and-Integration)

## Goals
- Create interactive and dynamic visualizations
- Explore hierarchical relationships between languages
- Visualize temporal patterns in language usage
- Generate publication-ready visualizations
- Export visualizations for web integration

## Setup and Data Loading

First, let's import the necessary libraries and load our prepared data.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# Use the white plotly template as the default for figures
pio.templates.default = "plotly_white"

# Load the prepared data into the notebook-level DataFrame
df = pd.read_csv('all_languages_combined.csv')

# Print a short summary: row count, distinct languages, and column names
print(
    "Dataset Info:",
    f"Number of repositories: {len(df)}",
    f"Number of languages: {df['language'].nunique()}",
    "\nColumns available:",
    df.columns.tolist(),
    sep="\n",
)