# graph each metric

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import os
import random
from matplotlib.colors import hsv_to_rgb
import matplotlib as mpl

# Base font size: applied globally through rcParams and also passed
# explicitly to the axis-label calls in the plotting functions.
fontsize = 20
plt.rcParams['font.size'] = fontsize

# Intended number of x-axis ticks.
# NOTE(review): not referenced by the plotting functions in this cell,
# which pass nbins=5 directly -- confirm whether it is still needed.
ticks = 5

# Metric columns that are pivoted and plotted.
# (An earlier revision used: 'stars', 'commits', 'pull_requests', 'forks',
# 'contributors'.)
metrics = [
    'new_stars',
    'new_forks',
    'commit_count',
    'new_prs',
    'active_contributors',
    'cumulative_stars',
    'cumulative_forks',
]

# Function to load and clean metric data
# def load_and_clean_metric(file_path):
#     """Load metric CSV and convert dates to datetime."""
#     df = pd.read_csv(file_path, index_col=0)
#     # Convert column names to datetime
#     df.columns = pd.to_datetime(df.columns)
#     return df

def load_and_clean_metric(file_path, metric_names=None):
    """Load the long-format GitHub metrics CSV and pivot it per metric.

    The CSV must contain the columns ``repo_full_name`` and ``month`` plus
    one column per requested metric.  Other columns (for example
    ``hn_submission_date``) are not needed for the pivot and are ignored.

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.
        metric_names: Iterable of metric column names to extract.  Defaults
            to the module-level ``metrics`` list.

    Returns:
        A dict mapping each metric name to a DataFrame indexed by
        ``repo_full_name`` with one datetime column per ``month``.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    if metric_names is None:
        metric_names = metrics
    metric_names = list(metric_names)

    df = pd.read_csv(file_path)

    # Fail early with one readable message instead of an opaque pandas error
    # from somewhere inside pivot_table.
    required = ['repo_full_name', 'month'] + metric_names
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s): {missing}")

    # Convert date strings to datetime so the pivoted columns sort
    # chronologically and can be formatted with strftime by the plotters.
    df['month'] = pd.to_datetime(df['month'])

    # Pivot the data to have months as columns and repos as rows.  Passing a
    # list as `values` yields a (metric, month) column MultiIndex.
    pivot_df = df.pivot_table(
        index='repo_full_name',
        columns='month',
        values=metric_names,
        aggfunc='first',  # Use first if there are duplicates
    )

    # Selecting the first column level gives one month-wide frame per metric.
    return {name: pivot_df[name] for name in metric_names}

# Build a repo -> RGB colour lookup for the scatterplots
def generate_repo_colors(repos):
    """Return a dict mapping every entry of ``repos`` to its own RGB colour.

    Hues are spread evenly over the colour wheel in iteration order, while
    saturation and value are each drawn at random from [0.7, 1.0), so the
    exact shades differ between calls.
    """
    total = len(repos)
    palette = {}

    for position, name in enumerate(repos):
        hue = position / total
        # Two draws per repo: saturation first, then value.
        saturation = 0.7 + 0.3 * random.random()
        brightness = 0.7 + 0.3 * random.random()
        palette[name] = hsv_to_rgb((hue, saturation, brightness))

    return palette

# 1. Raw Metric Scatterplot
def plot_raw_metrics(metrics_dir, metric, output_dir):
    """Create a scatterplot of raw metric values, one colour per repository.

    ``metrics_dir`` may be either of:

    * a DataFrame (rows are repositories, columns are dates) holding the
      values of ``metric``; a single figure is produced for it.
    * a directory path; every name in the module-level ``metrics`` list is
      then loaded from ``{metrics_dir}/rq3_combined_{name}_metrics.csv``
      and plotted, and missing files are skipped with a warning.  The
      ``metric`` argument is not used in this form.

    Each figure is saved as ``{output_dir}/raw_{metric}_metrics.png``.
    """
    if isinstance(metrics_dir, pd.DataFrame):
        _plot_raw_metrics_frame(metrics_dir, metric, output_dir)
        return

    # A separate loop variable keeps the `metric` parameter from being
    # silently overwritten.
    for name in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{name}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {name}")
            continue

        loaded = load_and_clean_metric(file_path)
        # load_and_clean_metric returns a dict keyed by metric name; a bare
        # DataFrame is accepted as well.
        df = loaded[name] if isinstance(loaded, dict) else loaded
        _plot_raw_metrics_frame(df, name, output_dir)


def _plot_raw_metrics_frame(df, metric, output_dir):
    """Scatter every repository's non-missing values from ``df`` and save the figure."""
    repo_colors = generate_repo_colors(df.index)

    plt.figure(figsize=(8, 4))

    for repo in df.index:
        # Missing months are NaN in the pivoted frame; leave them out.
        valid_data = df.loc[repo].dropna()
        if not valid_data.empty:
            plt.scatter(valid_data.index, valid_data.values,
                        alpha=0.6, s=20, color=repo_colors[repo])

    plt.xlabel('Date', fontsize=fontsize)
    plt.ylabel(metric.capitalize(), fontsize=fontsize)
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    # NOTE(review): on a date axis the active locator may not accept
    # `nbins`, in which case this call has no effect -- confirm.
    plt.locator_params(axis='x', nbins=5)
    plt.tight_layout()

    output_file = f"{output_dir}/raw_{metric}_metrics.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {output_file}")

# 2. Raw Metric Changes Scatterplot
def plot_raw_changes(metrics_dir, output_dir, *args):
    """Create a scatterplot of period-over-period absolute changes in a metric.

    Two call forms are accepted:

    * ``plot_raw_changes(df, metric, output_dir)`` -- ``df`` is a DataFrame
      (rows are repositories, columns are dates) for the metric named
      ``metric``.  In this form the first two positional parameters carry
      the frame and the metric name.
    * ``plot_raw_changes(metrics_dir, output_dir)`` -- every name in the
      module-level ``metrics`` list is loaded from
      ``{metrics_dir}/rq3_combined_{metric}_metrics.csv``; missing files
      are skipped with a warning.

    Each figure is saved as ``{output_dir}/raw_changes_{metric}.png``.

    Raises:
        TypeError: If more than three positional arguments are given.
    """
    if args:
        if len(args) != 1:
            raise TypeError(
                "plot_raw_changes expects (df, metric, output_dir) "
                "or (metrics_dir, output_dir)"
            )
        _plot_raw_changes_frame(metrics_dir, output_dir, args[0])
        return

    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        loaded = load_and_clean_metric(file_path)
        # load_and_clean_metric returns a dict keyed by metric name; a bare
        # DataFrame is accepted as well.
        df = loaded[metric] if isinstance(loaded, dict) else loaded
        _plot_raw_changes_frame(df, metric, output_dir)


def _plot_raw_changes_frame(df, metric, output_dir):
    """Scatter each repository's first differences from ``df`` and save the figure."""
    repo_colors = generate_repo_colors(df.index)

    plt.figure(figsize=(8, 4))

    for repo in df.index:
        # dropna() already yields the valid observations; it must not be
        # reused as an indexer into the original series.
        valid_data = df.loc[repo].dropna()
        if len(valid_data) < 2:
            continue

        # diff() leaves NaN in the first position; drop it.
        changes = valid_data.diff().dropna()
        if not changes.empty:
            plt.scatter(changes.index, changes.values,
                        alpha=0.6, s=20, color=repo_colors[repo])

    plt.xlabel('Date', fontsize=fontsize)
    plt.ylabel(f'Change in {metric.capitalize()}', fontsize=fontsize)
    plt.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    # NOTE(review): on a date axis the active locator may not accept
    # `nbins`, in which case this call has no effect -- confirm.
    plt.locator_params(axis='x', nbins=5)
    plt.tight_layout()

    output_file = f"{output_dir}/raw_changes_{metric}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {output_file}")

# 3. Percentage Change Metric Scatterplot
def plot_percentage_changes(metrics_dir, output_dir, *args):
    """Create a scatterplot of period-over-period percentage changes in a metric.

    Two call forms are accepted:

    * ``plot_percentage_changes(df, metric, output_dir)`` -- ``df`` is a
      DataFrame (rows are repositories, columns are dates) for the metric
      named ``metric``.  In this form the first two positional parameters
      carry the frame and the metric name.
    * ``plot_percentage_changes(metrics_dir, output_dir)`` -- every name in
      the module-level ``metrics`` list is loaded from
      ``{metrics_dir}/rq3_combined_{metric}_metrics.csv``; missing files
      are skipped with a warning.

    Infinite changes and changes below -10% are not plotted.  Each figure is
    saved as ``{output_dir}/percentage_changes_{metric}.png``.

    Raises:
        TypeError: If more than three positional arguments are given.
    """
    if args:
        if len(args) != 1:
            raise TypeError(
                "plot_percentage_changes expects (df, metric, output_dir) "
                "or (metrics_dir, output_dir)"
            )
        _plot_percentage_changes_frame(metrics_dir, output_dir, args[0])
        return

    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        loaded = load_and_clean_metric(file_path)
        # load_and_clean_metric returns a dict keyed by metric name; a bare
        # DataFrame is accepted as well.
        df = loaded[metric] if isinstance(loaded, dict) else loaded
        _plot_percentage_changes_frame(df, metric, output_dir)


def _plot_percentage_changes_frame(df, metric, output_dir):
    """Scatter each repository's percentage changes from ``df`` and save the figure."""
    repo_colors = generate_repo_colors(df.index)

    plt.figure(figsize=(8, 4))

    for repo in df.index:
        # dropna() already yields the valid observations; it must not be
        # reused as an indexer into the original series.
        valid_data = df.loc[repo].dropna()
        if len(valid_data) < 2:
            continue

        pct_change = valid_data.pct_change() * 100

        # A previous value of 0 yields +/-inf (or NaN for 0/0); drop those
        # together with the leading NaN.
        pct_change = pct_change.replace([np.inf, -np.inf], np.nan).dropna()

        # Keep only points that fall inside the plotted y-range.
        valid_pct = pct_change[pct_change >= -10]

        if not valid_pct.empty:
            plt.scatter(valid_pct.index, valid_pct.values,
                        alpha=0.6, s=20, color=repo_colors[repo])

    plt.xlabel('Date', fontsize=fontsize)
    plt.ylabel(f'% Change in {metric.capitalize()}', fontsize=fontsize)
    plt.grid(True, alpha=0.3)

    # Set y-axis to show from -10% to maximum
    plt.ylim(bottom=-10)

    plt.xticks(rotation=45)
    # NOTE(review): on a date axis the active locator may not accept
    # `nbins`, in which case this call has no effect -- confirm.
    plt.locator_params(axis='x', nbins=5)
    plt.tight_layout()

    output_file = f"{output_dir}/percentage_changes_{metric}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {output_file}")

# 4. Monthly Boxplot of Raw Metrics
def plot_monthly_raw_boxplots(metrics_dir, output_dir, *args):
    """Create a box plot of raw metric values grouped by calendar month.

    Two call forms are accepted:

    * ``plot_monthly_raw_boxplots(df, metric, output_dir)`` -- ``df`` is a
      DataFrame (rows are repositories, columns are dates) for the metric
      named ``metric``.  In this form the first two positional parameters
      carry the frame and the metric name.
    * ``plot_monthly_raw_boxplots(metrics_dir, output_dir)`` -- every name
      in the module-level ``metrics`` list is loaded from
      ``{metrics_dir}/rq3_combined_{metric}_metrics.csv``; missing files
      are skipped with a warning.

    Each figure is saved as
    ``{output_dir}/monthly_boxplot_raw_{metric}.png``.  A metric with no
    non-missing values is skipped with a warning.

    Raises:
        TypeError: If more than three positional arguments are given.
    """
    if args:
        if len(args) != 1:
            raise TypeError(
                "plot_monthly_raw_boxplots expects (df, metric, output_dir) "
                "or (metrics_dir, output_dir)"
            )
        _plot_monthly_raw_boxplot_frame(metrics_dir, output_dir, args[0])
        return

    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        loaded = load_and_clean_metric(file_path)
        # load_and_clean_metric returns a dict keyed by metric name; a bare
        # DataFrame is accepted as well.
        df = loaded[metric] if isinstance(loaded, dict) else loaded
        _plot_monthly_raw_boxplot_frame(df, metric, output_dir)


def _plot_monthly_raw_boxplot_frame(df, metric, output_dir):
    """Pool ``df``'s values per 'YYYY-MM' key, draw the box plot, and save it."""
    # Pool the non-missing values of every date column under its month key.
    monthly_data = {}
    month_keys = df.columns.strftime('%Y-%m')
    for position, month_key in enumerate(month_keys):
        values = df.iloc[:, position].to_numpy(dtype=float)
        values = values[~np.isnan(values)]
        if values.size:
            monthly_data.setdefault(month_key, []).extend(values)

    if not monthly_data:
        # Nothing to draw; avoid handing an empty dataset to plt.boxplot.
        print(f"Warning: No data for metric {metric}, skipping monthly raw boxplot")
        return

    # 'YYYY-MM' strings sort chronologically.
    months = sorted(monthly_data)
    box_data = [monthly_data[month] for month in months]

    plt.figure(figsize=(8, 4))

    bp = plt.boxplot(box_data,
                     positions=range(len(months)),
                     widths=0.7,
                     patch_artist=True,
                     showfliers=False)

    # Customize boxplot appearance
    plt.setp(bp['boxes'], facecolor='skyblue', alpha=0.6)
    plt.setp(bp['medians'], color='navy', linewidth=2)
    plt.setp(bp['whiskers'], color='navy', linewidth=2)
    plt.setp(bp['caps'], color='navy', linewidth=2)

    plt.xlabel('Month', fontsize=fontsize)
    plt.ylabel(metric.capitalize(), fontsize=fontsize)
    plt.grid(True, alpha=0.3)

    # Set x-axis labels
    plt.xticks(range(len(months)), months, rotation=45)
    plt.locator_params(axis='x', nbins=5)

    plt.tight_layout()

    output_file = f"{output_dir}/monthly_boxplot_raw_{metric}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {output_file}")

# 5. Monthly Boxplot of Raw Metric Changes
def plot_monthly_changes_boxplots(metrics_dir, output_dir, *args):
    """Create a box plot of per-repository absolute changes grouped by month.

    Two call forms are accepted:

    * ``plot_monthly_changes_boxplots(df, metric, output_dir)`` -- ``df`` is
      a DataFrame (rows are repositories, columns are dates) for the metric
      named ``metric``.  In this form the first two positional parameters
      carry the frame and the metric name.
    * ``plot_monthly_changes_boxplots(metrics_dir, output_dir)`` -- every
      name in the module-level ``metrics`` list is loaded from
      ``{metrics_dir}/rq3_combined_{metric}_metrics.csv``; missing files
      are skipped with a warning.

    Each figure is saved as
    ``{output_dir}/monthly_boxplot_changes_{metric}.png``.  A metric that
    yields no changes is skipped with a warning.

    Raises:
        TypeError: If more than three positional arguments are given.
    """
    if args:
        if len(args) != 1:
            raise TypeError(
                "plot_monthly_changes_boxplots expects (df, metric, output_dir) "
                "or (metrics_dir, output_dir)"
            )
        _plot_monthly_changes_boxplot_frame(metrics_dir, output_dir, args[0])
        return

    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        loaded = load_and_clean_metric(file_path)
        # load_and_clean_metric returns a dict keyed by metric name; a bare
        # DataFrame is accepted as well.
        df = loaded[metric] if isinstance(loaded, dict) else loaded
        _plot_monthly_changes_boxplot_frame(df, metric, output_dir)


def _plot_monthly_changes_boxplot_frame(df, metric, output_dir):
    """Pool each repository's first differences per 'YYYY-MM' key and plot them."""
    monthly_changes = {}

    for repo in df.index:
        # dropna() already yields the valid observations; it must not be
        # reused as an indexer into the original series.
        valid_data = df.loc[repo].dropna()
        if len(valid_data) < 2:
            continue

        # diff() leaves NaN in the first position; drop it.
        changes = valid_data.diff().dropna()
        for date, value in changes.items():
            monthly_changes.setdefault(date.strftime('%Y-%m'), []).append(value)

    if not monthly_changes:
        # Nothing to draw; avoid handing an empty dataset to plt.boxplot.
        print(f"Warning: No changes for metric {metric}, skipping monthly changes boxplot")
        return

    # 'YYYY-MM' strings sort chronologically.
    months = sorted(monthly_changes)
    box_data = [monthly_changes[month] for month in months]

    plt.figure(figsize=(8, 4))

    bp = plt.boxplot(box_data,
                     positions=range(len(months)),
                     widths=0.7,
                     patch_artist=True,
                     showfliers=False)

    # Customize boxplot appearance
    plt.setp(bp['boxes'], facecolor='lightgreen', alpha=0.6)
    plt.setp(bp['medians'], color='darkgreen', linewidth=2)
    plt.setp(bp['whiskers'], color='darkgreen', linewidth=2)
    plt.setp(bp['caps'], color='darkgreen', linewidth=2)

    plt.xlabel('Month', fontsize=fontsize)
    plt.ylabel(f'Change in {metric.capitalize()}', fontsize=fontsize)
    plt.grid(True, alpha=0.3)

    # Set x-axis labels
    plt.xticks(range(len(months)), months, rotation=45)
    plt.locator_params(axis='x', nbins=5)

    plt.tight_layout()

    output_file = f"{output_dir}/monthly_boxplot_changes_{metric}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {output_file}")

# 6. Monthly Boxplot of Percentage Changes
def plot_monthly_pct_changes_boxplots(metrics_dir, output_dir, *args):
    """Create a box plot of per-repository percentage changes grouped by month.

    Two call forms are accepted:

    * ``plot_monthly_pct_changes_boxplots(df, metric, output_dir)`` -- ``df``
      is a DataFrame (rows are repositories, columns are dates) for the
      metric named ``metric``.  In this form the first two positional
      parameters carry the frame and the metric name.
    * ``plot_monthly_pct_changes_boxplots(metrics_dir, output_dir)`` -- every
      name in the module-level ``metrics`` list is loaded from
      ``{metrics_dir}/rq3_combined_{metric}_metrics.csv``; missing files
      are skipped with a warning.

    Infinite changes and changes below -10% are excluded.  Each figure is
    saved as ``{output_dir}/monthly_boxplot_pct_changes_{metric}.png``.  A
    metric that yields no usable changes is skipped with a warning.

    Raises:
        TypeError: If more than three positional arguments are given.
    """
    if args:
        if len(args) != 1:
            raise TypeError(
                "plot_monthly_pct_changes_boxplots expects "
                "(df, metric, output_dir) or (metrics_dir, output_dir)"
            )
        _plot_monthly_pct_changes_boxplot_frame(metrics_dir, output_dir, args[0])
        return

    for metric in metrics:
        # Same file naming as the other plotting functions (the previous
        # 'metrrq3_combined_' prefix never matched any file they use).
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        loaded = load_and_clean_metric(file_path)
        # load_and_clean_metric returns a dict keyed by metric name; a bare
        # DataFrame is accepted as well.
        df = loaded[metric] if isinstance(loaded, dict) else loaded
        _plot_monthly_pct_changes_boxplot_frame(df, metric, output_dir)


def _plot_monthly_pct_changes_boxplot_frame(df, metric, output_dir):
    """Pool each repository's percentage changes per 'YYYY-MM' key and plot them."""
    monthly_changes = {}

    for repo in df.index:
        # dropna() already yields the valid observations; it must not be
        # reused as an indexer into the original series.
        valid_data = df.loc[repo].dropna()
        if len(valid_data) < 2:
            continue

        pct_change = valid_data.pct_change() * 100
        # A previous value of 0 yields +/-inf (or NaN for 0/0); drop those
        # together with the leading NaN.
        pct_change = pct_change.replace([np.inf, -np.inf], np.nan).dropna()
        # Filter to show from -10% up
        pct_change = pct_change[pct_change >= -10]

        for date, value in pct_change.items():
            monthly_changes.setdefault(date.strftime('%Y-%m'), []).append(value)

    if not monthly_changes:
        # Nothing to draw; avoid handing an empty dataset to plt.boxplot.
        print(f"Warning: No percentage changes for metric {metric}, skipping monthly boxplot")
        return

    # 'YYYY-MM' strings sort chronologically.
    months = sorted(monthly_changes)
    box_data = [monthly_changes[month] for month in months]

    plt.figure(figsize=(8, 4))

    bp = plt.boxplot(box_data,
                     positions=range(len(months)),
                     widths=0.7,
                     patch_artist=True,
                     showfliers=False)

    # Customize boxplot appearance
    plt.setp(bp['boxes'], facecolor='salmon', alpha=0.6)
    plt.setp(bp['medians'], color='darkred', linewidth=2)
    plt.setp(bp['whiskers'], color='darkred', linewidth=2)
    plt.setp(bp['caps'], color='darkred', linewidth=2)

    plt.xlabel('Month', fontsize=fontsize)
    plt.ylabel(f'% Change in {metric.capitalize()}', fontsize=fontsize)
    plt.grid(True, alpha=0.3)

    # Set y-axis to show from -10% to maximum
    plt.ylim(bottom=-10)

    # Set x-axis labels
    plt.xticks(range(len(months)), months, rotation=45)
    plt.locator_params(axis='x', nbins=5)

    plt.tight_layout()

    output_file = f"{output_dir}/monthly_boxplot_pct_changes_{metric}.png"
    plt.savefig(output_file, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"Saved {output_file}")


In [None]:
# Set up input/output paths
METRICS_DIR = './hn-stories-gh-ai-metrics.csv'  # Despite the name, a single CSV file; passed to plot_metrics as input_csv
OUTPUT_DIR = './graphs'  # Passed to plot_metrics as output_dir

def plot_metrics(input_csv, output_dir):
    """Load the combined metrics CSV and generate every plot for every metric.

    Args:
        input_csv: Path of the CSV handed to ``load_and_clean_metric``.
        output_dir: Directory for the generated figures; created if missing.

    For each name in the module-level ``metrics`` list, the per-metric
    DataFrame is passed as ``(df, metric, output_dir)`` to the six plotting
    functions in turn.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Load all metrics at once from the single input CSV
    metric_dfs = load_and_clean_metric(input_csv)

    plotters = (
        plot_raw_metrics,  # was misspelled `plot_raw_metric` (undefined name)
        plot_raw_changes,
        plot_percentage_changes,
        plot_monthly_raw_boxplots,
        plot_monthly_changes_boxplots,
        plot_monthly_pct_changes_boxplots,
    )

    # Now call the plotting functions with each metric DataFrame
    for metric in metrics:
        df = metric_dfs[metric]
        for plot in plotters:
            plot(df, metric, output_dir)

# Call the main function
plot_metrics(METRICS_DIR, OUTPUT_DIR)

repo_full_name                                                                  
01-ai/Yi                                            0           0           0   
0ut0flin3/openai-davinci003-python-speech           0           0           0   
0x4D31/galah                                        0           0           0   
0xMesto/chatgptAPI_unof                             0           0           0   
0xStabby/chatgpt-vim                                0           0           0   
...                                               ...         ...         ...   
zinedkaloc/aicomponent.dev                          0           0           0   
zjohn77/lightning-mlflow-hf                         0           0           0   
zkonduit/ezkl                                       0           0           0   
zmoustafa/insulaai                                  0           0           0   
zuccs/disallow-chatgpt                              0           0           0   

month                      