# setup

## setup github api

In [None]:
from google.colab import userdata
GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')

In [None]:
# GitHub API authentication
if not GITHUB_TOKEN:
    raise ValueError("Please set GITHUB_TOKEN environment variable")

headers = {
    'Authorization': f'token {GITHUB_TOKEN}',
    'Accept': 'application/vnd.github.v3+json'
}

In [None]:
TREATMENT_URLS_PATH = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/treatment_present/hn_rq3_repos_treatment_v6_unduplicated.csv"
TREATMENT_CHECKPOINT_PATH = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/treatment_historical"

OUTPUT_DIR = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/treatment_historical/hn_rq3_historical"

# v1 scatterplots

## v1

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

def load_and_clean_metric(file_path):
    """Load metric CSV and convert dates to datetime."""
    df = pd.read_csv(file_path, index_col=0)
    # Convert column names to datetime
    df.columns = pd.to_datetime(df.columns)
    return df

def plot_raw_metrics(metrics_dir, output_dir):
    """Create scatterplots of raw metrics."""
    metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/{metric}_metrics.csv")

        # Plot each repository
        for repo in df.index:
            data = df.loc[repo]
            valid_data = data[data != -1]  # Remove -1 values
            if not valid_data.empty:
                axes[idx].scatter(valid_data.index, valid_data.values,
                                alpha=0.3, s=20)

        axes[idx].set_title(f'{metric.capitalize()} Over Time')
        axes[idx].set_xlabel('Date')
        axes[idx].set_ylabel(metric.capitalize())
        axes[idx].grid(True, alpha=0.3)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/raw_metrics.png", dpi=300, bbox_inches='tight')
    plt.close()

def plot_percentage_changes(metrics_dir, output_dir):
    """Create scatterplots of percentage changes in metrics."""
    metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/{metric}_metrics.csv")

        # Calculate percentage changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            valid_mask = data != -1
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage change
                pct_change = valid_data.pct_change() * 100

                # Remove infinite values that can occur when dividing by zero
                pct_change = pct_change[~np.isinf(pct_change)]

                # Plot with reasonable y-axis limits
                valid_pct = pct_change[np.abs(pct_change) <= 100]  # Filter out extreme values
                if not valid_pct.empty:
                    axes[idx].scatter(valid_pct.index, valid_pct.values,
                                    alpha=0.3, s=20)

        axes[idx].set_title(f'{metric.capitalize()} Percentage Change Over Time')
        axes[idx].set_xlabel('Date')
        axes[idx].set_ylabel(f'% Change in {metric.capitalize()}')
        axes[idx].grid(True, alpha=0.3)

        # Set reasonable y-axis limits
        axes[idx].set_ylim(-10, 110)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/percentage_changes.png", dpi=300, bbox_inches='tight')
    plt.close()


## v2 - normal bar plot (mean and stdev)

- This version calculates the mean and standard deviation for each month and plots them using bars with error bars (std deviation).

- This visualization highlights the central tendency and spread but might not fully capture the distribution shape (e.g., skewness, percentiles).

- Std deviation can still show some variation even if the actual distribution is skewed.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

def load_and_clean_metric(file_path):
    """Load metric CSV and convert dates to datetime."""
    df = pd.read_csv(file_path, index_col=0)
    # Convert column names to datetime
    df.columns = pd.to_datetime(df.columns)
    return df

def plot_monthly_stats(metrics_dir, output_dir):
    """Create bar plots of monthly percentage changes."""
    metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/{metric}_metrics.csv")

        # Create dictionary to store monthly percentage changes
        monthly_changes = {}

        # Calculate percentage changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage changes
                pct_change = valid_data.pct_change() * 100
                # Remove infinite values
                pct_change = pct_change[~np.isinf(pct_change)]
                # Filter extreme values
                pct_change = pct_change[np.abs(pct_change) <= 100]

                for date, value in pct_change.items():
                    month_key = date.strftime('%Y-%m')
                    if month_key not in monthly_changes:
                        monthly_changes[month_key] = []
                    monthly_changes[month_key].append(value)

        # Calculate mean and std for each month
        months = sorted(monthly_changes.keys())
        means = []
        stds = []

        for month in months:
            values = monthly_changes[month]
            if values:  # Check if we have values for this month
                means.append(np.mean(values))
                stds.append(np.std(values))

        # Convert months to datetime for better x-axis formatting
        month_dates = pd.to_datetime([month + '-15' for month in months])

        # Plot bars with error bars
        bars = axes[idx].bar(month_dates, means, alpha=0.6,
                           yerr=stds, capsize=5,
                           color='skyblue', ecolor='navy')

        axes[idx].set_title(f'Monthly Average Percentage Change in {metric.capitalize()}')
        axes[idx].set_xlabel('Month')
        axes[idx].set_ylabel(f'Average Monthly % Change')
        axes[idx].grid(True, alpha=0.3)

        # Set reasonable y-axis limits
        axes[idx].set_ylim(-10, 110)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

        # Inside the main plotting loop, replace the text label section with:
        for i, (bar, mean, std) in enumerate(zip(bars, means, stds)):
            height = bar.get_height()
            # Alternate label positions above/below
            if i % 2 == 0:
                y_pos = height + std + 5  # Above bar
                va = 'bottom'
            else:
                y_pos = height - 5  # Below bar top
                va = 'top'

            # Shorter label format
            axes[idx].text(bar.get_x() + bar.get_width()/2, y_pos,
                        f'{mean:.0f}±{std:.0f}%',
                        ha='center', va=va, fontsize=8)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/monthly_percentage_stats.png", dpi=300, bbox_inches='tight')
    plt.close()

def plot_raw_metrics(metrics_dir, output_dir):
    """Create scatterplots of raw metrics."""
    metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/{metric}_metrics.csv")

        # Plot each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_data = data[(data != -1) & (data != -2)]
            if not valid_data.empty:
                axes[idx].scatter(valid_data.index, valid_data.values,
                                alpha=0.3, s=20)

        axes[idx].set_title(f'{metric.capitalize()} Over Time')
        axes[idx].set_xlabel('Date')
        axes[idx].set_ylabel(metric.capitalize())
        axes[idx].grid(True, alpha=0.3)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/raw_metrics.png", dpi=300, bbox_inches='tight')
    plt.close()

def plot_percentage_changes(metrics_dir, output_dir):
    """Create scatterplots of percentage changes in metrics."""
    metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/{metric}_metrics.csv")

        # Calculate percentage changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage change
                pct_change = valid_data.pct_change() * 100

                # Remove infinite values that can occur when dividing by zero
                pct_change = pct_change[~np.isinf(pct_change)]

                # Plot with reasonable y-axis limits
                valid_pct = pct_change[np.abs(pct_change) <= 100]  # Filter out extreme values
                if not valid_pct.empty:
                    axes[idx].scatter(valid_pct.index, valid_pct.values,
                                    alpha=0.3, s=20)

        axes[idx].set_title(f'{metric.capitalize()} Percentage Change Over Time')
        axes[idx].set_xlabel('Date')
        axes[idx].set_ylabel(f'% Change in {metric.capitalize()}')
        axes[idx].grid(True, alpha=0.3)

        # Set reasonable y-axis limits
        axes[idx].set_ylim(-10, 110)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/percentage_changes.png", dpi=300, bbox_inches='tight')
    plt.close()

## v3 - proper boxplot (median q1 q3)

- The box plot shows the quartiles (Q1, median, Q3), whiskers, and outliers but ignores the mean and std deviation directly.

- Since outliers are hidden (showfliers=False), the later months might appear to have no deviation even if the raw data had some fluctuations.

- If the data for later months has fewer points or is more concentrated, it could visually appear as if there is no spread at all in V2.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']
# metrics = ['pull_requests', 'commits']

def load_and_clean_metric(file_path):
    """Load metric CSV and convert dates to datetime."""
    df = pd.read_csv(file_path, index_col=0)
    # Convert column names to datetime
    df.columns = pd.to_datetime(df.columns)
    return df

def plot_monthly_stats(metrics_dir, output_dir):
    """Create box plots of monthly percentage changes."""
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 15*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/combined_{metric}_metrics.csv")

        # Create dictionary to store monthly percentage changes
        monthly_changes = {}

        # Calculate percentage changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage changes
                pct_change = valid_data.pct_change() * 100
                # Remove infinite values
                pct_change = pct_change[~np.isinf(pct_change)]
                # Filter extreme values
                pct_change = pct_change[np.abs(pct_change) <= 100]

                for date, value in pct_change.items():
                    month_key = date.strftime('%Y-%m')
                    if month_key not in monthly_changes:
                        monthly_changes[month_key] = []
                    monthly_changes[month_key].append(value)

        # Prepare data for box plot
        months = sorted(monthly_changes.keys())
        box_data = [monthly_changes[month] for month in months]

        # Convert months to datetime for better x-axis formatting
        month_dates = pd.to_datetime([month + '-15' for month in months])

        # Create box plot
        bp = axes[idx].boxplot(box_data,
                             positions=range(len(months)),
                             widths=0.7,
                             patch_artist=True,
                             showfliers=False)  # Hide outlier points for cleaner visualization

        # Customize box plot appearance
        plt.setp(bp['boxes'], facecolor='skyblue', alpha=0.6)
        plt.setp(bp['medians'], color='navy')
        plt.setp(bp['whiskers'], color='navy')
        plt.setp(bp['caps'], color='navy')

        # Set x-axis labels and format
        axes[idx].set_xticks(range(len(months)))
        axes[idx].set_xticklabels([date.strftime('%Y-%m') for date in month_dates])

        axes[idx].set_title(f'Monthly Distribution of Percentage Changes in {metric.capitalize()}')
        axes[idx].set_xlabel('Month')
        axes[idx].set_ylabel(f'Monthly % Change')
        axes[idx].grid(True, alpha=0.3)

        # Set reasonable y-axis limits
        axes[idx].set_ylim(0, 100)
        axes[idx].set_yticks(range(0, 100, 5))
        axes[idx].grid(True, alpha=0.5)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

        # Add summary statistics as text annotations
        # for i, (month, data) in enumerate(zip(months, box_data)):
        #     if data:  # Check if we have data for this month
        #         median = np.median(data)
        #         q1 = np.percentile(data, 25)
        #         q3 = np.percentile(data, 75)

        #         # Position text above the box
        #         axes[idx].text(i, axes[idx].get_ylim()[1] - 5,
        #                      f'M:{median:.0f}\nQ1:{q1:.0f}\nQ3:{q3:.0f}',
        #                      ha='center', va='top', fontsize=8)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/monthly_boxplot_stats.png", dpi=300, bbox_inches='tight')
    plt.close()

def plot_raw_metrics(metrics_dir, output_dir):
    """Create scatterplots of raw metrics."""
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/combined_{metric}_metrics.csv")

        # Plot each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_data = data[(data != -1) & (data != -2)]
            if not valid_data.empty:
                axes[idx].scatter(valid_data.index, valid_data.values,
                                alpha=0.3, s=20)

        axes[idx].set_title(f'{metric.capitalize()} Over Time')
        axes[idx].set_xlabel('Date')
        axes[idx].set_ylabel(metric.capitalize())
        axes[idx].grid(True, alpha=0.3)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/raw_metrics.png", dpi=300, bbox_inches='tight')
    plt.close()

def plot_percentage_changes(metrics_dir, output_dir):
    """Create scatterplots of percentage changes in metrics."""
    fig, axes = plt.subplots(len(metrics), 1, figsize=(15, 5*len(metrics)))

    for idx, metric in enumerate(metrics):
        df = load_and_clean_metric(f"{metrics_dir}/combined_{metric}_metrics.csv")

        # Calculate percentage changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage change
                pct_change = valid_data.pct_change() * 100

                # Remove infinite values that can occur when dividing by zero
                pct_change = pct_change[~np.isinf(pct_change)]

                # Plot with reasonable y-axis limits
                valid_pct = pct_change[np.abs(pct_change) <= 100]  # Filter out extreme values
                if not valid_pct.empty:
                    axes[idx].scatter(valid_pct.index, valid_pct.values,
                                    alpha=0.3, s=20)

        axes[idx].set_title(f'{metric.capitalize()} Percentage Change Over Time')
        axes[idx].set_xlabel('Date')
        axes[idx].set_ylabel(f'% Change in {metric.capitalize()}')
        axes[idx].grid(True, alpha=0.3)

        # Set reasonable y-axis limits
        axes[idx].set_ylim(-10, 110)

        # Rotate x-axis labels for better readability
        plt.setp(axes[idx].xaxis.get_majorticklabels(), rotation=45)

    plt.tight_layout()
    plt.savefig(f"{output_dir}/percentage_changes.png", dpi=300, bbox_inches='tight')
    plt.close()

# run

In [None]:
METRICS_DIR = "."  # Directory containing the metric CSVs
OUTPUT_DIR = "./output"    # Directory to save the plots

import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

plt.rcParams.update({'font.size': 22})

# Generate both types of plots
plot_raw_metrics(METRICS_DIR, OUTPUT_DIR)
plot_percentage_changes(METRICS_DIR, OUTPUT_DIR)
plot_monthly_stats(METRICS_DIR, OUTPUT_DIR)

# v2

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import os
import random
from matplotlib.colors import hsv_to_rgb
import matplotlib as mpl

# Set font size for better readability
plt.rcParams.update({'font.size': 20})
fontsize = 20

ticks = 5

# Define metrics
metrics = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']

# Function to load and clean metric data
def load_and_clean_metric(file_path):
    """Load metric CSV and convert dates to datetime."""
    df = pd.read_csv(file_path, index_col=0)
    # Convert column names to datetime
    df.columns = pd.to_datetime(df.columns)
    return df

# Function to generate unique colors for repositories
def generate_repo_colors(repos):
    """Generate unique colors for each repository."""
    num_repos = len(repos)
    colors = {}

    for i, repo in enumerate(repos):
        # Generate HSV colors with good saturation and value
        h = i / num_repos
        s = 0.7 + 0.3 * random.random()  # High saturation
        v = 0.7 + 0.3 * random.random()  # High value
        color = hsv_to_rgb((h, s, v))
        colors[repo] = color

    return colors

# 1. Raw Metric Scatterplot
def plot_raw_metrics(metrics_dir, output_dir):
    """Create scatterplots of raw metrics with different colors per repo."""
    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        df = load_and_clean_metric(file_path)
        repo_colors = generate_repo_colors(df.index)

        plt.figure(figsize=(8, 4))

        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_data = data[(data != -1) & (data != -2)]
            if not valid_data.empty:
                plt.scatter(valid_data.index, valid_data.values,
                           alpha=0.6, s=20, color=repo_colors[repo])

        # plt.title(f'{metric.capitalize()} Over Time', fontsize=fontsize)
        plt.xlabel('Date', fontsize=fontsize)
        plt.ylabel(metric.capitalize(), fontsize=fontsize)
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.locator_params(axis='x', nbins=5)
        plt.tight_layout()

        output_file = f"{output_dir}/raw_{metric}_metrics.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved {output_file}")

# 2. Raw Metric Changes Scatterplot
def plot_raw_changes(metrics_dir, output_dir):
    """Create scatterplots of raw changes in metrics."""
    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        df = load_and_clean_metric(file_path)
        repo_colors = generate_repo_colors(df.index)

        plt.figure(figsize=(8, 4))

        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate absolute change
                changes = valid_data.diff()

                # Remove NaN values that occur in the first position
                changes = changes.dropna()

                if not changes.empty:
                    plt.scatter(changes.index, changes.values,
                               alpha=0.6, s=20, color=repo_colors[repo])

        # plt.title(f'Absolute Change in {metric.capitalize()} Over Time', fontsize=fontsize)
        plt.xlabel('Date', fontsize=fontsize)
        plt.ylabel(f'Change in {metric.capitalize()}', fontsize=fontsize)
        plt.grid(True, alpha=0.3)
        plt.xticks(rotation=45)
        plt.locator_params(axis='x', nbins=5)
        plt.tight_layout()

        output_file = f"{output_dir}/raw_changes_{metric}.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved {output_file}")

# 3. Percentage Change Metric Scatterplot
def plot_percentage_changes(metrics_dir, output_dir):
    """Create scatterplots of percentage changes in metrics."""
    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        df = load_and_clean_metric(file_path)
        repo_colors = generate_repo_colors(df.index)

        plt.figure(figsize=(8, 4))

        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage change
                pct_change = valid_data.pct_change() * 100

                # Remove infinite values and NaN
                pct_change = pct_change[~np.isinf(pct_change)]
                pct_change = pct_change.dropna()

                # Filter to reasonable range
                valid_pct = pct_change[pct_change >= -10]

                if not valid_pct.empty:
                    plt.scatter(valid_pct.index, valid_pct.values,
                               alpha=0.6, s=20, color=repo_colors[repo])

        # plt.title(f'{metric.capitalize()} Percentage Change Over Time', fontsize=fontsize)
        plt.xlabel('Date', fontsize=fontsize)
        plt.ylabel(f'% Change in {metric.capitalize()}', fontsize=fontsize)
        plt.grid(True, alpha=0.3)

        # Set y-axis to show from -10% to maximum
        plt.ylim(bottom=-10)

        plt.xticks(rotation=45)
        plt.locator_params(axis='x', nbins=5)
        plt.tight_layout()

        output_file = f"{output_dir}/percentage_changes_{metric}.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved {output_file}")

# 4. Monthly Boxplot of Raw Metrics
def plot_monthly_raw_boxplots(metrics_dir, output_dir):
    """Create box plots of raw metrics by month."""
    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        df = load_and_clean_metric(file_path)

        # Transpose to have dates as rows and repos as columns
        df_t = df.T

        # Replace -1 and -2 with NaN
        df_t = df_t.replace([-1, -2], np.nan)

        # Group by month
        df_t['month'] = df_t.index.strftime('%Y-%m')
        monthly_data = {}

        for month, group in df_t.groupby('month'):
            # Extract all valid values for this month across all repos
            values = group.iloc[:, :-1].values.flatten()
            values = values[~np.isnan(values)]
            if len(values) > 0:
                monthly_data[month] = values

        # Prepare data for boxplot
        months = sorted(monthly_data.keys())
        box_data = [monthly_data[month] for month in months]

        # Create boxplot
        plt.figure(figsize=(8, 4))

        bp = plt.boxplot(box_data,
                          positions=range(len(months)),
                          widths=0.7,
                          patch_artist=True,
                          showfliers=False)

        # Customize boxplot appearance
        plt.setp(bp['boxes'], facecolor='skyblue', alpha=0.6)
        plt.setp(bp['medians'], color='navy', linewidth=2)
        plt.setp(bp['whiskers'], color='navy', linewidth=2)
        plt.setp(bp['caps'], color='navy', linewidth=2)

        # plt.title(f'Monthly Distribution of {metric.capitalize()} Values', fontsize=fontsize)
        plt.xlabel('Month', fontsize=fontsize)
        plt.ylabel(metric.capitalize(), fontsize=fontsize)
        plt.grid(True, alpha=0.3)

        # Set x-axis labels
        plt.xticks(range(len(months)), months, rotation=45)
        plt.locator_params(axis='x', nbins=5)

        plt.tight_layout()

        output_file = f"{output_dir}/monthly_boxplot_raw_{metric}.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved {output_file}")

# 5. Monthly Boxplot of Raw Metric Changes
def plot_monthly_changes_boxplots(metrics_dir, output_dir):
    """Create box plots of monthly changes in raw metrics."""
    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        df = load_and_clean_metric(file_path)

        # Create dictionary to store monthly changes
        monthly_changes = {}

        # Calculate changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate absolute changes
                changes = valid_data.diff().dropna()

                for date, value in changes.items():
                    month_key = date.strftime('%Y-%m')
                    if month_key not in monthly_changes:
                        monthly_changes[month_key] = []
                    monthly_changes[month_key].append(value)

        # Prepare data for boxplot
        months = sorted(monthly_changes.keys())
        box_data = [monthly_changes[month] for month in months]

        # Create boxplot
        plt.figure(figsize=(8, 4))

        bp = plt.boxplot(box_data,
                          positions=range(len(months)),
                          widths=0.7,
                          patch_artist=True,
                          showfliers=False)

        # Customize boxplot appearance
        plt.setp(bp['boxes'], facecolor='lightgreen', alpha=0.6)
        plt.setp(bp['medians'], color='darkgreen', linewidth=2)
        plt.setp(bp['whiskers'], color='darkgreen', linewidth=2)
        plt.setp(bp['caps'], color='darkgreen', linewidth=2)

        # plt.title(f'Monthly Distribution of Changes in {metric.capitalize()}', fontsize=fontsize)
        plt.xlabel('Month', fontsize=fontsize)
        plt.ylabel(f'Change in {metric.capitalize()}', fontsize=fontsize)
        plt.grid(True, alpha=0.3)

        # Set x-axis labels
        plt.xticks(range(len(months)), months, rotation=45)
        plt.locator_params(axis='x', nbins=5)

        plt.tight_layout()

        output_file = f"{output_dir}/monthly_boxplot_changes_{metric}.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved {output_file}")

# 6. Monthly Boxplot of Percentage Changes
def plot_monthly_pct_changes_boxplots(metrics_dir, output_dir):
    """Create box plots of monthly percentage changes in metrics."""
    for metric in metrics:
        file_path = f"{metrics_dir}/rq3_combined_{metric}_metrics.csv"
        if not os.path.exists(file_path):
            print(f"Warning: File {file_path} not found, skipping metric {metric}")
            continue

        df = load_and_clean_metric(file_path)

        # Create dictionary to store monthly percentage changes
        monthly_changes = {}

        # Calculate percentage changes for each repository
        for repo in df.index:
            data = df.loc[repo]
            # Remove -1 and -2 values
            valid_mask = (data != -1) & (data != -2)
            valid_data = data[valid_mask]

            if len(valid_data) > 1:
                # Calculate percentage changes
                pct_change = valid_data.pct_change() * 100
                # Remove infinite values
                pct_change = pct_change[~np.isinf(pct_change)]
                # Filter to show from -10% up
                pct_change = pct_change[pct_change >= -10]

                for date, value in pct_change.items():
                    if not np.isnan(value):
                        month_key = date.strftime('%Y-%m')
                        if month_key not in monthly_changes:
                            monthly_changes[month_key] = []
                        monthly_changes[month_key].append(value)

        # Prepare data for boxplot
        months = sorted(monthly_changes.keys())
        box_data = [monthly_changes[month] for month in months]

        # Create boxplot
        plt.figure(figsize=(8, 4))

        bp = plt.boxplot(box_data,
                          positions=range(len(months)),
                          widths=0.7,
                          patch_artist=True,
                          showfliers=False)

        # Customize boxplot appearance
        plt.setp(bp['boxes'], facecolor='salmon', alpha=0.6)
        plt.setp(bp['medians'], color='darkred', linewidth=2)
        plt.setp(bp['whiskers'], color='darkred', linewidth=2)
        plt.setp(bp['caps'], color='darkred', linewidth=2)

        # plt.title(f'Monthly Distribution of Percentage Changes in {metric.capitalize()}', fontsize=fontsize)
        plt.xlabel('Month', fontsize=fontsize)
        plt.ylabel(f'% Change in {metric.capitalize()}', fontsize=fontsize)
        plt.grid(True, alpha=0.3)

        # Set y-axis to show from -10% to maximum
        plt.ylim(bottom=-10)

        # Set x-axis labels
        plt.xticks(range(len(months)), months, rotation=45)
        plt.locator_params(axis='x', nbins=5)

        plt.tight_layout()

        output_file = f"{output_dir}/monthly_boxplot_pct_changes_{metric}.png"
        plt.savefig(output_file, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Saved {output_file}")


In [16]:
# Set up directory paths
METRICS_DIR = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/csv"  # Directory containing the metric CSVs
OUTPUT_DIR = "/content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3"  # Directory to save the plots

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Generate all plots
print("Generating raw metric scatterplots...")
plot_raw_metrics(METRICS_DIR, OUTPUT_DIR)

print("Generating raw metric changes scatterplots...")
plot_raw_changes(METRICS_DIR, OUTPUT_DIR)

print("Generating percentage change scatterplots...")
plot_percentage_changes(METRICS_DIR, OUTPUT_DIR)

print("Generating monthly boxplots of raw metrics...")
plot_monthly_raw_boxplots(METRICS_DIR, OUTPUT_DIR)

print("Generating monthly boxplots of raw metric changes...")
plot_monthly_changes_boxplots(METRICS_DIR, OUTPUT_DIR)

print("Generating monthly boxplots of percentage changes...")
plot_monthly_pct_changes_boxplots(METRICS_DIR, OUTPUT_DIR)

print("All visualizations completed!")

Generating raw metric scatterplots...


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_stars_metrics.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_commits_metrics.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_pull_requests_metrics.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_forks_metrics.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_contributors_metrics.png
Generating raw metric changes scatterplots...


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_changes_stars.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_changes_commits.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_changes_pull_requests.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_changes_forks.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/raw_changes_contributors.png
Generating percentage change scatterplots...


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/percentage_changes_stars.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/percentage_changes_commits.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/percentage_changes_pull_requests.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/percentage_changes_forks.png


  plt.locator_params(axis='x', nbins=5)


Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/percentage_changes_contributors.png
Generating monthly boxplots of raw metrics...
Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/monthly_boxplot_raw_stars.png
Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/monthly_boxplot_raw_commits.png
Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/monthly_boxplot_raw_pull_requests.png
Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/monthly_boxplot_raw_forks.png
Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/monthly_boxplot_raw_contributors.png
Generating monthly boxplots of raw metric changes...
Saved /content/drive/MyDrive/datasets/muict-naist-senior/rq3/rq3_historical_metrics/graphs/v3/monthly_boxplot_changes_stars.png
Saved 

In [None]:
!zip -r output.zip /output