In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from tqdm import tqdm

# Load CSV data (read_csv is asked to parse "month" as a date) and order it by
# repository, then month. sort_values returns a new frame, so the raw frame is
# left untouched instead of being mutated in place.
df_raw = pd.read_csv("hn-stories-gh-ai-metrics.csv", parse_dates=["month"])
df_sorted = df_raw.sort_values(by=["repo_full_name", "month"])

# List of metrics to analyze
# Alternatives: ["stars"] or ["cumulative_stars", "cumulative_forks"]
metrics = ["stars", "forks", "commits", "PRs", "contributors"]

# Drop rows with any 0 values in the metrics
# (use .any(axis=1) instead of .all(axis=1) to drop only rows that are all 0)
has_no_zero = (df_sorted[metrics] != 0).all(axis=1)
df_nonzero = df_sorted[has_no_zero]

# Drop rows with outlier values in the metrics using the 1.5 * IQR rule:
# a row is removed when ANY metric falls outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# (A z-score alternative would be: np.abs(x - x.mean()) / x.std() < 3.)
Q1 = df_nonzero[metrics].quantile(0.25)
Q3 = df_nonzero[metrics].quantile(0.75)
IQR = Q3 - Q1
below_fence = df_nonzero[metrics] < (Q1 - 1.5 * IQR)
above_fence = df_nonzero[metrics] > (Q3 + 1.5 * IQR)
is_outlier = (below_fence | above_fence).any(axis=1)

# Take an explicit copy: the column assignment below would otherwise write into
# a frame derived from a boolean-filtered selection (SettingWithCopy hazard).
df = df_nonzero[~is_outlier].copy()

# Print size of the remaining dataset
print(f"Number of repositories: {df['repo_full_name'].nunique()}")

# Convert month to datetime format
# NOTE(review): "month" is already requested as a date in read_csv, so this
# conversion is presumably a no-op on an already-datetime column -- confirm.
df["month"] = pd.to_datetime(df["month"], format="%Y-%m", errors="coerce")

# errors="coerce" turns unparseable values into NaT without complaint; report
# them so rows do not vanish from the plots unnoticed.
n_missing_months = int(df["month"].isna().sum())
if n_missing_months:
    print(f"Warning: {n_missing_months} rows have a missing or unparseable month")
print(df["month"].head())

# Ensure month is sorted. A stable sort keeps the repository order established
# above within each month, so row order (and anything derived from it, such as
# the order in which repositories are first seen) is reproducible across runs.
df = df.sort_values(by="month", kind="stable")

# Global plot configuration shared by every figure generated below.
figsize = (8, 7)  # passed to matplotlib as the figure size
plt.rcParams["font.size"] = 23
# NOTE(review): seaborn's context presumably sets font sizes as well, so it may
# supersede the rcParams value above -- confirm which one takes effect.
sns.set_context("notebook", font_scale=2.5)  # For boxplot

def plot_metrics(df_plot, metric, kind):
    """Plot one metric against month and save the figure as a PDF.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Data to plot. The "month" column and the `metric` column are read;
        "repo_full_name" is additionally read for scatter plots (point colour).
    metric : str
        Name of the column drawn on the y-axis.
    kind : str
        "scatter" for one point per row coloured by repository, or "box" for
        one box per month with fliers hidden.

    Raises
    ------
    ValueError
        If `kind` is neither "scatter" nor "box".

    Side effects
    ------------
    Writes ``rq3_<Metric>_<kind>.pdf`` relative to the current directory and
    closes the figure afterwards. The figure size comes from the module-level
    ``figsize`` setting.
    """
    # Validate up front: an unknown kind used to fall through both branches and
    # silently save an empty figure.
    if kind not in ("scatter", "box"):
        raise ValueError(f"Unknown plot kind {kind!r}; expected 'scatter' or 'box'")

    # The file name keeps the original title-cased form so that existing
    # references to the generated PDFs remain valid.
    file_label = metric.replace("_", " ").title()
    # The axis label only upper-cases the first letter of each word, so that
    # acronyms such as "PRs" are not mangled into "Prs" the way str.title() does.
    axis_label = " ".join(word[:1].upper() + word[1:] for word in metric.split("_"))

    fig, ax = plt.subplots(figsize=figsize)
    try:
        if kind == "scatter":
            sns.scatterplot(
                data=df_plot,
                x="month",
                y=metric,
                hue="repo_full_name",
                palette="tab20",
                edgecolor=None,
                alpha=0.5,
                legend=False,
                ax=ax,
            )
        else:
            sns.boxplot(data=df_plot, x="month", y=metric, showfliers=False, ax=ax)

        # Rotation is set through tick_params so it also applies to ticks
        # created after the locator is replaced below.
        ax.tick_params(axis="x", labelrotation=45)
        # NOTE(review): for kind="box" the x axis is categorical; depending on
        # the seaborn version, swapping the locator may leave tick labels
        # attached to the wrong positions -- verify the rendered labels.
        ax.xaxis.set_major_locator(plt.MaxNLocator(6))

        ax.set_ylabel(f"Changes in {axis_label}")
        ax.set_xlabel("")
        ax.grid(True)
        fig.savefig(f"rq3_{file_label}_{kind}.pdf", bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving fails, so that
        # repeated calls in a loop do not accumulate open figures.
        plt.close(fig)

# Render every metric twice: first as a raw scatter plot, then as a boxplot.
# The progress bar advances once per metric.
for metric in tqdm(metrics, desc="Generating plots"):
    for plot_kind in ("scatter", "box"):
        plot_metrics(df, metric, plot_kind)


Number of repositories: 1032
19   2023-12-01
20   2024-01-01
21   2024-02-01
22   2024-03-01
23   2024-04-01
Name: month, dtype: datetime64[ns]


Generating plots: 100%|██████████| 5/5 [00:06<00:00,  1.21s/it]
