# v1 combine all output csv (naive)

In [None]:
import pandas as pd
import glob

# List of metrics
metrics = ["commits", "contributors", "forks", "pull_requests", "stars"]

# Loop through each metric
for metric in metrics:
    # Get all CSV files for the current metric
    csv_files = sorted(glob.glob(f"*_{metric}_metrics.csv"))

    # Read and concatenate all CSV files for this metric
    df_list = [pd.read_csv(file, index_col=0) for file in csv_files]
    combined_df = pd.concat(df_list)

    # Drop duplicates if any (keeping the first occurrence)
    combined_df = combined_df[~combined_df.index.duplicated(keep='first')]

    # Save the combined CSV
    combined_df.to_csv(f"{metric}_combined.csv")
    print(f"Saved {metric}_combined.csv")


Saved commits_combined.csv
Saved contributors_combined.csv
Saved forks_combined.csv
Saved pull_requests_combined.csv
Saved stars_combined.csv


# v2 - csv metric merger (account for non-matching dates/repo)

In [None]:
# Simple CSV Merger for GitHub Metrics Data

import pandas as pd
import os
import glob
import re
from datetime import datetime

# Function to combine CSV files by metric type
def combine_metric_files(directory='.', output_dir='output'):
    """
    Combines CSV files by metric type (commits, pull_requests, stars, contributors, forks)
    and saves them as separate files.

    Args:
        directory (str): Directory containing the input CSV files
        output_dir (str): Directory to save the output files
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Define metric types to look for
    metric_types = ['commits', 'pull_requests', 'stars', 'contributors', 'forks']

    # Process each metric type
    for metric_type in metric_types:
        print(f"Processing {metric_type} files...")

        # Find all CSV files for this metric type
        pattern = f"combined_{metric_type}_metrics.csv"
        files = glob.glob(os.path.join(directory, pattern))

        if not files:
            print(f"  - No files found for {metric_type}")
            continue

        print(f"  - Found {len(files)} files")

        # List to store dataframes for this metric type
        dfs = []

        # Process each file
        for file in files:
            try:
                # Read the file with the first column as index (GitHub repo URLs)
                df = pd.read_csv(file, index_col=0)

                print(f"    - Reading {os.path.basename(file)} with shape {df.shape}")

                # Add to the list of dataframes
                dfs.append(df)
            except Exception as e:
                print(f"    - Error reading {file}: {e}")

        if not dfs:
            print(f"  - No valid data found for {metric_type}")
            continue

        # Combine all dataframes for this metric type
        # This will automatically align the data by index (repo URLs) and columns (dates)
        # Missing values will be NaN
        combined_df = pd.concat(dfs, axis=1)

        # Remove duplicate columns if any
        combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

        # Sort columns (dates) chronologically
        try:
            combined_df = combined_df.reindex(sorted(combined_df.columns,
                                                    key=lambda x: datetime.strptime(x, '%Y-%m')),
                                             axis=1)
        except Exception as e:
            print(f"    - Warning: Could not sort columns chronologically: {e}")

        # Save the combined file
        output_file = os.path.join(output_dir, f"{metric_type}_combined.csv")
        combined_df.to_csv(output_file)

        print(f"  - Saved combined file to {output_file} with shape {combined_df.shape}")
        print(f"  - Contains data for {len(combined_df.index)} repositories and {len(combined_df.columns)} date columns")


In [None]:
# Run the function
combine_metric_files()


Processing commits files...
  - Found 6 files
    - Reading [grace_6months]_commits_metrics.csv with shape (96, 6)
    - Reading [grace]_commits_metrics.csv with shape (85, 27)
    - Reading [thai]_commits_metrics.csv with shape (92, 27)
    - Reading [thai_6months]_commits_metrics.csv with shape (100, 6)
    - Reading [pao]_commits_metrics.csv with shape (115, 27)
    - Reading [pao_6months]_commits_metrics.csv with shape (96, 6)
  - Saved combined file to output/commits_combined.csv with shape (292, 33)
  - Contains data for 292 repositories and 33 date columns
Processing pull_requests files...
  - Found 6 files
    - Reading [thai]_pull_requests_metrics.csv with shape (92, 27)
    - Reading [thai_6months]_pull_requests_metrics.csv with shape (100, 6)
    - Reading [pao]_pull_requests_metrics.csv with shape (115, 27)
    - Reading [grace_6months]_pull_requests_metrics.csv with shape (96, 6)
    - Reading [pao_6months]_pull_requests_metrics.csv with shape (96, 6)
    - Reading [grace]

In [None]:
# Example of loading and displaying a combined file
# Uncomment these lines to see a preview of the combined data
"""
# Load a combined file
combined_commits = pd.read_csv('output/commits_combined.csv', index_col=0)

# Display the first few rows
print("\nSample of combined commits data:")
display(combined_commits.head())

# Display basic statistics
print("\nBasic statistics:")
print(f"Number of repositories: {len(combined_commits.index)}")
print(f"Date range: {combined_commits.columns[0]} to {combined_commits.columns[-1]}")
print(f"Total data points: {combined_commits.count().sum()}")
"""

# format for RQ3 thai sentiment hackernews submission

In [4]:
import pandas as pd
from datetime import datetime, timedelta
import os

def load_metrics(metrics_dir):
    """
    Load all metric CSVs into a dictionary of DataFrames.

    Args:
        metrics_dir (str): Directory containing metric CSV files

    Returns:
        dict: Dictionary with metric names as keys and DataFrames as values
    """
    metrics = {}
    metric_files = ['stars', 'commits', 'pull_requests', 'forks', 'contributors']

    for metric in metric_files:
        file_path = os.path.join(metrics_dir, f'combined_{metric}_metrics.csv')
        if os.path.exists(file_path):
            metrics[metric] = pd.read_csv(file_path, index_col=0)
        else:
            print(f"Warning: {metric}_metrics.csv not found")

    return metrics

def unix_to_datetime(unix_timestamp):
    """Convert Unix timestamp to datetime object."""
    return datetime.fromtimestamp(int(unix_timestamp))

def find_closest_date_column(df, target_date):
    """
    Find the closest date column in the metrics DataFrame to the target date.

    Args:
        df (pd.DataFrame): Metrics DataFrame
        target_date (datetime): Target date to match

    Returns:
        str: Name of the closest date column
    """
    # Convert column names to datetime objects
    date_cols = [datetime.strptime(col, '%Y-%m') for col in df.columns]

    # Find the column with the minimum absolute difference
    closest_date = min(date_cols, key=lambda x: abs(x - target_date))

    # Convert back to original format
    return closest_date.strftime('%Y-%m')

def get_metric_value(df, repo_url, date_col):
    """
    Get metric value for a specific repository and date.
    Returns -1 if data is not available.
    """
    try:
        return df.loc[repo_url, date_col]
    except:
        return -1

def process_metrics(hn_data_path, metrics_dir, output_path):
    """
    Process metrics and create consolidated CSV file.

    Args:
        hn_data_path (str): Path to HackerNews submission data CSV
        metrics_dir (str): Directory containing metric CSVs
        output_path (str): Path for output CSV file
    """
    # Load HackerNews submission data
    hn_data = pd.read_csv(hn_data_path)

    # Load all metrics
    metrics_dict = load_metrics(metrics_dir)
    if not metrics_dict:
        raise ValueError("No metric files found")

    # Initialize result DataFrame with HackerNews metadata
    result_df = hn_data.copy()

    # Process each repository
    for idx, row in result_df.iterrows():
        repo_url = row['url']
        submission_date = unix_to_datetime(row['date'])

        # Process each metric type
        for metric_name, metric_df in metrics_dict.items():
            # Get metrics at submission date
            closest_submit_date = find_closest_date_column(metric_df, submission_date)
            result_df.at[idx, f'{metric_name}_at_submission'] = get_metric_value(
                metric_df, repo_url, closest_submit_date)

            # Get metrics for each month after submission
            for month in range(1, 6):
                target_date = submission_date + timedelta(days=30 * month)
                closest_month_date = find_closest_date_column(metric_df, target_date)
                result_df.at[idx, f'{metric_name}_month_{month}'] = get_metric_value(
                    metric_df, repo_url, closest_month_date)

    # Save the result
    result_df.to_csv(output_path, index=False)
    print(f"Saved consolidated metrics to {output_path}")

    # Print sample of the output
    print("\nFirst few rows of the output:")
    print(result_df.head())

In [5]:
# Update these paths according to your file structure
HN_DATA_PATH = "/content/drive/MyDrive/datasets/muict-naist-senior/rq1/rq1_freq_analysis/rq1_stories_github.csv"
METRICS_DIR = "./"
OUTPUT_PATH = "consolidated_metrics.csv"

process_metrics(HN_DATA_PATH, METRICS_DIR, OUTPUT_PATH)

Saved consolidated metrics to consolidated_metrics.csv

First few rows of the output:
   discussion_id                                              title  \
0       31355348  BlindAI: Open-source, fast and privacy-friendl...   
1       31405976          OpenAI Codex Python to C++ Code Generator   
2       31831437                                   Gemini with IPFS   
3       31846593  YaLM-100B: Pretrained language model with 100B...   
4       32458048  Paradigms of Artificial Intelligence Programmi...   

                                           url        date  \
0  https://github.com/mithril-security/blindai  1652368380   
1    https://github.com/alxschwrz/codex_py2cpp  1652761021   
2     https://github.com/JonStratton/geminipfs  1655865106   
3          https://github.com/yandex/YaLM-100B  1655974826   
4          https://github.com/norvig/paip-lisp  1660472194   

   stars_at_submission  stars_month_1  stars_month_2  stars_month_3  \
0                222.0          236.0      