In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path

# Paths
RAW_DATA_PATH = "/info/raid-etu/m2/s2308975/big_data/repositories"
PROCESSED_DATA_PATH = "/info/raid-etu/m2/s2405959/BigData/data/processed"
TIMESERIES_PATH = f"{PROCESSED_DATA_PATH}/timeseries"

# Select one repository to explore
REPO_NAME = "0voice__interview_internal_reference"

# Repository Data Exploration

Comparing raw data vs processed time series for model input.

## Load Raw Data

In [2]:
# Load raw data for the selected repository
repo_path = Path(RAW_DATA_PATH) / REPO_NAME

# Load all raw data files
commits_raw = pd.read_csv(repo_path / "commits.csv")
issues_raw = pd.read_csv(repo_path / "issues.csv")
prs_raw = pd.read_csv(repo_path / "pull_requests.csv")
repository_info = pd.read_csv(repo_path / "repository.csv")
stargazers_raw = pd.read_csv(repo_path / "stargazers.csv")

print(f"Repository: {REPO_NAME}")
print(f"{'='*70}\n")

Repository: 0voice__interview_internal_reference



In [3]:
# Repository information
print("Repository Information")
print("-" * 50)
print(repository_info.to_string())
print()

Repository Information
--------------------------------------------------
     run_timestamp repo_owner                     repo_name                            full_name                                      description language            created_at            updated_at             pushed_at default_branch  license  stargazers_count  forks_count  watchers_count  open_issues_count  is_private  is_fork  is_archived  has_issues  has_projects  has_wiki  size_kb  commits_count  issues_count  pull_requests_count  stargazer_events                  fetched_at
0  20251125_102446     0voice  interview_internal_reference  0voice/interview_internal_reference  2025年最新总结，阿里，腾讯，百度，美团，头条等技术面试题目，以及答案，专家出题人分析汇总。   Python  2019-06-10T06:54:19Z  2025-11-27T16:28:08Z  2025-10-22T09:39:06Z         master      NaN             37065         9448           37065                 35       False    False        False        True          True      True     1161            378            51                   45 

In [4]:
# Raw data statistics
print("\nRaw Data Statistics")
print("-" * 50)
print(f"Commits:         {len(commits_raw):,}")
print(f"Issues:          {len(issues_raw):,}")
print(f"Pull Requests:   {len(prs_raw):,}")
print(f"Stargazers:      {len(stargazers_raw):,}")


Raw Data Statistics
--------------------------------------------------
Commits:         378
Issues:          51
Pull Requests:   45
Stargazers:      37,065


In [5]:
# Sample commits
print("\nCommits Sample (first 10 rows)")
print("-" * 50)
print(commits_raw.head(10))


Commits Sample (first 10 rows)
--------------------------------------------------
                                        sha author_login author_name  \
0  a549376da3ff8430a03e00ad7e1878ca00a063ad   wangbojing  wangbojing   
1  9fe6c758e98c03c40c8908c39e78ab98a7ab53d6   wangbojing  wangbojing   
2  bb7bf70e0645fbf2a24a3d53066a6057f12aa748   wangbojing         王博靖   
3  50dc16bcac94fd7e7559e34d221f5c38634db3ef       yttsam      yttsam   
4  68957aa24e94d6847862b3886d80eae904b560b0       yttsam      yttsam   
5  e0926ff3d2aa1cc38f9fb753a367aa4ea56c8b73       yttsam      yttsam   
6  f8ebcfe1dceb76593f080b5e0194a037c267f7d3       yttsam      yttsam   
7  0049428aea4a1e8f538719628f60222ff1093734       yttsam      yttsam   
8  38349102b7eb4a9c280a452dfe3fef769d2caa06       yttsam      yttsam   
9  95eba1c94e6186a5841053f488631edf9d70ba60       yttsam      yttsam   

                               author_email           author_date  \
0       wangbojing@users.noreply.github.com  2025-10-22

In [6]:
# Sample issues
print("\nIssues Sample (first 10 rows)")
print("-" * 50)
print(issues_raw.head(10))


Issues Sample (first 10 rows)
--------------------------------------------------
          id  number   state                                       title  \
0  461272768       1  closed                                1.1.2 sqrt实现   
1  468016266       3  closed                            README里好多题没有答案哦？   
2  468523520       5  closed                                   这是关于哪里岗位的   
3  468943237       6  closed                                  中间好多没有答案的呢   
4  469006916       8  closed                                    标题确定没问题？   
5  469560121       9  closed    1.1.9 输入 ping IP 后敲回车，发包前会发生什么？这题回答讲的不明白   
6  470217049      12    open         1.1.4 LRU缓存机制的 Python 实现不好，并不是 O(1)   
7  470379252      13  closed                                     数据库优化思路   
8  470481745      14  closed  Can not clone the repo: File name too long   
9  470688711      15    open                                  1.3.5 答案有误   

      user_login  labels  assignees  is_locked  comments  \
0  github-linong     

## Load Processed Data

In [7]:
# Load quarterly aggregated data
quarterly_data = pd.read_csv(f"{PROCESSED_DATA_PATH}/quarterly_aggregated.csv")
repo_quarterly = quarterly_data[quarterly_data['repo_id'] == REPO_NAME].copy()

print(f"\nQuarterly Aggregated Data: {REPO_NAME}")
print("-" * 50)
print(f"Number of quarters: {len(repo_quarterly)}")
print(f"Shape: {repo_quarterly.shape}")
print(f"\n{repo_quarterly}")


Quarterly Aggregated Data: 0voice__interview_internal_reference
--------------------------------------------------
Number of quarters: 27
Shape: (27, 10)

                                 repo_id  year  quarter  commit_count  \
0   0voice__interview_internal_reference  2019        2         190.0   
1   0voice__interview_internal_reference  2019        3          78.0   
2   0voice__interview_internal_reference  2019        4          45.0   
3   0voice__interview_internal_reference  2020        1           4.0   
4   0voice__interview_internal_reference  2020        2           4.0   
5   0voice__interview_internal_reference  2020        3           7.0   
6   0voice__interview_internal_reference  2020        4           2.0   
7   0voice__interview_internal_reference  2021        1           1.0   
8   0voice__interview_internal_reference  2021        2           6.0   
9   0voice__interview_internal_reference  2021        3           2.0   
10  0voice__interview_internal_reference 

In [8]:
# Load metadata
with open(f"{TIMESERIES_PATH}/metadata.json", 'r') as f:
    metadata = json.load(f)

print("\nTime Series Metadata")
print("-" * 50)
for key, value in metadata.items():
    print(f"{key:20s}: {value}")


Time Series Metadata
--------------------------------------------------
lookback            : 4
horizon             : 1
n_features          : 7
feature_names       : ['commit_count', 'total_contributors', 'issue_count', 'issue_closed', 'pr_count', 'pr_merged', 'star_count']
activity_threshold  : 766.275
n_train             : 3593
n_dev               : 4928
n_test              : 6186
n_repositories      : 499


In [10]:
# Load training data
train_data = np.load(f"{TIMESERIES_PATH}/train.npz")
X_train = train_data['lookback_features']
y_train = train_data['target_metrics']

print("\nTraining Data")
print("-" * 50)
print(f"X_train shape: {X_train.shape}")
print(f"  Samples: {X_train.shape[0]:,}")
print(f"  Lookback: {X_train.shape[1]}")
print(f"  Features: {X_train.shape[2]}")
print(f"\ny_train shape: {y_train.shape}")
print(f"Features: {metadata['feature_names']}")


Training Data
--------------------------------------------------
X_train shape: (3593, 4, 7)
  Samples: 3,593
  Lookback: 4
  Features: 7

y_train shape: (3593, 1, 7)
Features: ['commit_count', 'total_contributors', 'issue_count', 'issue_closed', 'pr_count', 'pr_merged', 'star_count']


## Data Comparison

In [11]:
print("\nData Transformation Overview")
print("=" * 50)

print("\nRaw Data (event-level):")
print(f"  Commits: {len(commits_raw):,}")
print(f"  Issues: {len(issues_raw):,}")
print(f"  PRs: {len(prs_raw):,}")
print(f"  Stargazers: {len(stargazers_raw):,}")

print("\nAggregated Data (quarterly):")
print(f"  Quarters: {len(repo_quarterly)}")
print(f"  Features: {', '.join(metadata['feature_names'])}")

print("\nTime Series (model input):")
print(f"  Samples: {X_train.shape[0]:,}")
print(f"  Lookback: {metadata['lookback']} quarters")
print(f"  Horizon: {metadata['horizon']} quarter")
print(f"  Shape: ({metadata['lookback']}, {metadata['n_features']})")


Data Transformation Overview

Raw Data (event-level):
  Commits: 378
  Issues: 51
  PRs: 45
  Stargazers: 37,065

Aggregated Data (quarterly):
  Quarters: 27
  Features: commit_count, total_contributors, issue_count, issue_closed, pr_count, pr_merged, star_count

Time Series (model input):
  Samples: 3,593
  Lookback: 4 quarters
  Horizon: 1 quarter
  Shape: (4, 7)


In [12]:
# Example time series sample
sample_idx = 0
print(f"\nExample Sample {sample_idx}")
print("=" * 50)
print(f"Input shape: {X_train[sample_idx].shape}")
print(f"Target: {y_train[sample_idx]}\n")

print("Input features:")
print(f"{'Quarter':<10} " + " ".join(f"{feat:<12}" for feat in metadata['feature_names']))
print("-" * 100)

for t in range(X_train[sample_idx].shape[0]):
    print(f"t-{X_train[sample_idx].shape[0]-t-1:<8} ", end="")
    for f in range(X_train[sample_idx].shape[1]):
        print(f"{X_train[sample_idx][t, f]:>12.1f} ", end="")
    print()

print(f"\nPrediction target: {y_train[sample_idx]}")


Example Sample 0
Input shape: (4, 7)
Target: [[303.   1.   0.   0.   0.   0.   0.]]

Input features:
Quarter    commit_count total_contributors issue_count  issue_closed pr_count     pr_merged    star_count  
----------------------------------------------------------------------------------------------------
t-3                 1.0          1.0          0.0          0.0          0.0          0.0          0.0 
t-2               135.0          2.0          0.0          0.0          0.0          0.0          0.0 
t-1               129.0          1.0          0.0          0.0          0.0          0.0          0.0 
t-0               143.0          1.0          0.0          0.0          0.0          0.0          0.0 

Prediction target: [[303.   1.   0.   0.   0.   0.   0.]]


In [14]:
# Summary
print("\nSummary")
print("=" * 50)

print("\nGranularity:")
print("  Raw:        Event-level (individual commits, issues, PRs)")
print("  Processed:  Quarterly aggregations")
print("  Time Series: Sliding windows")

print("\nStructure:")
print("  Raw:        Multiple CSV files per repo")
print("  Processed:  Single CSV with metrics")
print("  Time Series: 3D arrays (samples × lookback × features)")

print("\nModel Input:")
print(f"  Window: {metadata['lookback']} consecutive quarters")
print(f"  Features: {metadata['n_features']} per quarter")
print(f"  Output: Next quarter prediction")


Summary

Granularity:
  Raw:        Event-level (individual commits, issues, PRs)
  Processed:  Quarterly aggregations
  Time Series: Sliding windows

Structure:
  Raw:        Multiple CSV files per repo
  Processed:  Single CSV with metrics
  Time Series: 3D arrays (samples × lookback × features)

Model Input:
  Window: 4 consecutive quarters
  Features: 7 per quarter
  Output: Next quarter prediction
