# IMPORTS

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from pathlib import Path
import json


# Import raw data

**Get raw Json files from data/raw**

In [34]:
raw_data_path = Path('../data/raw')
raw_json_files = list(raw_data_path.glob('*.json'))

print(f'{len(raw_json_files)} raw data files found:\n')
for file in raw_json_files:
    print(f'\t-{file.name}')

6 raw data files found:

	-scikit-learn_scikit-learn_20251110_075413.json
	-mlflow_mlflow_20251110_081117.json
	-combined_data_20251110_083013.json
	-numpy_numpy_20251110_075413.json
	-pandas-dev_pandas_20251110_075413.json
	-apache_airflow_20251110_081117.json


**load data for each repo**

In [35]:
repo_data = {}

# store data in  repo as a list of dictionaries:
for file in raw_json_files:
    with open(file, 'r') as f:
        data = json.load(f)

    # Extract filename to use as key in repo_data:
    repo_name = file.stem.rsplit('_', 2)[0].replace('_', '/', 1)
    repo_data[repo_name] = data

print(f'Loaded data for {len(repo_data)} repositories:\n')
for repo in repo_data.keys():
    print(f'\t-{repo}')

Loaded data for 6 repositories:

	-scikit-learn/scikit-learn
	-mlflow/mlflow
	-combined/data
	-numpy/numpy
	-pandas-dev/pandas
	-apache/airflow


**Checkout one repo data structure**

In [36]:
for repo in repo_data.keys():
    if repo == 'pandas-dev/pandas':
        repo_pd = repo_data[repo]

        print("Keys in the data:")
        print(repo_pd.keys())
        print(f'\nNumber of issues: {len(repo_pd['issues'])}')
        print(f'Number of commits: {len(repo_pd['commits'])}')

Keys in the data:
dict_keys(['repository', 'collected at', 'info', 'issues', 'commits'])

Number of issues: 100
Number of commits: 100


*check structure of issues*

In [37]:
#Examine one issue:
issues_chk = repo_pd['issues'][0]

print("Issue keys:")
print(f'\n{issues_chk.keys()}')
print("\n--- Sample Issue ---\n")
print(f"Title: {issues_chk.get('title')}")
print(f"\nState: {issues_chk.get('state')}")
print(f"\nCreated: {issues_chk.get('created_at')}")
print(f"\nUser: {issues_chk.get('user', {}).get('login')}")

Issue keys:

dict_keys(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'sub_issues_summary', 'issue_dependencies_summary', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'])

--- Sample Issue ---

Title: ENH: Optional preservation of Sparse columns in Parquet/Feather via Arrow metadata

State: open

Created: 2025-11-09T02:01:16Z

User: antznette1


*check structure of commits*

In [38]:
# Examine one commit
commits_chk = repo_pd['commits'][0]
print("Commit keys:")
print(commits_chk.keys())
print("\n--- Sample Commit ---")
print(f"\nSHA: {commits_chk.get('sha')}")
print(f"\nAuthor: {commits_chk.get('commit', {}).get('author', {}).get('name')}")
print(f"\nDate: {commits_chk.get('commit', {}).get('author', {}).get('date')}")
print(f"\nMessage: {commits_chk.get('commit', {}).get('message', '')[:100]}...")

Commit keys:
dict_keys(['sha', 'node_id', 'commit', 'url', 'html_url', 'comments_url', 'author', 'committer', 'parents'])

--- Sample Commit ---

SHA: f4851e500a43125d505db64e548af0355227714b

Author: Matthew Roeschke

Date: 2025-11-08T20:40:33Z

Message: DOC: Remove ..versionadded directives before 2.0 (#63035)

Co-authored-by: Richard Shadrach <4556240...


# DATA WRANGLING

## DATA EXTRACTION AND FLATENING

**Define extract methods**

In [46]:
def extract_issues(repo_data):
    """Extract and flatten issues from all repositories"""
    all_issues = []

    for repo_name, data in repo_data.items():
        if 'combined' in repo_name:
            continue
        issues = data.get('issues', [])

        for issue in issues:
            
            flat_issue = {
                'repo_name': repo_name,
                'issue_id': issue.get('id'),
                'issue_number': issue.get('number'),
                'title': issue.get('title'),
                'state': issue.get('state'),
                'user_login': issue.get('user', {}).get('login'),
                'created_at': issue.get('created_at'),
                'updated_at': issue.get('updated_at'),
                'closed_at': issue.get('closed_at'),
                'comments': issue.get('comments'),
                'author_association': issue.get('author_association'),
                'labels_count': len(issue.get('labels', []))   # Count of labels
            }
            all_issues.append(flat_issue)
    return pd.DataFrame(all_issues)

In [74]:
def extract_commits(repo_data):
    """Extract and flatten commits"""
    all_commits = []

    for repo_name, data in repo_data.items():
        if 'combined' in repo_name:
            continue
        commits = data.get('commits', [])

        for commit in commits:
            commit_data = commit.get('commit', {})
            author_data = commit_data.get('author', {})
            
            flat_commit = {
                'repo_name': repo_name,
                'repo_sha': commit.get('sha'),
                'author_name': author_data.get('name'),
                'author_email': author_data.get('email'),
                'author_date': author_data.get('date'),
                'commiter_name': commit_data.get('committer', {}).get('name'),
                'commit_date': commit_data.get('committer', {}).get('date'),
                'message': commit_data.get('message'),
                'message_length': len(commit_data.get('message', ''))
            }
            
            all_commits.append(flat_commit)
    return pd.DataFrame(all_commits) 

In [75]:
# Extract Issues:
issues_df = extract_issues(repo_data)
print(f"Extracted {len(issues_df)} issues")
print(f"\nDataFrame shape: {issues_df.shape}\n")
issues_df

Extracted 500 issues

DataFrame shape: (500, 12)



Unnamed: 0,repo_name,issue_id,issue_number,title,state,user_login,created_at,updated_at,closed_at,comments,author_association,labels_count
0,scikit-learn/scikit-learn,3604820995,32680,RFC: Should we remove the 'good first issue' l...,open,lucyleeow,2025-11-09T11:08:05Z,2025-11-10T00:16:58Z,,2,MEMBER,1
1,scikit-learn/scikit-learn,3604361646,32678,⚠️ CI failed on Linux_Runs.pylatest_conda_forg...,closed,scikit-learn-bot,2025-11-09T02:35:37Z,2025-11-09T12:14:17Z,2025-11-09T12:14:10Z,1,CONTRIBUTOR,1
2,scikit-learn/scikit-learn,3603269688,32675,Feature Request: Add explained variance ratios...,open,paucablop,2025-11-08T09:40:11Z,2025-11-08T09:47:54Z,,0,NONE,2
3,scikit-learn/scikit-learn,3601823994,32671,load_iris example does not match text,closed,steveire,2025-11-07T20:02:31Z,2025-11-09T08:01:23Z,2025-11-09T08:01:23Z,3,NONE,1
4,scikit-learn/scikit-learn,3595356150,32665,StandardScaler raises unclear error with empty...,open,Savithru7142,2025-11-06T11:19:14Z,2025-11-06T15:39:01Z,,2,NONE,1
...,...,...,...,...,...,...,...,...,...,...,...,...
495,apache/airflow,3567902633,57515,KubernetesPodOperator Test test_async_write_lo...,open,jscheffl,2025-10-29T21:09:06Z,2025-10-29T21:14:00Z,,0,CONTRIBUTOR,4
496,apache/airflow,3567766742,57512,CloudComposerDAGRunSensor,open,Git4Vishal,2025-10-29T20:32:47Z,2025-10-31T10:43:59Z,,2,NONE,5
497,apache/airflow,3566934352,57507,Error when email_on_failure is enabled due to ...,closed,karenbraganz,2025-10-29T16:42:44Z,2025-11-05T17:32:45Z,2025-11-05T17:32:45Z,4,COLLABORATOR,7
498,apache/airflow,3566777445,57498,Task-level params in HITLOperator are merged w...,open,jgoedeke,2025-10-29T16:03:44Z,2025-10-30T09:43:17Z,,4,NONE,3


In [76]:
# Extract commits:
commits_df = extract_commits(repo_data)
print(f"Extracted {len(commits_df)} commits")
print(f"\nDataFrame shape: {commits_df.shape}\n")
commits_df

Extracted 500 commits

DataFrame shape: (500, 9)



Unnamed: 0,repo_name,repo_sha,author_name,author_email,author_date,commiter_name,commit_date,message,message_length
0,scikit-learn/scikit-learn,7cf4fcb3d4f73ab4d4284ddb423e23b42cdb8e1f,Christine P. Chai,star1327p@gmail.com,2025-11-10T02:35:49Z,GitHub,2025-11-10T02:35:49Z,DOC: Improve formatting of `sklearn.mixture` A...,71
1,scikit-learn/scikit-learn,4e2f1b7094d27ddca17bee1eee61af2ab20a7d23,Arpan Mukherjee,mukherjeearpan381@gmail.com,2025-11-09T08:01:22Z,GitHub,2025-11-09T08:01:22Z,DOC Improve load_iris docstring example (#32677),48
2,scikit-learn/scikit-learn,6a75463b13351258cb93758ffcd9ccf20c366c04,roychan,roychan@users.noreply.github.com,2025-11-09T06:27:04Z,GitHub,2025-11-09T06:27:04Z,DOC Fix a typo in plot_calibration_multiclass....,57
3,scikit-learn/scikit-learn,adca6aedab552925e2689f2a885b0a6e094d1659,Christine P. Chai,star1327p@gmail.com,2025-11-08T09:42:35Z,GitHub,2025-11-08T09:42:35Z,DOC: Add reference link to LocalOutlierFactor ...,60
4,scikit-learn/scikit-learn,0cf432642e979ab837a40986231972fa38a794a3,Josef Affourtit,josef.affourtit@gmail.com,2025-11-08T04:43:21Z,GitHub,2025-11-08T04:43:21Z,Add array API support to `calinski_harabasz_sc...,115
...,...,...,...,...,...,...,...,...,...
495,apache/airflow,9607baef89bc9bf50c2805edb19fff171a1e37e7,Ash Berlin-Taylor,ash@apache.org,2025-11-06T18:38:27Z,GitHub,2025-11-06T18:38:27Z,Update the version of postgres we test against...,185
496,apache/airflow,828c04c4f5c84423ee91bf667e98e25a6368e349,Jens Scheffler,95105677+jscheffl@users.noreply.github.com,2025-11-06T18:38:19Z,GitHub,2025-11-06T18:38:19Z,Remove global from Fernet Crypto tooling (#576...,170
497,apache/airflow,a29391b177a8c27af2a17c1faea82bbdb63e5b5a,ChenChen Lai,72776271+0lai0@users.noreply.github.com,2025-11-06T17:15:16Z,GitHub,2025-11-06T17:15:16Z,Enable PT006 rule to trino Provider test (#57931),49
498,apache/airflow,1eb960e09d5ef087187254d02b8984da18151c30,GUAN-HAO HUANG,101171023+rich7420@users.noreply.github.com,2025-11-06T17:14:45Z,GitHub,2025-11-06T17:14:45Z,modify test_variables (#57945)\n\nSigned-off-b...,125
