# IMPORTS

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt, timezone
from pathlib import Path
import json
import os


# Import raw data

**Get raw Json files from data/raw**

In [2]:
raw_data_path = Path('../data/raw')
raw_json_files = list(raw_data_path.glob('*.json'))

print(f'{len(raw_json_files)} raw data files found:\n')
for file in raw_json_files:
    print(f'\t-{file.name}')

6 raw data files found:

	-scikit-learn_scikit-learn_20251110_075413.json
	-mlflow_mlflow_20251110_081117.json
	-combined_data_20251110_083013.json
	-numpy_numpy_20251110_075413.json
	-pandas-dev_pandas_20251110_075413.json
	-apache_airflow_20251110_081117.json


**load data for each repo**

In [3]:
repo_data = {}

# store data in  repo as a list of dictionaries:
for file in raw_json_files:
    with open(file, 'r') as f:
        data = json.load(f)

    # Extract filename to use as key in repo_data:
    repo_name = file.stem.rsplit('_', 2)[0].replace('_', '/', 1)
    repo_data[repo_name] = data

print(f'Loaded data for {len(repo_data)} repositories:\n')
for repo in repo_data.keys():
    print(f'\t-{repo}')

Loaded data for 6 repositories:

	-scikit-learn/scikit-learn
	-mlflow/mlflow
	-combined/data
	-numpy/numpy
	-pandas-dev/pandas
	-apache/airflow


**Checkout one repo data structure**

In [4]:
for repo in repo_data.keys():
    if repo == 'pandas-dev/pandas':
        repo_pd = repo_data[repo]

        print("Keys in the data:")
        print(repo_pd.keys())
        print(f'\nNumber of issues: {len(repo_pd['issues'])}')
        print(f'Number of commits: {len(repo_pd['commits'])}')

Keys in the data:
dict_keys(['repository', 'collected at', 'info', 'issues', 'commits'])

Number of issues: 100
Number of commits: 100


##### *check structure of issues*

In [5]:
#Examine one issue:
issues_chk = repo_pd['issues'][0]

print("Issue keys:")
print(f'\n{issues_chk.keys()}')
print("\n--- Sample Issue ---\n")
print(f"Title: {issues_chk.get('title')}")
print(f"\nState: {issues_chk.get('state')}")
print(f"\nCreated: {issues_chk.get('created_at')}")
print(f"\nUser: {issues_chk.get('user', {}).get('login')}")

Issue keys:

dict_keys(['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'type', 'active_lock_reason', 'sub_issues_summary', 'issue_dependencies_summary', 'body', 'closed_by', 'reactions', 'timeline_url', 'performed_via_github_app', 'state_reason'])

--- Sample Issue ---

Title: ENH: Optional preservation of Sparse columns in Parquet/Feather via Arrow metadata

State: open

Created: 2025-11-09T02:01:16Z

User: antznette1


##### *check structure of commits*

In [6]:
# Examine one commit
commits_chk = repo_pd['commits'][0]
print("Commit keys:")
print(commits_chk.keys())
print("\n--- Sample Commit ---")
print(f"\nSHA: {commits_chk.get('sha')}")
print(f"\nAuthor: {commits_chk.get('commit', {}).get('author', {}).get('name')}")
print(f"\nDate: {commits_chk.get('commit', {}).get('author', {}).get('date')}")
print(f"\nMessage: {commits_chk.get('commit', {}).get('message', '')[:100]}...")

Commit keys:
dict_keys(['sha', 'node_id', 'commit', 'url', 'html_url', 'comments_url', 'author', 'committer', 'parents'])

--- Sample Commit ---

SHA: f4851e500a43125d505db64e548af0355227714b

Author: Matthew Roeschke

Date: 2025-11-08T20:40:33Z

Message: DOC: Remove ..versionadded directives before 2.0 (#63035)

Co-authored-by: Richard Shadrach <4556240...


##### *examine one info*

In [7]:
info_chk = repo_pd['info']

print("info keys:")
print(info_chk.keys())

info keys:
dict_keys(['id', 'node_id', 'name', 'full_name', 'private', 'owner', 'html_url', 'description', 'fork', 'url', 'forks_url', 'keys_url', 'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url', 'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url', 'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url', 'languages_url', 'stargazers_url', 'contributors_url', 'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url', 'comments_url', 'issue_comment_url', 'contents_url', 'compare_url', 'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url', 'milestones_url', 'notifications_url', 'labels_url', 'releases_url', 'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url', 'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size', 'stargazers_count', 'watchers_count', 'language', 'has_issues', 'has_projects', 'has_downloads', 'has_wiki', 'has_pages', 'has_discussions', 'forks_count', 'mirror_url', 'archived', 'disabl

# DATA WRANGLING

## DATA EXTRACTION AND FLATENING

**Define extract methods**

In [8]:
def extract_issues(repo_data):
    """Extract and flatten issues from all repositories"""
    all_issues = []

    for repo_name, data in repo_data.items():
        if 'combined' in repo_name:
            continue
        issues = data.get('issues', [])

        for issue in issues:
            
            flat_issue = {
                'repo_name': repo_name,
                'issue_id': issue.get('id'),
                'issue_number': issue.get('number'),
                'title': issue.get('title'),
                'state': issue.get('state'),
                'user_login': issue.get('user', {}).get('login'),
                'created_at': issue.get('created_at'),
                'updated_at': issue.get('updated_at'),
                'closed_at': issue.get('closed_at'),
                'comments': issue.get('comments'),
                'author_association': issue.get('author_association'),
                'labels_count': len(issue.get('labels', []))   # Count of labels
            }
            all_issues.append(flat_issue)
    return pd.DataFrame(all_issues)

In [9]:
def extract_commits(repo_data):
    """Extract and flatten commits from all repositories"""
    all_commits = []
    
    for repo_name, data in repo_data.items():
        # Skip the combined data file
        if 'combined' in repo_name:
            continue
            
        commits = data.get('commits', [])
        
        for commit in commits:
            commit_data = commit.get('commit', {})
            author_data = commit_data.get('author', {})
            
            flat_commit = {
                'repo_name': repo_name,
                'sha': commit.get('sha'),
                'author_name': author_data.get('name'),
                'author_email': author_data.get('email'),
                'author_date': author_data.get('date'),
                'committer_name': commit_data.get('committer', {}).get('name'),
                'commit_date': commit_data.get('committer', {}).get('date'),
                'description': commit_data.get('description'),
                'message': commit_data.get('message'),
                'message_length': len(commit_data.get('message', '')),
            }
            all_commits.append(flat_commit)
    
    return pd.DataFrame(all_commits)


In [10]:
def extract_info(repo_data):
    """Extract and flatten info"""
    all_info = []

    for repo_name, data in repo_data.items():
        print(f'Processing: {repo_name}')
        if 'combined' in repo_name:
            print(f'Skipping: {repo_name}')
            continue
        info = data.get('info', None)
        print(f'info type: {type(info)}')
    
        flat_info = {
            'repo_name': repo_name,
            'stars': info.get('stargazers_count'),
            'forks': info.get('forks_count'),
            'open_issues': info.get('open_issues_count'),
            'language': info.get('language'),
            'description': info.get('description'),
            'created_at': info.get('created_at'),
            'updated_at': info.get('updated_at'),
            'pushed_at': info.get('pushed_at')
        }
        all_info.append(flat_info)
            
    return pd.DataFrame(all_info)

**Extract and flatten data into dataframes**

In [11]:
# Extract Issues:
issues_df = extract_issues(repo_data)
print(f"Extracted {len(issues_df)} issues")
print(f"\nDataFrame shape: {issues_df.shape}\n")
issues_df.head()

Extracted 500 issues

DataFrame shape: (500, 12)



Unnamed: 0,repo_name,issue_id,issue_number,title,state,user_login,created_at,updated_at,closed_at,comments,author_association,labels_count
0,scikit-learn/scikit-learn,3604820995,32680,RFC: Should we remove the 'good first issue' l...,open,lucyleeow,2025-11-09T11:08:05Z,2025-11-10T00:16:58Z,,2,MEMBER,1
1,scikit-learn/scikit-learn,3604361646,32678,⚠️ CI failed on Linux_Runs.pylatest_conda_forg...,closed,scikit-learn-bot,2025-11-09T02:35:37Z,2025-11-09T12:14:17Z,2025-11-09T12:14:10Z,1,CONTRIBUTOR,1
2,scikit-learn/scikit-learn,3603269688,32675,Feature Request: Add explained variance ratios...,open,paucablop,2025-11-08T09:40:11Z,2025-11-08T09:47:54Z,,0,NONE,2
3,scikit-learn/scikit-learn,3601823994,32671,load_iris example does not match text,closed,steveire,2025-11-07T20:02:31Z,2025-11-09T08:01:23Z,2025-11-09T08:01:23Z,3,NONE,1
4,scikit-learn/scikit-learn,3595356150,32665,StandardScaler raises unclear error with empty...,open,Savithru7142,2025-11-06T11:19:14Z,2025-11-06T15:39:01Z,,2,NONE,1


In [12]:
# Extract commits:
commits_df = extract_commits(repo_data)
print(f"Extracted {len(commits_df)} commits")
print(f"\nDataFrame shape: {commits_df.shape}\n")
commits_df.head()

Extracted 500 commits

DataFrame shape: (500, 10)



Unnamed: 0,repo_name,sha,author_name,author_email,author_date,committer_name,commit_date,description,message,message_length
0,scikit-learn/scikit-learn,7cf4fcb3d4f73ab4d4284ddb423e23b42cdb8e1f,Christine P. Chai,star1327p@gmail.com,2025-11-10T02:35:49Z,GitHub,2025-11-10T02:35:49Z,,DOC: Improve formatting of `sklearn.mixture` A...,71
1,scikit-learn/scikit-learn,4e2f1b7094d27ddca17bee1eee61af2ab20a7d23,Arpan Mukherjee,mukherjeearpan381@gmail.com,2025-11-09T08:01:22Z,GitHub,2025-11-09T08:01:22Z,,DOC Improve load_iris docstring example (#32677),48
2,scikit-learn/scikit-learn,6a75463b13351258cb93758ffcd9ccf20c366c04,roychan,roychan@users.noreply.github.com,2025-11-09T06:27:04Z,GitHub,2025-11-09T06:27:04Z,,DOC Fix a typo in plot_calibration_multiclass....,57
3,scikit-learn/scikit-learn,adca6aedab552925e2689f2a885b0a6e094d1659,Christine P. Chai,star1327p@gmail.com,2025-11-08T09:42:35Z,GitHub,2025-11-08T09:42:35Z,,DOC: Add reference link to LocalOutlierFactor ...,60
4,scikit-learn/scikit-learn,0cf432642e979ab837a40986231972fa38a794a3,Josef Affourtit,josef.affourtit@gmail.com,2025-11-08T04:43:21Z,GitHub,2025-11-08T04:43:21Z,,Add array API support to `calinski_harabasz_sc...,115


In [13]:
# Extract Info:
info_df = extract_info(repo_data)
print(f"Extracted {len(info_df)} info")
print(f"\nDataFrame shape: {info_df.shape}\n")
info_df

Processing: scikit-learn/scikit-learn
info type: <class 'dict'>
Processing: mlflow/mlflow
info type: <class 'dict'>
Processing: combined/data
Skipping: combined/data
Processing: numpy/numpy
info type: <class 'dict'>
Processing: pandas-dev/pandas
info type: <class 'dict'>
Processing: apache/airflow
info type: <class 'dict'>
Extracted 5 info

DataFrame shape: (5, 9)



Unnamed: 0,repo_name,stars,forks,open_issues,language,description,created_at,updated_at,pushed_at
0,scikit-learn/scikit-learn,63980,26419,2126,Python,scikit-learn: machine learning in Python,2010-08-17T09:43:38Z,2025-11-10T07:41:10Z,2025-11-10T02:35:49Z
1,mlflow/mlflow,22892,4977,2039,Python,The open source developer platform to build AI...,2018-06-05T16:05:58Z,2025-11-10T08:03:38Z,2025-11-10T07:04:34Z
2,numpy/numpy,30782,11673,2359,Python,The fundamental package for scientific computi...,2010-09-13T23:02:39Z,2025-11-10T07:07:24Z,2025-11-10T07:07:05Z
3,pandas-dev/pandas,47063,19273,3597,Python,Flexible and powerful data analysis / manipula...,2010-08-24T01:37:33Z,2025-11-10T05:48:10Z,2025-11-08T20:40:33Z
4,apache/airflow,43145,15915,1711,Python,Apache Airflow - A platform to programmaticall...,2015-04-13T18:04:58Z,2025-11-10T07:58:56Z,2025-11-10T06:55:58Z


#### ***Data Type conversions***

**Date conversions**

*for issues:*

In [14]:
issues_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   repo_name           500 non-null    object
 1   issue_id            500 non-null    int64 
 2   issue_number        500 non-null    int64 
 3   title               500 non-null    object
 4   state               500 non-null    object
 5   user_login          500 non-null    object
 6   created_at          500 non-null    object
 7   updated_at          500 non-null    object
 8   closed_at           250 non-null    object
 9   comments            500 non-null    int64 
 10  author_association  500 non-null    object
 11  labels_count        500 non-null    int64 
dtypes: int64(4), object(8)
memory usage: 47.0+ KB


In [15]:
issues_df['created_at'] = pd.to_datetime(issues_df['created_at'])
issues_df['updated_at'] = pd.to_datetime(issues_df['updated_at'])
issues_df['closed_at'] = pd.to_datetime(issues_df['closed_at'])

issues_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   repo_name           500 non-null    object             
 1   issue_id            500 non-null    int64              
 2   issue_number        500 non-null    int64              
 3   title               500 non-null    object             
 4   state               500 non-null    object             
 5   user_login          500 non-null    object             
 6   created_at          500 non-null    datetime64[ns, UTC]
 7   updated_at          500 non-null    datetime64[ns, UTC]
 8   closed_at           250 non-null    datetime64[ns, UTC]
 9   comments            500 non-null    int64              
 10  author_association  500 non-null    object             
 11  labels_count        500 non-null    int64              
dtypes: datetime64[ns, UTC](3), int64(4),

*for commits:*

In [16]:
commits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   repo_name       500 non-null    object
 1   sha             500 non-null    object
 2   author_name     500 non-null    object
 3   author_email    500 non-null    object
 4   author_date     500 non-null    object
 5   committer_name  500 non-null    object
 6   commit_date     500 non-null    object
 7   description     0 non-null      object
 8   message         500 non-null    object
 9   message_length  500 non-null    int64 
dtypes: int64(1), object(9)
memory usage: 39.2+ KB


In [17]:
commits_df['author_date'] = pd.to_datetime(commits_df['author_date'])
commits_df['commit_date'] = pd.to_datetime(commits_df['commit_date'])

commits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   repo_name       500 non-null    object             
 1   sha             500 non-null    object             
 2   author_name     500 non-null    object             
 3   author_email    500 non-null    object             
 4   author_date     500 non-null    datetime64[ns, UTC]
 5   committer_name  500 non-null    object             
 6   commit_date     500 non-null    datetime64[ns, UTC]
 7   description     0 non-null      object             
 8   message         500 non-null    object             
 9   message_length  500 non-null    int64              
dtypes: datetime64[ns, UTC](2), int64(1), object(7)
memory usage: 39.2+ KB


*for inf:*

In [18]:
info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   repo_name    5 non-null      object
 1   stars        5 non-null      int64 
 2   forks        5 non-null      int64 
 3   open_issues  5 non-null      int64 
 4   language     5 non-null      object
 5   description  5 non-null      object
 6   created_at   5 non-null      object
 7   updated_at   5 non-null      object
 8   pushed_at    5 non-null      object
dtypes: int64(3), object(6)
memory usage: 492.0+ bytes


In [19]:
info_df['created_at'] = pd.to_datetime(info_df['created_at'])
info_df['updated_at'] = pd.to_datetime(info_df['updated_at'])
info_df['pushed_at'] = pd.to_datetime(info_df['pushed_at'])

info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   repo_name    5 non-null      object             
 1   stars        5 non-null      int64              
 2   forks        5 non-null      int64              
 3   open_issues  5 non-null      int64              
 4   language     5 non-null      object             
 5   description  5 non-null      object             
 6   created_at   5 non-null      datetime64[ns, UTC]
 7   updated_at   5 non-null      datetime64[ns, UTC]
 8   pushed_at    5 non-null      datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](3), int64(3), object(3)
memory usage: 492.0+ bytes


**Null value handling**

*for issues:*

In [20]:
issues_df.isnull().sum()

repo_name               0
issue_id                0
issue_number            0
title                   0
state                   0
user_login              0
created_at              0
updated_at              0
closed_at             250
comments                0
author_association      0
labels_count            0
dtype: int64

In [21]:
# Check if the 250 missing 'closed_at' values correspond to open issues:

issues_df['state'].value_counts()

state
open      250
closed    250
Name: count, dtype: int64

In [22]:
issues_df['closed_at']

0                           NaT
1     2025-11-09 12:14:10+00:00
2                           NaT
3     2025-11-09 08:01:23+00:00
4                           NaT
                 ...           
495                         NaT
496                         NaT
497   2025-11-05 17:32:45+00:00
498                         NaT
499   2025-10-31 18:13:04+00:00
Name: closed_at, Length: 500, dtype: datetime64[ns, UTC]

*for commits:*

In [23]:
commits_df.isnull().sum()

repo_name           0
sha                 0
author_name         0
author_email        0
author_date         0
committer_name      0
commit_date         0
description       500
message             0
message_length      0
dtype: int64

In [24]:
# commits_df['description'].info()

"""The method above will throw an error since commits_df['description'] has been dropped below"""

"The method above will throw an error since commits_df['description'] has been dropped below"

In [25]:
#Because all the values in the description column aare null, I will drop the entire column

commits_df = commits_df.drop('description', axis=1)
commits_df.head()

"""The method above will throw an error since commits_df['description'] has been dropped below"""

"The method above will throw an error since commits_df['description'] has been dropped below"

In [26]:
commits_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   repo_name       500 non-null    object             
 1   sha             500 non-null    object             
 2   author_name     500 non-null    object             
 3   author_email    500 non-null    object             
 4   author_date     500 non-null    datetime64[ns, UTC]
 5   committer_name  500 non-null    object             
 6   commit_date     500 non-null    datetime64[ns, UTC]
 7   message         500 non-null    object             
 8   message_length  500 non-null    int64              
dtypes: datetime64[ns, UTC](2), int64(1), object(6)
memory usage: 35.3+ KB


*for info:*

In [27]:
info_df.isnull().sum()

repo_name      0
stars          0
forks          0
open_issues    0
language       0
description    0
created_at     0
updated_at     0
pushed_at      0
dtype: int64

In [28]:
#info_df has no null values

## Feature Engeneering

*for issues:*

In [29]:
# create a temp Series and sstore all NaT values as false and nonNull values as True
temp_df = pd.Series(dtype='bool', name='is_open')

for index, item in issues_df['closed_at'].items():
    if pd.isna(item):
        temp_df[index] = True
    else:
        temp_df[index] = False

temp_df       

0       True
1      False
2       True
3      False
4       True
       ...  
495     True
496     True
497    False
498     True
499    False
Name: is_open, Length: 500, dtype: bool

In [30]:
# insert new column 'is_open' between 'closed_at' and 'comments':


issues_df.insert(
    loc=issues_df.columns.get_loc('comments'),
    column='is_open',
    value=temp_df
)
"""The insert method above has already been called once, calling it again will raise a ValueError: cannot insert is_open, already exists. so it has been commented out"""

issues_df.head()

Unnamed: 0,repo_name,issue_id,issue_number,title,state,user_login,created_at,updated_at,closed_at,is_open,comments,author_association,labels_count
0,scikit-learn/scikit-learn,3604820995,32680,RFC: Should we remove the 'good first issue' l...,open,lucyleeow,2025-11-09 11:08:05+00:00,2025-11-10 00:16:58+00:00,NaT,True,2,MEMBER,1
1,scikit-learn/scikit-learn,3604361646,32678,⚠️ CI failed on Linux_Runs.pylatest_conda_forg...,closed,scikit-learn-bot,2025-11-09 02:35:37+00:00,2025-11-09 12:14:17+00:00,2025-11-09 12:14:10+00:00,False,1,CONTRIBUTOR,1
2,scikit-learn/scikit-learn,3603269688,32675,Feature Request: Add explained variance ratios...,open,paucablop,2025-11-08 09:40:11+00:00,2025-11-08 09:47:54+00:00,NaT,True,0,NONE,2
3,scikit-learn/scikit-learn,3601823994,32671,load_iris example does not match text,closed,steveire,2025-11-07 20:02:31+00:00,2025-11-09 08:01:23+00:00,2025-11-09 08:01:23+00:00,False,3,NONE,1
4,scikit-learn/scikit-learn,3595356150,32665,StandardScaler raises unclear error with empty...,open,Savithru7142,2025-11-06 11:19:14+00:00,2025-11-06 15:39:01+00:00,NaT,True,2,NONE,1


In [31]:
# Create a new column, 'issue_age_days' to store how long an issue is/was opened

now = pd.Timestamp.now(tz='UTC')

issues_df['issue_age_days'] = np.nan

closed_mask = issues_df['closed_at'].notna()
open_mask = issues_df['closed_at'].isna()

issues_df.loc[closed_mask, 'issue_age_days'] = (
    issues_df.loc[closed_mask, 'closed_at'] - issues_df.loc[closed_mask, 'created_at']
).dt.days
issues_df.loc[open_mask, 'issue_age_days'] = (
    now - issues_df.loc[open_mask, 'created_at']
).dt.days

issues_df.head()

Unnamed: 0,repo_name,issue_id,issue_number,title,state,user_login,created_at,updated_at,closed_at,is_open,comments,author_association,labels_count,issue_age_days
0,scikit-learn/scikit-learn,3604820995,32680,RFC: Should we remove the 'good first issue' l...,open,lucyleeow,2025-11-09 11:08:05+00:00,2025-11-10 00:16:58+00:00,NaT,True,2,MEMBER,1,3.0
1,scikit-learn/scikit-learn,3604361646,32678,⚠️ CI failed on Linux_Runs.pylatest_conda_forg...,closed,scikit-learn-bot,2025-11-09 02:35:37+00:00,2025-11-09 12:14:17+00:00,2025-11-09 12:14:10+00:00,False,1,CONTRIBUTOR,1,0.0
2,scikit-learn/scikit-learn,3603269688,32675,Feature Request: Add explained variance ratios...,open,paucablop,2025-11-08 09:40:11+00:00,2025-11-08 09:47:54+00:00,NaT,True,0,NONE,2,4.0
3,scikit-learn/scikit-learn,3601823994,32671,load_iris example does not match text,closed,steveire,2025-11-07 20:02:31+00:00,2025-11-09 08:01:23+00:00,2025-11-09 08:01:23+00:00,False,3,NONE,1,1.0
4,scikit-learn/scikit-learn,3595356150,32665,StandardScaler raises unclear error with empty...,open,Savithru7142,2025-11-06 11:19:14+00:00,2025-11-06 15:39:01+00:00,NaT,True,2,NONE,1,6.0


In [32]:
# create  an new column 'time_to_close_hours' to find the length of time it took to close issues only for closed issues:

issues_df['time_to_close_hours'] = np.nan

closed_mask = issues_df['closed_at'].notna()

issues_df.loc[closed_mask, 'time_to_close_hours'] = (
    issues_df.loc[closed_mask, 'closed_at'] - issues_df.loc[closed_mask, 'created_at']
).dt.total_seconds()/3600

issues_df

Unnamed: 0,repo_name,issue_id,issue_number,title,state,user_login,created_at,updated_at,closed_at,is_open,comments,author_association,labels_count,issue_age_days,time_to_close_hours
0,scikit-learn/scikit-learn,3604820995,32680,RFC: Should we remove the 'good first issue' l...,open,lucyleeow,2025-11-09 11:08:05+00:00,2025-11-10 00:16:58+00:00,NaT,True,2,MEMBER,1,3.0,
1,scikit-learn/scikit-learn,3604361646,32678,⚠️ CI failed on Linux_Runs.pylatest_conda_forg...,closed,scikit-learn-bot,2025-11-09 02:35:37+00:00,2025-11-09 12:14:17+00:00,2025-11-09 12:14:10+00:00,False,1,CONTRIBUTOR,1,0.0,9.642500
2,scikit-learn/scikit-learn,3603269688,32675,Feature Request: Add explained variance ratios...,open,paucablop,2025-11-08 09:40:11+00:00,2025-11-08 09:47:54+00:00,NaT,True,0,NONE,2,4.0,
3,scikit-learn/scikit-learn,3601823994,32671,load_iris example does not match text,closed,steveire,2025-11-07 20:02:31+00:00,2025-11-09 08:01:23+00:00,2025-11-09 08:01:23+00:00,False,3,NONE,1,1.0,35.981111
4,scikit-learn/scikit-learn,3595356150,32665,StandardScaler raises unclear error with empty...,open,Savithru7142,2025-11-06 11:19:14+00:00,2025-11-06 15:39:01+00:00,NaT,True,2,NONE,1,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,apache/airflow,3567902633,57515,KubernetesPodOperator Test test_async_write_lo...,open,jscheffl,2025-10-29 21:09:06+00:00,2025-10-29 21:14:00+00:00,NaT,True,0,CONTRIBUTOR,4,14.0,
496,apache/airflow,3567766742,57512,CloudComposerDAGRunSensor,open,Git4Vishal,2025-10-29 20:32:47+00:00,2025-10-31 10:43:59+00:00,NaT,True,2,NONE,5,14.0,
497,apache/airflow,3566934352,57507,Error when email_on_failure is enabled due to ...,closed,karenbraganz,2025-10-29 16:42:44+00:00,2025-11-05 17:32:45+00:00,2025-11-05 17:32:45+00:00,False,4,COLLABORATOR,7,7.0,168.833611
498,apache/airflow,3566777445,57498,Task-level params in HITLOperator are merged w...,open,jgoedeke,2025-10-29 16:03:44+00:00,2025-10-30 09:43:17+00:00,NaT,True,4,NONE,3,14.0,


# Save the cleaned and wrangled data

In [33]:
#define save methd:

def save_data(dataframe, filename):
    """Save data to csv file in the data/clean directory"""
    clean_data_dir = Path('../data/cleaned')
    clean_data_dir.mkdir(parents=True, exist_ok=True)

    filepath = clean_data_dir/filename

    dataframe.to_csv(filepath, index=False)

    print(f"Saved to {filepath}")




In [34]:
# Use save_data method to save all the data:

save_data(commits_df, 'cleaned_commits.csv')

save_data(issues_df, 'cleaned_issues.csv')

save_data(info_df, 'cleaned_info.csv')

Saved to ../data/cleaned/cleaned_commits.csv
Saved to ../data/cleaned/cleaned_issues.csv
Saved to ../data/cleaned/cleaned_info.csv
