In [5]:
# Cell 1: Import Libraries and Configure Notebook

import ast
import json
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import spearmanr, ttest_ind, f_oneway

# Configure display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
warnings.filterwarnings('ignore')

# Set seaborn style for plots
sns.set(style='whitegrid', context='notebook', palette='deep')
%matplotlib inline

In [None]:
# Cell 2: Data Loading and JSON Parsing Functions

def parse_json_column(val):
    """
    Decode one cell of a JSON-like column.

    Parameters
    ----------
    val : object
        Raw cell value, normally a string read from the CSV.

    Returns
    -------
    object
        - ``np.nan`` for missing values and empty / whitespace-only strings;
        - the decoded object when ``val`` is valid JSON;
        - the decoded dict or list when ``val`` is a Python-literal rendering
          of one (single-quoted keys, ``True``/``False``/``None``), which
          strict JSON parsing rejects;
        - ``val`` itself, unchanged, in every other case. Malformed input
          never raises.
    """
    # Already-decoded containers pass straight through, so applying this
    # function twice to the same column (e.g. re-running the cell) is harmless.
    if isinstance(val, (dict, list)):
        return val

    if not isinstance(val, str):
        try:
            # json.loads also accepts bytes/bytearray; anything else raises
            # TypeError and is handed back untouched.
            return np.nan if pd.isna(val) else json.loads(val)
        except (TypeError, ValueError):
            return val

    text = val.strip()
    if not text:
        # Some fields may be empty strings; treat them as NaN
        return np.nan

    try:
        return json.loads(val)
    except (ValueError, RecursionError):
        pass  # not strict JSON; try the Python-literal form below

    # ast.literal_eval only evaluates literals (no names, calls or operators),
    # so it is safe to use on file contents.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return val

    # Only accept containers from the fallback; scalar-looking strings such as
    # "True" or "'abc'" keep their original string form, as before.
    return parsed if isinstance(parsed, (dict, list)) else val

def load_and_process_data(file_path):
    """
    Read the artifact CSV and normalise its column types.

    Three groups of columns are converted when they exist in the file:
    timestamps are parsed to datetimes, JSON-string columns are decoded
    cell by cell with ``parse_json_column``, and count/id columns are cast
    to numbers. Unparseable dates and numbers become NaT / NaN. Columns
    missing from the file are silently skipped.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``

    Returns
    -------
    pandas.DataFrame
        The loaded table with the conversions applied.
    """
    frame = pd.read_csv(file_path, low_memory=False)

    # (column names, converter) pairs, applied in this order
    conversions = (
        (
            ('_created', '_updated', 'merged_at'),
            lambda series: pd.to_datetime(series, errors='coerce'),
        ),
        (
            ('_links', 'failed_job', 'passed_job', 'metrics', 'reproducibility_status'),
            lambda series: series.apply(parse_json_column),
        ),
        (
            ('pr_num', 'reproduce_attempts', 'reproduce_successes'),
            lambda series: pd.to_numeric(series, errors='coerce'),
        ),
    )

    for names, convert in conversions:
        for name in names:
            if name in frame.columns:
                frame[name] = convert(frame[name])

    return frame

# Location of the artifact table CSV. Set the FAILFIX_DATA_PATH environment
# variable to point at your own copy of the file; the original author's local
# path is kept as the fallback so existing setups keep working unchanged.
file_path = os.environ.get(
    'FAILFIX_DATA_PATH',
    '/Users/harshil/Developer/GitHub_Repos/FailFix/Data/processed/artifact_data_table_44.csv',
)
df = load_and_process_data(file_path)
print("Data loaded. Shape:", df.shape)
df.head()

Data loaded. Shape: (3324, 33)


Unnamed: 0,_created,_deleted,_etag,_id,_links,_updated,added_version,base_branch,branch,build_system,cached,ci_service,current_image_tag,deprecated_version,failed_job,filtered_reason,image_tag,is_error_pass,lang,match,merged_at,metrics,passed_job,pr_num,repo,repo_mined_version,reproduce_attempts,reproduce_successes,reproduced,reproducibility_status,stability,status,test_framework
0,2018-08-24 08:01:18+00:00,False,f0ba3f7059f911957b0c99ba4b82c23455bbc734,5b7fbb4e37be5b494c9f9f0b,"{'collection': {'href': 'artifacts', 'title': 'artifacts'}, 'next': {'href': 'artifacts/wbond-package_control_channel-379589051?page=2', 'title': 'next page'}, 'parent': {'href': '/', 'title': 'home'}, 'self': {'href': 'artifacts/wbond-package_control_channel-379589051', 'title': 'Artifact'}}",2024-07-02 16:30:43+00:00,1.0.0,master,master,,True,travis,wbond-package_control_channel-379589051,,"{'base_sha': '93f88173b6f55ce9a46c2e56dc1eeb8d0da2243b', 'build_id': 379589050, 'build_job': '14497.1', 'committed_at': '2018-05-16T07:40:38Z', 'component_versions': {'analyzer': '96ba284ceef7e8732a2cfd6b769b337c4479f487', 'reproducer': 'ebdf7a233e8d2d850acf9c5c8ab031f11421eef5'}, 'config': {'': {'result': 'configured'}, 'dist': 'trusty', 'group': 'stable', 'language': 'python', 'os': 'linux', 'python': '3.3', 'script': ['python -m unittest', 'curl -X POST https://packagecontrol.io/test_pr/$TRAVIS_PULL_REQUEST.json'], 'sudo': False}, 'failed_tests': 'test_repository_package_names('./repository/c.json', ...) 
(tests.test.DefaultRepositoryTests)', 'is_git_repo': True, 'job_id': 379589051, 'message': 'retitle Chrome REPL > ChromeREPL to fix imports', 'mismatch_attrs': [], 'num_tests_failed': 1, 'num_tests_run': 8451, 'patches': {}, 'trigger_sha': '9a4cd38cf59908e1be6f75ce695ccc0a146250aa'}",,wbond-package_control_channel-379589051,False,Python,,2018-05-17 13:11:19+00:00,"{'additions': 43, 'changes': 52, 'deletions': 9, 'num_of_changed_files': 5}","{'base_sha': 'acbd921114ac46d62e86fd2cf98a06a156627d47', 'build_id': 380083244, 'build_job': '14504.1', 'committed_at': '2018-05-17T08:06:11Z', 'component_versions': {'analyzer': '96ba284ceef7e8732a2cfd6b769b337c4479f487', 'reproducer': 'ebdf7a233e8d2d850acf9c5c8ab031f11421eef5'}, 'config': {'': {'result': 'configured'}, 'dist': 'trusty', 'group': 'stable', 'language': 'python', 'os': 'linux', 'python': '3.3', 'script': ['python -m unittest', 'curl -X POST https://packagecontrol.io/test_pr/$TRAVIS_PULL_REQUEST.json'], 'sudo': False}, 'failed_tests': '', 'is_git_repo': True, 'job_id': 380083245, 'message': 'alphabetize ChromeREPL', 'mismatch_attrs': [], 'num_tests_failed': 0, 'num_tests_run': 8457, 'patches': {}, 'trigger_sha': 'f66141cda9a4149e962c5d4da5d6c8f96bd0f85c'}",7113,wbond/package_control_channel,efabcb148a97bb217fbd0c75655bd885bf6ec06d,5.0,5.0,False,"{'status': 'Reproducible', 'time_stamp': '2018-08-24'}",5/5,active,unittest
1,2018-03-06 04:13:22+00:00,False,4349909dfe62bd5dd30c5b1b24951dbdc14a4f31,5a9e1562f1f70e072f3a6672,"{'collection': {'href': 'artifacts', 'title': 'artifacts'}, 'next': {'href': 'artifacts/checkstyle-checkstyle-248927615?page=2', 'title': 'next page'}, 'parent': {'href': '/', 'title': 'home'}, 'self': {'href': 'artifacts/checkstyle-checkstyle-248927615', 'title': 'Artifact'}}",2024-07-02 16:29:29+00:00,1.0.0,,master,Maven,True,travis,checkstyle-checkstyle-248927615,,"{'base_sha': '', 'build_id': 248927610, 'build_job': '11011.5', 'committed_at': '2017-06-30T20:23:46Z', 'config': {'': {'result': 'configured'}, 'addons': {'apt': {'packages': ['xsltproc', 'xmlstarlet', 'oracle-java8-installer']}}, 'after_success': ['set -e if [[ -n $CMD_AFTER_SUCCESS  && $SKIP_CI == 'false'  ]]; then  eval $CMD_AFTER_SUCCESS;  echo ""CMD_AFTER_SUCCESS is finished""; fi ', 'set -e SKIP_DEPLOY=$(if [ $(git log -1 | grep -E ""\[maven-release-plugin\] prepare release"" | cat | wc -l) -lt 1 ]; then echo false; else echo true; fi;) if [[ $TRAVIS_REPO_SLUG == 'checkstyle/checkstyle'  && $TRAVIS_BRANCH == 'master'  && $TRAVIS_PULL_REQUEST == 'false'  && $DEPLOY == 'true'  && $SKIP_CI == 'false'  && $SKIP_DEPLOY == 'false'  ]]; then  mvn -s config/deploy-settings.xml -Pno-validations deploy;  echo ""deploy to maven snapshot repository is finished""; fi '], 'branches': {'only': ['master']}, 'cache': {'apt': True, 'directories': ['~/.m2']}, 'dist': 'precise', 'env': 'DESC=""findbugs, spotbugs and pmd"" CMD=""mvn clean compile pmd:check findbugs:check spotbugs:check""', 'group': 'stable', 'install': ['if [ ""${TRAVIS_OS_NAME}"" == ""osx"" ]; then  # https://github.com/travis-ci/travis-ci/issues/6307#issuecomment-233315824  rvm get head fi '], 'jdk': 'oraclejdk8', 'language': 'java', 'os': 'linux', 'script': ['SKIP_FILES="".github|appveyor.yml|circle.yml|distelli-manifest.yml|fast-forward-merge.sh|LICENSE|LICENSE.apache20|README.md|release.sh|RIGHTS.antlr|shippable.yml|wercker.yml""', 
'SKIP_CI=$(if [[ $(git diff --name-only HEAD HEAD~1 | grep -vE ""$SKIP_FILES"" | cat | wc -c | sed 's/^ *//' ) > 0 ]]; then echo false; else echo true; fi;)', 'echo ""SKIP_CI=""$SKIP_CI', 'set -e if [[ $SKIP_CI == 'false' ]]; then  eval $CMD;  echo ""eval of CMD is completed"" fi '], 'sudo': False}, 'failed_tests': '', 'is_git_repo': True, 'job_id': 248927615, 'message': 'Issue #3110: fix localization for TranslationCheckTest and IllegalInstantiationCheckTest', 'mismatch_attrs': [], 'num_tests_failed': 0, 'num_tests_run': 0, 'patches': {'casher': '2021-01-05', 'mvn-tls': '2020-03-04', 'ubuntu-precise-apt-links': '2021-06-15'}, 'trigger_sha': '70ac14612d1f1cd588db6ef75f6f9ab870909e8d'}",,checkstyle-checkstyle-248927615,True,Java,1.0,NaT,"{'additions': 6, 'changes': 10, 'deletions': 4, 'num_of_changed_files': 2}","{'base_sha': '', 'build_id': 248934020, 'build_job': '11012.5', 'committed_at': '2017-06-30T20:44:02Z', 'config': {'': {'result': 'configured'}, 'addons': {'apt': {'packages': ['xsltproc', 'xmlstarlet', 'oracle-java8-installer']}}, 'after_success': ['set -e if [[ -n $CMD_AFTER_SUCCESS  && $SKIP_CI == 'false'  ]]; then  eval $CMD_AFTER_SUCCESS;  echo ""CMD_AFTER_SUCCESS is finished""; fi ', 'set -e SKIP_DEPLOY=$(if [ $(git log -1 | grep -E ""\[maven-release-plugin\] prepare release"" | cat | wc -l) -lt 1 ]; then echo false; else echo true; fi;) if [[ $TRAVIS_REPO_SLUG == 'checkstyle/checkstyle'  && $TRAVIS_BRANCH == 'master'  && $TRAVIS_PULL_REQUEST == 'false'  && $DEPLOY == 'true'  && $SKIP_CI == 'false'  && $SKIP_DEPLOY == 'false'  ]]; then  mvn -s config/deploy-settings.xml -Pno-validations deploy;  echo ""deploy to maven snapshot repository is finished""; fi '], 'branches': {'only': ['master']}, 'cache': {'apt': True, 'directories': ['~/.m2']}, 'dist': 'precise', 'env': 'DESC=""findbugs, spotbugs and pmd"" CMD=""mvn clean compile pmd:check findbugs:check spotbugs:check""', 'group': 'stable', 'install': ['if [ ""${TRAVIS_OS_NAME}"" == ""osx"" ]; then  # 
https://github.com/travis-ci/travis-ci/issues/6307#issuecomment-233315824  rvm get head fi '], 'jdk': 'oraclejdk8', 'language': 'java', 'os': 'linux', 'script': ['SKIP_FILES="".github|appveyor.yml|circle.yml|distelli-manifest.yml|fast-forward-merge.sh|LICENSE|LICENSE.apache20|README.md|release.sh|RIGHTS.antlr|shippable.yml|wercker.yml""', 'SKIP_CI=$(if [[ $(git diff --name-only HEAD HEAD~1 | grep -vE ""$SKIP_FILES"" | cat | wc -c | sed 's/^ *//' ) > 0 ]]; then echo false; else echo true; fi;)', 'echo ""SKIP_CI=""$SKIP_CI', 'set -e if [[ $SKIP_CI == 'false' ]]; then  eval $CMD;  echo ""eval of CMD is completed"" fi '], 'sudo': False}, 'failed_tests': '', 'is_git_repo': True, 'job_id': 248934029, 'message': 'minor: fix for checkstyle violations caused by previous commits', 'mismatch_attrs': [], 'num_tests_failed': 0, 'num_tests_run': 0, 'patches': {'casher': '2021-01-05', 'mvn-tls': '2020-03-04', 'ubuntu-precise-apt-links': '2021-06-15'}, 'trigger_sha': 'aef6d33f378e158b0174b29e7bf199b22b5c5426'}",-1,checkstyle/checkstyle,810b5e9ef81fb40d4f88f9dae86b930b2735e431,5.0,5.0,True,"{'status': 'Reproducible', 'time_stamp': '2018-03-06'}",5/5,active,
2,2024-08-07 18:02:14+00:00,False,00cdf9f45087d66089266c98c973a68807f07b99,66b3b6a669c5cc913dc63b42,"{'collection': {'href': 'artifacts', 'title': 'artifacts'}, 'next': {'href': 'artifacts/AgentOps-AI-agentops-26732387115?page=2', 'title': 'next page'}, 'parent': {'href': '/', 'title': 'home'}, 'self': {'href': 'artifacts/AgentOps-AI-agentops-26732387115', 'title': 'Artifact'}}",2024-09-04 00:03:49+00:00,1.2.6,,issue-254-fix,,True,github,AgentOps-AI-agentops-26732387115,,"{'base_sha': '69f856715f49e0096dd2e19b92a44dcd578e0843', 'build_id': 9687549729, 'build_job': '434.5', 'committed_at': '2024-06-26T22:21:26Z', 'component_versions': {'analyzer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832', 'reproducer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832'}, 'config': {'id-in-workflow': 'build', 'runs-on': 'ubuntu-latest', 'steps': [{'uses': 'actions/checkout@v2'}, {'name': 'Set up Python', 'uses': 'actions/setup-python@v2', 'with': {'python-version': '3.11'}}, {'name': 'Install tox', 'run': 'pip install tox'}, {'name': 'Run tests with tox', 'run': 'tox'}], 'strategy': {'matrix': {'python-version': 3.11}}}, 'failed_tests': '(tests.test_canary)#(tests.test_events)#(tests.test_host_env)#(tests.test_record_function)#(tests.test_session)#(tests.test_teardown)#(tests.test_canary)#(tests.test_events)#(tests.test_host_env)#(tests.test_record_function)#(tests.test_session)#(tests.test_teardown)', 'is_git_repo': True, 'job_id': 26732387115, 'message': 'Plumbing Models', 'mismatch_attrs': [], 'num_tests_failed': 12, 'num_tests_run': 12, 'patches': {'remove-ppa': '2024-08-07'}, 'trigger_sha': 'b2378c291123e6ae2de58979bbf69feacf11c6b3'}",,AgentOps-AI-agentops-26732387115,False,Python,1.0,NaT,"{'additions': 3, 'changes': 4, 'deletions': 1, 'num_of_changed_files': 1}","{'base_sha': '69f856715f49e0096dd2e19b92a44dcd578e0843', 'build_id': 9687739109, 'build_job': '435.5', 'committed_at': '2024-06-26T22:39:59Z', 'component_versions': {'analyzer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832', 
'reproducer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832'}, 'config': {'id-in-workflow': 'build', 'runs-on': 'ubuntu-latest', 'steps': [{'uses': 'actions/checkout@v2'}, {'name': 'Set up Python', 'uses': 'actions/setup-python@v2', 'with': {'python-version': '3.11'}}, {'name': 'Install tox', 'run': 'pip install tox'}, {'name': 'Run tests with tox', 'run': 'tox'}], 'strategy': {'matrix': {'python-version': 3.11}}}, 'failed_tests': '', 'is_git_repo': True, 'job_id': 26732915332, 'message': 'Merge branch 'issue-254-test-fix' into issue-254-fix', 'mismatch_attrs': [], 'num_tests_failed': 0, 'num_tests_run': 26, 'patches': {'remove-ppa': '2024-08-07'}, 'trigger_sha': '8ee2218df117229b84a603b24d1f1bd7a5362629'}",269,AgentOps-AI/agentops,640ca65c00b74ef7b3db9a9e723c77451e349d50,3.0,3.0,True,"{'status': 'Reproducible', 'time_stamp': '2024-08-07'}",3/3,active,pytest
3,2024-08-17 00:50:08+00:00,False,34a1e44765a5f047a387e256fd2580a7ad8a2821,66bff3c04657d4e8877e0b36,"{'collection': {'href': 'artifacts', 'title': 'artifacts'}, 'next': {'href': 'artifacts/PyGithub-PyGithub-28064865291?page=2', 'title': 'next page'}, 'parent': {'href': '/', 'title': 'home'}, 'self': {'href': 'artifacts/PyGithub-PyGithub-28064865291', 'title': 'Artifact'}}",2024-09-04 00:04:01+00:00,1.2.6,,main,,True,github,PyGithub-PyGithub-28064865291,,"{'base_sha': '', 'build_id': 10149595758, 'build_job': '3509.4', 'committed_at': '2024-07-29T18:12:40Z', 'component_versions': {'analyzer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832', 'reproducer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832'}, 'config': {'id-in-workflow': 'test', 'name': 'test (Python ${{ matrix.python-version }} on ${{ matrix.os-label }})', 'runs-on': '${{ matrix.os }}', 'steps': [{'uses': 'actions/checkout@v3'}, {'name': 'Set up Python', 'uses': 'actions/setup-python@v4', 'with': {'python-version': '${{ matrix.python-version }}'}}, {'name': 'Install tox', 'run': 'python -m pip install --upgrade pip\npip install tox tox-gh-actions\n\n'}, {'name': 'Run tests', 'run': 'tox'}, {'name': 'Upload coverage to Codecov', 'uses': 'codecov/codecov-action@v3'}], 'strategy': {'fail-fast': False, 'matrix': {'os': 'ubuntu-latest', 'os-label': 'Ubuntu', 'python-version': '3.11'}}}, 'failed_tests': 'tests.Repository::Repository::testAttributes#tests.Repository::Repository::testEditWithAllArguments', 'is_git_repo': True, 'job_id': 28064865291, 'message': 'Add has_discussions field to Repository class (#2995) Add property and configuration of GitHub discussions per repository. 
Fix #2994 --------- Co-authored-by: Enrico Minack <github@enrico.minack.dev>', 'mismatch_attrs': [], 'num_tests_failed': 2, 'num_tests_run': 933, 'patches': {'remove-ppa': '2024-08-16'}, 'trigger_sha': '7213cd05a741bf2c4b5dbddc42d3192f91156865'}",,PyGithub-PyGithub-28064865291,False,Python,1.0,NaT,"{'additions': 0, 'changes': 13, 'deletions': 13, 'num_of_changed_files': 2}","{'base_sha': '', 'build_id': 10150032625, 'build_job': '3512.4', 'committed_at': '2024-07-29T18:46:41Z', 'component_versions': {'analyzer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832', 'reproducer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832'}, 'config': {'id-in-workflow': 'test', 'name': 'test (Python ${{ matrix.python-version }} on ${{ matrix.os-label }})', 'runs-on': '${{ matrix.os }}', 'steps': [{'uses': 'actions/checkout@v3'}, {'name': 'Set up Python', 'uses': 'actions/setup-python@v4', 'with': {'python-version': '${{ matrix.python-version }}'}}, {'name': 'Install tox', 'run': 'python -m pip install --upgrade pip\npip install tox tox-gh-actions\n\n'}, {'name': 'Run tests', 'run': 'tox'}, {'name': 'Upload coverage to Codecov', 'uses': 'codecov/codecov-action@v3'}], 'strategy': {'fail-fast': False, 'matrix': {'os': 'ubuntu-latest', 'os-label': 'Ubuntu', 'python-version': '3.11'}}}, 'failed_tests': '', 'is_git_repo': True, 'job_id': 28066236404, 'message': 'Revert ""Add has_discussions field to Repository class"" (#3009) Reverts PyGithub/PyGithub#2995', 'mismatch_attrs': [], 'num_tests_failed': 0, 'num_tests_run': 933, 'patches': {'remove-ppa': '2024-08-16'}, 'trigger_sha': '38197d64e63b7a5d0b814b6b892a5cfc5a90159d'}",-1,PyGithub/PyGithub,38197d64e63b7a5d0b814b6b892a5cfc5a90159d,3.0,3.0,True,"{'status': 'Reproducible', 'time_stamp': '2024-08-16'}",3/3,active,pytest
4,2024-08-21 23:34:14+00:00,False,34c9b382c95e26ea2c1a6b02ce6a2d94d901f915,66c67976a098d2fc289667ec,"{'collection': {'href': 'artifacts', 'title': 'artifacts'}, 'next': {'href': 'artifacts/weaveworks-grafanalib-27865422011?page=2', 'title': 'next page'}, 'parent': {'href': '/', 'title': 'home'}, 'self': {'href': 'artifacts/weaveworks-grafanalib-27865422011', 'title': 'Artifact'}}",2024-09-04 00:04:04+00:00,1.2.6,,update-dashlist-panel,,True,github,weaveworks-grafanalib-27865422011,,"{'base_sha': '5c3b17edaa437f0bc09b5f1b9275dc8fb91689fb', 'build_id': 10079041985, 'build_job': '1096.1', 'committed_at': '2024-07-24T14:55:41Z', 'component_versions': {'analyzer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832', 'reproducer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832'}, 'config': {'id-in-workflow': 'build-n-publish', 'name': 'Run tests - Python ${{ matrix.python }}', 'runs-on': 'ubuntu-20.04', 'steps': [{'uses': 'actions/checkout@v4'}, {'name': 'Set up Python', 'uses': 'actions/setup-python@v5.1.1', 'with': {'python-version': '${{ matrix.python }}'}}, {'name': 'Run tests', 'run': 'pip3 install tox flake8\nmake deps\nmake all\n\n'}], 'strategy': {'matrix': {'python': '3.8'}}}, 'failed_tests': 'grafanalib.tests.test_core::test_dashboard_list', 'is_git_repo': True, 'job_id': 27865422011, 'message': 'feat: Add filtering by folder to DashboardList', 'mismatch_attrs': [], 'num_tests_failed': 1, 'num_tests_run': 95, 'patches': {'remove-ppa': '2024-08-21'}, 'trigger_sha': '9ded8636fa24f64164406ecb1e4ae758f5b6ffdf'}",,weaveworks-grafanalib-27865422011,False,Python,1.0,NaT,"{'additions': 2, 'changes': 3, 'deletions': 1, 'num_of_changed_files': 2}","{'base_sha': '5c3b17edaa437f0bc09b5f1b9275dc8fb91689fb', 'build_id': 10079174950, 'build_job': '1097.1', 'committed_at': '2024-07-24T15:04:58Z', 'component_versions': {'analyzer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832', 'reproducer': '48b0fdbb1cc40cfba95c3b27f9aec204217e5832'}, 'config': {'id-in-workflow': 'build-n-publish', 'name': 
'Run tests - Python ${{ matrix.python }}', 'runs-on': 'ubuntu-20.04', 'steps': [{'uses': 'actions/checkout@v4'}, {'name': 'Set up Python', 'uses': 'actions/setup-python@v5.1.1', 'with': {'python-version': '${{ matrix.python }}'}}, {'name': 'Run tests', 'run': 'pip3 install tox flake8\nmake deps\nmake all\n\n'}], 'strategy': {'matrix': {'python': '3.8'}}}, 'failed_tests': '', 'is_git_repo': True, 'job_id': 27865894757, 'message': 'chore: Update docs, tests for DashboardList', 'mismatch_attrs': [], 'num_tests_failed': 0, 'num_tests_run': 190, 'patches': {'remove-ppa': '2024-08-21'}, 'trigger_sha': '9c865897c59e6b11b96a80c994cf9201a162f56c'}",670,weaveworks/grafanalib,5c3b17edaa437f0bc09b5f1b9275dc8fb91689fb,3.0,3.0,True,"{'status': 'Reproducible', 'time_stamp': '2024-08-21'}",3/3,active,pytest


In [7]:
# Cell 3: Basic Data Exploration and Quality Report

def generate_data_quality_report(df):
    """
    Summarise missing values and the most common categories in ``df``.

    Returns
    -------
    tuple
        ``(missing_data, value_distributions)`` where ``missing_data`` is a
        DataFrame indexed by column name with the null count and the null
        percentage (rounded to 2 decimals), and ``value_distributions`` maps
        each inspected categorical column present in ``df`` to its ten most
        frequent values.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()

    # Missing value report
    missing_data = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing Percentage': (null_counts / n_rows * 100).round(2),
    })

    # Top-10 value counts for the categorical columns that exist in df
    value_distributions = {}
    for column in ('lang', 'build_system', 'ci_service', 'status', 'test_framework'):
        if column in df.columns:
            value_distributions[column] = df[column].value_counts().head(10)

    return missing_data, value_distributions

# Build the report for the loaded table, then print each part as plain text
missing_report, cat_distributions = generate_data_quality_report(df)

print("Missing Data Report:", missing_report, sep="\n")
print("\nValue Distributions for Key Categorical Columns:")
for col, dist in cat_distributions.items():
    # Blank line, column name as a heading, then its top value counts
    print(f"\n{col}:", dist, sep="\n")

Missing Data Report:
                        Missing Count  Missing Percentage
_created                            0                0.00
_deleted                            0                0.00
_etag                               0                0.00
_id                                 0                0.00
_links                              0                0.00
_updated                            0                0.00
added_version                       0                0.00
base_branch                      2629               79.09
branch                              0                0.00
build_system                     1264               38.03
cached                              0                0.00
ci_service                          0                0.00
current_image_tag                   0                0.00
deprecated_version               3324              100.00
failed_job                          0                0.00
filtered_reason                  3324              