In [8]:
# Notebooks/artifact-data-check.ipynb

# Cell 1: Load Data & Basic Overview
import os
from pathlib import Path

import pandas as pd

# Locate the processed-data directory without baking in one developer's home
# directory. The default is relative to this notebook's own folder
# (Notebooks/), so it assumes the kernel's working directory is that folder.
# Set the FAILFIX_DATA_DIR environment variable to point somewhere else.
data_dir = Path(os.environ.get("FAILFIX_DATA_DIR", "../Data/processed"))
csv_file = data_dir / "artifact_data_table_1.csv"
df = pd.read_csv(csv_file)

# Quick look at the table: dimensions, the first rows, and per-column
# dtypes / non-null counts.
print("=== Basic Overview ===")
print(f"DataFrame Shape: {df.shape}\n")
print("First 5 rows:")
print(df.head(), "\n")
print("DataFrame Info:")
df.info()

=== Basic Overview ===
DataFrame Shape: (2552, 33)

First 5 rows:
               _created  _deleted                                     _etag  \
0  2018-08-24T08:01:18Z     False  f0ba3f7059f911957b0c99ba4b82c23455bbc734   
1  2018-03-06T04:13:22Z     False  4349909dfe62bd5dd30c5b1b24951dbdc14a4f31   
2  2024-08-07T18:02:14Z     False  00cdf9f45087d66089266c98c973a68807f07b99   
3  2018-08-23T19:54:33Z     False  e34b1d72fc1818418aee37f14d5ba24955b2f6e0   
4  2018-03-08T04:06:54Z     False  21036a72909701d2476809566e99151a89ef7907   

                        _id  \
0  5b7fbb4e37be5b494c9f9f0b   
1  5a9e1562f1f70e072f3a6672   
2  66b3b6a669c5cc913dc63b42   
3  5b7f10f937be5b494c9f9bf3   
4  5aa0b6def1f70e0f2d0ce13e   

                                              _links              _updated  \
0  {"collection": {"href": "artifacts", "title": ...  2024-07-02T16:30:43Z   
1  {"collection": {"href": "artifacts", "title": ...  2024-07-02T16:29:29Z   
2  {"collection": {"href": "artifacts"

In [9]:
# Cell 2: Summary Statistics & Missing Values
# Descriptive statistics as produced by DataFrame.describe().
print("\n=== Summary Statistics for Numeric Columns ===")
numeric_summary = df.describe()
print(numeric_summary)

# Count of missing entries in each column.
print("\n=== Missing Values per Column ===")
missing_per_column = df.isna().sum()
print(missing_per_column)


=== Summary Statistics for Numeric Columns ===
       deprecated_version  filtered_reason        match        pr_num  \
count                 0.0              0.0  2003.000000   2552.000000   
mean                  NaN              NaN     0.999501    924.192790   
std                   NaN              NaN     0.022344   2191.062655   
min                   NaN              NaN     0.000000     -1.000000   
25%                   NaN              NaN     1.000000     -1.000000   
50%                   NaN              NaN     1.000000     -1.000000   
75%                   NaN              NaN     1.000000    331.000000   
max                   NaN              NaN     1.000000  13830.000000   

       reproduce_attempts  reproduce_successes  
count         2552.000000          2552.000000  
mean             4.833856             4.795846  
std              0.550669             0.633148  
min              3.000000             1.000000  
25%              5.000000             5.000000  


In [3]:
# Cell 3: Check for Duplicate Rows
# Flag every row that exactly repeats an earlier row, then count the flags.
duplicate_mask = df.duplicated()
num_duplicates = duplicate_mask.sum()
print(
    "\n=== Duplicate Rows Check ===",
    f"Number of duplicate rows: {num_duplicates}",
    sep="\n",
)


=== Duplicate Rows Check ===
Number of duplicate rows: 0


In [4]:
# Cell 4: Validate Timestamp Columns
from datetime import datetime

def parse_timestamp(ts_str: str) -> "datetime | None":
    """Parse a timestamp of the exact form ``YYYY-MM-DDTHH:MM:SSZ``.

    Args:
        ts_str: The timestamp text to parse. Non-string values (for example
            ``None`` or a float NaN) are tolerated and treated as invalid.

    Returns:
        A naive ``datetime`` when ``ts_str`` matches the format, otherwise
        ``None``. The trailing ``Z`` is matched as a literal character, so no
        timezone information is attached to the result.
    """
    try:
        return datetime.strptime(ts_str, '%Y-%m-%dT%H:%M:%SZ')
    except (ValueError, TypeError):
        # ValueError: the text does not match the format (or is not a real
        # calendar date). TypeError: the input is not a string at all.
        # Anything else is unexpected and should surface, not be hidden.
        return None

# Run the parser over every non-null 'failed_commit' value; entries that
# come back as None are the ones that did not match the expected format.
failed_present = df['failed_commit'].dropna()
failed_parsed = failed_present.apply(parse_timestamp)
invalid_failed = failed_parsed.isna().sum()

# Same check for 'passed_commit'.
passed_present = df['passed_commit'].dropna()
passed_parsed = passed_present.apply(parse_timestamp)
invalid_passed = passed_parsed.isna().sum()

# Report totals and failures for both columns in one go.
timestamp_report = [
    "\n=== Timestamp Validation ===",
    f"Total 'failed_commit' entries (non-null): {len(failed_present)}",
    f"Invalid 'failed_commit' timestamps: {invalid_failed}",
    f"Total 'passed_commit' entries (non-null): {len(passed_present)}",
    f"Invalid 'passed_commit' timestamps: {invalid_passed}",
]
print("\n".join(timestamp_report))


=== Timestamp Validation ===
Total 'failed_commit' entries (non-null): 2529
Invalid 'failed_commit' timestamps: 0
Total 'passed_commit' entries (non-null): 2529
Invalid 'passed_commit' timestamps: 0


In [5]:
# Cell 5: Analyze the 'time_to_fix_hours' Column
print("\n=== 'time_to_fix_hours' Summary Statistics ===")
time_to_fix = df['time_to_fix_hours']
print(time_to_fix.describe())

# A negative duration would be unexpected, so count how many there are.
negative_time_count = int(time_to_fix.lt(0).sum())
print(f"\nNumber of negative 'time_to_fix_hours' entries: {negative_time_count}")

# The 99th percentile serves as one reference point for spotting
# unusually large values.
percentile_99 = time_to_fix.quantile(0.99)
print(f"99th percentile of 'time_to_fix_hours': {percentile_99:.2f} hours")


=== 'time_to_fix_hours' Summary Statistics ===
count    2529.000000
mean       22.854815
std       200.418925
min         0.000556
25%         0.135278
50%         0.453611
75%         3.701667
max      5058.024444
Name: time_to_fix_hours, dtype: float64

Number of negative 'time_to_fix_hours' entries: 0
99th percentile of 'time_to_fix_hours': 318.98 hours


In [7]:
# Cell 6: Explore Categorical Variables (Language & Exceptions)
# Frequency of each value in the 'lang' column.
print("\n=== Unique Languages & Their Counts ===")
language_counts = df['lang'].value_counts()
print(language_counts)

# Distinct non-null strings stored in the 'exceptions' column.
print("\n=== Unique Exceptions (Non-null) ===")
exceptions_present = df['exceptions'].dropna()
unique_exceptions = exceptions_present.unique()
print(unique_exceptions)


=== Unique Languages & Their Counts ===
lang
Java      1758
Python     771
Name: count, dtype: int64

=== Unique Exceptions (Non-null) ===
['AssertionError' 'ImportError' 'NullPointerException' 'AssertionFailure'
 'ComparisonFailure' 'AttributeError'
 'InterruptedIOException;RetrofitError' 'WantedButNotInvoked'
 'JSONException;InvocationTargetException;IllegalArgumentException'
 'ValueError' 'PipeRunException;NullPointerException;AssertionFailedError'
 'AssertionError;NullPointerException;ComparisonFailure'
 'ImportError;ValueError' 'ArrayIndexOutOfBoundsException'
 'BusinessServiceException' 'AssertionFailedError'
 'AssertionError;IllegalArgumentException' 'URLError;UnboundLocalError'
 'ImportError;ModuleNotFoundError'
 'DuplicateMappingException;BeanCreationException;IllegalStateException'
 'NullPointerException;BuilderException;PersistenceException;Exception'
 'SQLFeatureNotSupportedException' 'StringIndexOutOfBoundsException'
 'IllegalStateException' 'UnboundLocalError' 'RuntimeEr