In [1]:
# Notebooks/analyze-artifact-data.ipynb

# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure display options for pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Set seaborn aesthetic parameters
sns.set(style='whitegrid', context='notebook', palette='deep')

# Enable inline plotting for Jupyter Notebook
%matplotlib inline

In [2]:
# Cell 2: Load and Preview the Data

# Define the path to the CSV file
data_path = '/Users/harshil/Developer/GitHub_Repos/FailFix/Data/processed/artifact_data_final.csv'

# Load the data into a pandas DataFrame
df = pd.read_csv(data_path)

# Display the first five rows of the DataFrame
df.head()


Unnamed: 0,repo,lang,build_system,test_framework,ci_service,image_tag,failed_commit,passed_commit,time_to_fix_hours,classification_code,classification_test,classification_build,exceptions,failed_tests,num_tests_failed,num_tests_run,metrics_additions,metrics_deletions,metrics_changes,metrics_num_files_changed
0,wbond/package_control_channel,Python,,unittest,travis,wbond-package_control_channel-379589051,2018-05-16T07:40:38Z,2018-05-17T08:06:11Z,24.425833,No,No,No,AssertionError,"test_repository_package_names('./repository/c.json', ...) (tests.test.DefaultRepositoryTests)",1,8451,43,9,52,5
1,checkstyle/checkstyle,Java,Maven,,travis,checkstyle-checkstyle-248927615,2017-06-30T20:23:46Z,2017-06-30T20:44:02Z,0.337778,No,Yes,No,,,0,0,6,4,10,2
2,square/retrofit,Java,Maven,JUnit,travis,square-retrofit-65662934,2015-06-06T04:35:33Z,2015-06-08T02:55:01Z,46.324444,Partial,Partial,No,AssertionError,typeVariableNoBoundThrows(retrofit.RestAdapterTest)#typeVariableUpperBoundThrows(retrofit.RestAdapterTest)#typeVariableNestedThrows(retrofit.RestAdapterTest),3,119,80,68,148,4
3,traccar/traccar,Java,Maven,JUnit,travis,tananaev-traccar-213175603,2017-03-20T13:42:22Z,2017-03-20T13:45:07Z,0.045833,No,Yes,No,AssertionError,testDecode(org.traccar.protocol.Gl200ProtocolDecoderTest),1,237,1,1,2,1
4,SonarSource/sonar-php,Java,Maven,JUnit,travis,SonarSource-sonar-php-206563528,2017-03-01T11:04:23Z,2017-03-01T14:17:00Z,3.210278,No,Yes,No,NullPointerException,shouldReportStatusCounts(org.sonar.plugins.php.phpunit.PhpUnitTestFileReportTest)#shouldReportZeroTestsIfEmpty(org.sonar.plugins.php.phpunit.PhpUnitTestFileReportTest)#shouldNotCountSkippedTests(org.sonar.plugins.php.phpunit.PhpUnitTestFileReportTest)#shouldReportNoSuccessDensityIfNoLiveTests(org.sonar.plugins.php.phpunit.PhpUnitTestFileReportTest),4,697,4,11,15,3


In [3]:
# Cell 3: Data Overview and Summary Statistics

# Get the shape of the dataset
num_rows, num_cols = df.shape
print(f"The dataset contains {num_rows} rows and {num_cols} columns.\n")

# Display data types of each column
print("Data Types:\n")
print(df.dtypes)
print("\n")

# Generate summary statistics for numerical columns
numerical_cols = ['time_to_fix_hours', 'num_tests_failed', 'num_tests_run', 
                  'metrics_additions', 'metrics_deletions', 'metrics_changes', 
                  'metrics_num_files_changed']

print("Summary Statistics for Numerical Columns:\n")
df[numerical_cols].describe().transpose()

The dataset contains 2208 rows and 20 columns.

Data Types:

repo                          object
lang                          object
build_system                  object
test_framework                object
ci_service                    object
image_tag                     object
failed_commit                 object
passed_commit                 object
time_to_fix_hours            float64
classification_code           object
classification_test           object
classification_build          object
exceptions                    object
failed_tests                  object
num_tests_failed               int64
num_tests_run                  int64
metrics_additions              int64
metrics_deletions              int64
metrics_changes                int64
metrics_num_files_changed      int64
dtype: object


Summary Statistics for Numerical Columns:



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
time_to_fix_hours,2208.0,24.99648,213.945049,0.000556,0.134722,0.463472,4.163403,5058.024444
num_tests_failed,2208.0,4.799819,68.261217,0.0,0.0,0.0,1.0,1821.0
num_tests_run,2208.0,1433.891304,2504.996313,0.0,0.0,47.0,1596.0,10006.0
metrics_additions,2208.0,112.811141,533.042384,0.0,1.0,5.0,26.0,9603.0
metrics_deletions,2208.0,58.969656,565.229843,0.0,1.0,3.0,15.0,24203.0
metrics_changes,2208.0,171.780797,848.937848,0.0,3.0,9.0,42.0,24203.0
metrics_num_files_changed,2208.0,5.95154,19.270189,0.0,1.0,1.0,3.0,394.0


In [4]:
# Cell 4: Handling Missing Values and Data Cleaning

# Check for missing values in each column
missing_values = df.isnull().sum()
print("Missing Values in Each Column:\n")
print(missing_values)

# List of columns we expect to be numerical
numerical_cols = ['time_to_fix_hours', 'num_tests_failed', 'num_tests_run', 
                  'metrics_additions', 'metrics_deletions', 'metrics_changes', 
                  'metrics_num_files_changed']

# Convert numerical columns to numeric data types, handling errors
for col in numerical_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# After conversions, check for any remaining missing values in numericals
print("\nMissing Values After Conversion:\n")
print(df[numerical_cols].isnull().sum())

# Drop rows with missing 'time_to_fix_hours' (should be none based on preprocessing)
df = df.dropna(subset=['time_to_fix_hours'])

# Fill missing values in other numerical columns with zeros (if appropriate)
df[numerical_cols[1:]] = df[numerical_cols[1:]].fillna(0)

# Verify that there are no missing values left in numerical columns
print("\nFinal Check for Missing Values in Numerical Columns:\n")
print(df[numerical_cols].isnull().sum())

Missing Values in Each Column:

repo                            0
lang                            0
build_system                  679
test_framework               1054
ci_service                      0
image_tag                       0
failed_commit                   0
passed_commit                   0
time_to_fix_hours               0
classification_code             3
classification_test             3
classification_build            3
exceptions                   1111
failed_tests                 1163
num_tests_failed                0
num_tests_run                   0
metrics_additions               0
metrics_deletions               0
metrics_changes                 0
metrics_num_files_changed       0
dtype: int64

Missing Values After Conversion:

time_to_fix_hours            0
num_tests_failed             0
num_tests_run                0
metrics_additions            0
metrics_deletions            0
metrics_changes              0
metrics_num_files_changed    0
dtype: int64

Final Che

In [5]:
# Cell 5: Distribution of Time to Fix

# plt.figure(figsize=(12, 6))

# # Histogram of 'time_to_fix_hours' with Kernel Density Estimate (KDE)
# sns.histplot(df['time_to_fix_hours'], bins=50, kde=True, color='skyblue')

# plt.title('Distribution of Time to Fix (in Hours)')
# plt.xlabel('Time to Fix (Hours)')
# plt.ylabel('Number of Artifacts')
# plt.tight_layout()
# plt.show()

# Statistical summary
mean_fix_time = df['time_to_fix_hours'].mean()
median_fix_time = df['time_to_fix_hours'].median()
std_fix_time = df['time_to_fix_hours'].std()

print(f"Mean Time to Fix: {mean_fix_time:.2f} hours")
print(f"Median Time to Fix: {median_fix_time:.2f} hours")
print(f"Standard Deviation of Time to Fix: {std_fix_time:.2f} hours")

# Identify outliers using the IQR method
Q1 = df['time_to_fix_hours'].quantile(0.25)
Q3 = df['time_to_fix_hours'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[df['time_to_fix_hours'] > upper_bound]
num_outliers = outliers.shape[0]
percentage_outliers = (num_outliers / df.shape[0]) * 100

print(f"\nNumber of Outliers: {num_outliers} ({percentage_outliers:.2f}% of data)")

# Visualize the distribution with a boxplot
# plt.figure(figsize=(12, 2))
# sns.boxplot(x=df['time_to_fix_hours'], color='lightgreen')
# plt.title('Boxplot of Time to Fix (in Hours)')
# plt.xlabel('Time to Fix (Hours)')
# plt.tight_layout()
# plt.show()

Mean Time to Fix: 25.00 hours
Median Time to Fix: 0.46 hours
Standard Deviation of Time to Fix: 213.95 hours

Number of Outliers: 421 (19.07% of data)


In [6]:
# Cell 6: Impact of Exception Types on Time to Fix

# Ensure that 'exceptions' is of string type
df['exceptions'] = df['exceptions'].astype(str)

# Explode the 'exceptions' column
df_exceptions = df.copy()
df_exceptions['exceptions_list'] = df_exceptions['exceptions'].str.split(';')
df_exploded = df_exceptions.explode('exceptions_list')

# Remove any leading/trailing whitespace and filter out empty strings
df_exploded['exceptions_list'] = df_exploded['exceptions_list'].str.strip()
df_exploded = df_exploded[df_exploded['exceptions_list'] != '']

# Calculate average time to fix for each exception
exception_fix_time = df_exploded.groupby('exceptions_list')['time_to_fix_hours'].agg(['mean', 'count']).reset_index()
exception_fix_time.rename(columns={'mean': 'avg_time_to_fix', 'count': 'occurrences'}, inplace=True)

# Filter exceptions with a minimum number of occurrences to ensure statistical significance
min_occurrences = 5
exception_fix_time_filtered = exception_fix_time[exception_fix_time['occurrences'] >= min_occurrences]

# Sort exceptions by average time to fix
exception_fix_time_sorted = exception_fix_time_filtered.sort_values(by='avg_time_to_fix', ascending=False)

# Display the top 10 exceptions with the highest average time to fix
top_exceptions = exception_fix_time_sorted.head(10)
print("Top 10 Exceptions with Highest Average Time to Fix:\n")
print(top_exceptions)

# # Visualize the results
# plt.figure(figsize=(12, 6))
# sns.barplot(data=top_exceptions, x='avg_time_to_fix', y='exceptions_list', palette='viridis')

# plt.title('Top 10 Exceptions by Average Time to Fix')
# plt.xlabel('Average Time to Fix (Hours)')
# plt.ylabel('Exception Type')
# plt.tight_layout()
# plt.show()


Top 10 Exceptions with Highest Average Time to Fix:

                 exceptions_list  avg_time_to_fix  occurrences
84              RuntimeException       431.029333           10
46         IllegalStateException       138.443828           32
22             ComparisonFailure        55.828008           81
45      IllegalArgumentException        45.879400           31
6                 AssertionError        33.074116          513
81   RestClientResponseException        28.817944            5
119                          nan        17.555986         1111
62              NoSuchFieldError        13.084444            5
47                   ImportError        12.447172           22
8               AssertionFailure        11.232580           38


In [7]:
# Cell 7: Influence of Failure Classification on Time to Fix

# Create a combined classification column
df['classification_combined'] = (
    'Code: ' + df['classification_code'] + ', ' +
    'Test: ' + df['classification_test'] + ', ' +
    'Build: ' + df['classification_build']
)

# Calculate average time to fix for each classification
classification_fix_time = df.groupby('classification_combined')['time_to_fix_hours'].agg(['mean', 'count']).reset_index()
classification_fix_time.rename(columns={'mean': 'avg_time_to_fix', 'count': 'occurrences'}, inplace=True)

# Display the classifications sorted by average time to fix
classification_fix_time_sorted = classification_fix_time.sort_values(by='avg_time_to_fix', ascending=False)
print("Average Time to Fix by Failure Classification:\n")
print(classification_fix_time_sorted)

# # Visualize the results
# plt.figure(figsize=(14, 8))
# sns.barplot(data=classification_fix_time_sorted, x='avg_time_to_fix', y='classification_combined', palette='coolwarm')

# plt.title('Average Time to Fix by Failure Classification')
# plt.xlabel('Average Time to Fix (Hours)')
# plt.ylabel('Failure Classification')
# plt.tight_layout()
# plt.show()


Average Time to Fix by Failure Classification:

                         classification_combined  avg_time_to_fix  occurrences
9   Code: Partial, Test: Partial, Build: Partial       168.226851           98
8        Code: Partial, Test: Partial, Build: No        43.636089          428
6             Code: Partial, Test: No, Build: No        30.822722           70
4        Code: No, Test: Partial, Build: Partial        29.720958           20
0                  Code: No, Test: No, Build: No        26.053962          307
7        Code: Partial, Test: No, Build: Partial        13.661250           34
5                 Code: No, Test: Yes, Build: No        13.158641          386
2                 Code: No, Test: No, Build: Yes        11.875470          107
3             Code: No, Test: Partial, Build: No         3.890202          113
10                Code: Yes, Test: No, Build: No         2.881019          630
1             Code: No, Test: No, Build: Partial         2.788819           12


In [8]:
# Cell 8: Correlation Between Code Metrics and Time to Fix

# List of code metric columns
metrics_cols = ['metrics_additions', 'metrics_deletions', 'metrics_changes', 'metrics_num_files_changed']

# # Create a pair plot for time to fix and code metrics
# sns.pairplot(df, vars=metrics_cols + ['time_to_fix_hours'], kind='scatter', diag_kind='kde', plot_kws={'alpha': 0.5})
# plt.suptitle('Pair Plot of Code Metrics and Time to Fix', y=1.02)
# plt.show()

# Calculate correlation matrix
corr_matrix = df[metrics_cols + ['time_to_fix_hours']].corr()

print("Correlation Matrix:\n")
print(corr_matrix)

# # Visualize the correlation matrix using a heatmap
# plt.figure(figsize=(8, 6))
# sns.heatmap(corr_matrix, annot=True, cmap='Blues', fmt=".2f", vmin=-1, vmax=1)
# plt.title('Correlation Matrix of Code Metrics and Time to Fix')
# plt.tight_layout()
# plt.show()

# # Scatter plots with regression lines
# for col in metrics_cols:
#     plt.figure(figsize=(8, 5))
#     sns.regplot(x=col, y='time_to_fix_hours', data=df, scatter_kws={'alpha':0.3}, line_kws={'color':'red'})
#     plt.xlabel(col.replace('_', ' ').title())
#     plt.ylabel('Time to Fix (Hours)')
#     plt.title(f'Time to Fix vs. {col.replace("_", " ").title()}')
#     plt.tight_layout()
#     plt.show()

Correlation Matrix:

                           metrics_additions  metrics_deletions  \
metrics_additions                   1.000000           0.194291   
metrics_deletions                   0.194291           1.000000   
metrics_changes                     0.757254           0.787802   
metrics_num_files_changed           0.675141           0.252654   
time_to_fix_hours                   0.278012           0.125551   

                           metrics_changes  metrics_num_files_changed  \
metrics_additions                 0.757254                   0.675141   
metrics_deletions                 0.787802                   0.252654   
metrics_changes                   1.000000                   0.592136   
metrics_num_files_changed         0.592136                   1.000000   
time_to_fix_hours                 0.258154                   0.279340   

                           time_to_fix_hours  
metrics_additions                   0.278012  
metrics_deletions                   0.12555

In [9]:
# Cell 9: Programming Language Analysis

# Calculate average and median time to fix per programming language
lang_fix_time = df.groupby('lang')['time_to_fix_hours'].agg(['mean', 'median', 'count']).reset_index()
lang_fix_time.rename(columns={'mean': 'avg_time_to_fix', 'median': 'median_time_to_fix', 'count': 'artifact_count'}, inplace=True)

print("Average and Median Time to Fix by Programming Language:\n")
print(lang_fix_time.sort_values(by='avg_time_to_fix', ascending=False))

# # Visualize the distribution of time to fix per programming language
# plt.figure(figsize=(12, 6))
# sns.boxplot(data=df, x='lang', y='time_to_fix_hours', palette='Set2')

# plt.title('Distribution of Time to Fix by Programming Language')
# plt.xlabel('Programming Language')
# plt.ylabel('Time to Fix (Hours)')
# plt.yscale('log')  # Use logarithmic scale to handle wide range of values
# plt.tight_layout()
# plt.show()

# Additional statistical test: ANOVA to check if differences are significant
from scipy.stats import f_oneway

# Prepare data for ANOVA
languages = df['lang'].unique()
lang_groups = [df[df['lang'] == lang]['time_to_fix_hours'] for lang in languages]

# Perform one-way ANOVA
anova_result = f_oneway(*lang_groups)

print(f"ANOVA Results:\nF-statistic: {anova_result.statistic:.2f}, p-value: {anova_result.pvalue:.4f}")

if anova_result.pvalue < 0.05:
    print("The differences in time to fix between programming languages are statistically significant.")
else:
    print("The differences in time to fix between programming languages are not statistically significant.")


Average and Median Time to Fix by Programming Language:

     lang  avg_time_to_fix  median_time_to_fix  artifact_count
0    Java        28.006734            0.460556            1529
1  Python        18.217868            0.503889             679
ANOVA Results:
F-statistic: 0.98, p-value: 0.3212
The differences in time to fix between programming languages are not statistically significant.


In [10]:
# Cell 10: Influence of Test Frameworks on Time to Fix

# Identify the most common test frameworks
test_framework_counts = df['test_framework'].value_counts()
print("Most Common Test Frameworks:\n")
print(test_framework_counts.head(10))

# Filter out entries with missing or unknown test frameworks
df_tf = df[df['test_framework'].notnull() & (df['test_framework'] != '')]

# Calculate average time to fix per test framework
tf_fix_time = df_tf.groupby('test_framework')['time_to_fix_hours'].agg(['mean', 'count']).reset_index()
tf_fix_time.rename(columns={'mean': 'avg_time_to_fix', 'count': 'artifact_count'}, inplace=True)

# Filter test frameworks with a minimum number of occurrences
min_artifacts = 10
tf_fix_time_filtered = tf_fix_time[tf_fix_time['artifact_count'] >= min_artifacts]

# Sort by average time to fix
tf_fix_time_sorted = tf_fix_time_filtered.sort_values(by='avg_time_to_fix', ascending=False)

print("\nAverage Time to Fix by Test Framework (with at least 10 artifacts):\n")
print(tf_fix_time_sorted)

# # Visualize the results
# plt.figure(figsize=(12, 6))
# sns.barplot(data=tf_fix_time_sorted, x='avg_time_to_fix', y='test_framework', palette='magma')

# plt.title('Average Time to Fix by Test Framework')
# plt.xlabel('Average Time to Fix (Hours)')
# plt.ylabel('Test Framework')
# plt.tight_layout()
# plt.show()

Most Common Test Frameworks:

test_framework
JUnit           687
unittest        392
pytest           73
testng#JUnit      2
Name: count, dtype: int64

Average Time to Fix by Test Framework (with at least 10 artifacts):

  test_framework  avg_time_to_fix  artifact_count
0          JUnit        38.173228             687
3       unittest        10.690843             392
1         pytest         6.049361              73


In [11]:
# Cell 11: Identifying Repositories with Unusual Fix Times

# Calculate average time to fix per repository
repo_fix_time = df.groupby('repo')['time_to_fix_hours'].agg(['mean', 'median', 'count']).reset_index()
repo_fix_time.rename(columns={'mean': 'avg_time_to_fix', 'median': 'median_time_to_fix', 'count': 'artifact_count'}, inplace=True)

# Filter repositories with a minimum number of artifacts
min_artifacts = 5
repo_fix_time_filtered = repo_fix_time[repo_fix_time['artifact_count'] >= min_artifacts]

# Sort repositories by average time to fix
repo_fix_time_sorted = repo_fix_time_filtered.sort_values(by='avg_time_to_fix', ascending=False)

# Top 10 repositories with the highest average time to fix
top_slow_repos = repo_fix_time_sorted.head(10)
print("Top 10 Repositories with Highest Average Time to Fix:\n")
print(top_slow_repos)

# Top 10 repositories with the lowest average time to fix
top_fast_repos = repo_fix_time_sorted.tail(10)
print("\nTop 10 Repositories with Lowest Average Time to Fix:\n")
print(top_fast_repos)

# Visualize the results
# Combine the top and bottom repositories for visualization
top_bottom_repos = pd.concat([top_slow_repos, top_fast_repos])

# plt.figure(figsize=(14, 8))
# sns.barplot(data=top_bottom_repos, x='avg_time_to_fix', y='repo', palette='Spectral')

# plt.title('Repositories with Unusual Average Time to Fix')
# plt.xlabel('Average Time to Fix (Hours)')
# plt.ylabel('Repository')
# plt.tight_layout()
# plt.show()

Top 10 Repositories with Highest Average Time to Fix:

                                repo  avg_time_to_fix  median_time_to_fix  \
73                   ocpsoft/rewrite      1229.184048            1.276667   
82               petergeneric/stdlib      1124.205926            0.156667   
37                  charite/jannovar       685.058380          707.373056   
64                 kairosdb/kairosdb       203.476438          104.273333   
100                      square/wire        73.910873            1.040000   
91                 sannies/mp4parser        60.248704            7.113611   
95    spring-projects/spring-hateoas        59.519559           16.690556   
48                       google/auto        51.578611           22.605417   
94   spring-projects/spring-data-jpa        45.267814            0.861389   
103          terasolunaorg/guideline        39.783861            1.136111   

     artifact_count  
73                7  
82                9  
37                6  
64       