In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files

# Location of the input dataset; change this when the file lives elsewhere.
file_path = "/content/FaultProneness_4metrics.csv"

# Load the dataset into a DataFrame.
df = pd.read_csv(file_path)

# Trim leading/trailing whitespace from every header so that the exact-name
# column lookups performed later are not defeated by stray spaces.
df.columns = [header.strip() for header in df.columns]

# Columns holding the test-smell measurements to be correlated.
# NOTE(review): "Total_Smells" looks like an aggregate over the individual
# smells - confirm against the dataset.
test_smell_columns = [
    "Assertion Roulette",
    "Conditional Test Logic",
    "Constructor Initialization",
    "Duplicate Assertion",
    "Empty Test",
    "Exception Handling",
    "General Fixture",
    "Lack of Cohesion of Test Cases",
    "Magic Number Test",
    "Obscure In-Line Setup",
    "Redundant Assertion",
    "Redundant Print",
    "Sleepy Test",
    "Suboptimal Assert",
    "Test Maverick",
    "Total_Smells",
]

# Columns holding the fault-proneness metrics each smell is compared against.
fault_metric_columns = [
    "Prod_FaultFreq",
    "Prod_FaultExtension",
    "Test_FaultyFreq",
    "Test_FaultExt",
]

# Fail fast, naming every absent column, before any statistics are computed.
required_columns = test_smell_columns + fault_metric_columns
available_columns = set(df.columns)
missing_cols = [name for name in required_columns if name not in available_columns]
if missing_cols:
    raise ValueError(f"Missing columns in the dataset: {missing_cols}")

# Result tables: one row per test smell, one column per fault metric.
# They start out holding the "not computed" values (NaN correlation, NaN
# p-value, not significant) so that pairs with too little data need no
# further handling.
corr_matrix = pd.DataFrame(
    np.nan, index=test_smell_columns, columns=fault_metric_columns, dtype=float
)
pval_matrix = pd.DataFrame(
    np.nan, index=test_smell_columns, columns=fault_metric_columns, dtype=float
)
sig_matrix = pd.DataFrame(
    0, index=test_smell_columns, columns=fault_metric_columns, dtype=int
)

# Spearman correlation for every (smell, metric) pair.
for smell in test_smell_columns:
    for metric in fault_metric_columns:
        # Keep only the rows where both values are present.
        paired = df[[smell, metric]].dropna()
        if len(paired) <= 1:
            # Fewer than two observations: leave the defaults in place.
            continue

        rho, p_value = spearmanr(paired[smell], paired[metric])
        corr_matrix.at[smell, metric] = rho
        pval_matrix.at[smell, metric] = p_value
        # Flag as significant (1) when p < 0.05; a NaN p-value compares
        # False and therefore stays 0.
        sig_matrix.at[smell, metric] = int(p_value < 0.05)

# Round for presentation: 3 decimals for correlations, 4 for p-values.
corr_matrix = corr_matrix.round(3)
pval_matrix = pval_matrix.round(4)

def _export_heatmap(matrix, title, png_name, **heatmap_style):
    """Draw an annotated heatmap of *matrix*, save it, show it and download it.

    The figure is written to *png_name* before being displayed, then handed
    to the Colab file-download helper. Extra keyword arguments are passed
    straight through to ``sns.heatmap``.
    """
    plt.figure(figsize=(10, 12))
    sns.heatmap(matrix, annot=True, **heatmap_style)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(png_name)
    plt.show()
    files.download(png_name)


# Output file names, listed in the order they are reported at the end.
csv_outputs = ('fp_spearman_correlations.csv', 'fp_significance_matrix.csv')
png_outputs = ('fp_correlation_heatmap.png', 'fp_significance_heatmap.png')

# Write the correlation and significance tables to CSV.
corr_matrix.to_csv(csv_outputs[0])
sig_matrix.to_csv(csv_outputs[1])

# Correlation heatmap: diverging palette centred on zero, 3-decimal labels.
_export_heatmap(
    corr_matrix,
    'Spearman Correlation Heatmap – Fault Proneness',
    png_outputs[0],
    cmap='coolwarm',
    center=0,
    fmt='.3f',
)

# Significance heatmap: integer labels, no colour bar.
_export_heatmap(
    sig_matrix,
    'Significance Matrix – Fault Proneness (1 = Significant)',
    png_outputs[1],
    cmap='Blues',
    cbar=False,
    fmt='d',
)

# Download the CSVs once both figures have been handled.
for csv_name in csv_outputs:
    files.download(csv_name)

# Summary of everything that was generated.
print(" Fault Proneness analysis complete. Files generated:")
for generated_name in csv_outputs + png_outputs:
    print("- " + generated_name)
