In [None]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files

# Load the metrics table. Header cells may carry leading/trailing whitespace,
# so every column label is stripped before any name-based lookup is done.
file_path = "/content/Cp_4metrics.csv"
df = pd.read_csv(file_path).rename(columns=str.strip)

# Column groups used by the analysis. List order matters: it fixes the row
# order (test smells) and the column order (change metrics) of the result
# matrices that are indexed by these lists.
test_smell_columns = [
    "Assertion Roulette", "Conditional Test Logic",
    "Constructor Initialization", "Duplicate Assertion",
    "Empty Test", "Exception Handling",
    "General Fixture", "Lack of Cohesion of Test Cases",
    "Magic Number Test", "Obscure In-Line Setup",
    "Redundant Assertion", "Redundant Print",
    "Sleepy Test", "Suboptimal Assert",
    "Test Maverick",
] + ["Total_Smells"]  # presumably the per-row aggregate of the smells above — confirm

# Change/fault metrics each test smell is correlated against.
change_metric_columns = [
    "Prod_FaultFreq", "Prod_FaultExtension",
    "Test_FaultyFreq", "Test_FaultExt",
]

# Fail fast: every requested column must be present in the loaded table
# before any statistics are computed. The error lists the absent names in
# the order they were requested (smells first, then change metrics).
available_columns = set(df.columns)
missing_cols = [
    name
    for name in (*test_smell_columns, *change_metric_columns)
    if name not in available_columns
]
if len(missing_cols) > 0:
    raise ValueError(f"Missing columns in the dataset: {missing_cols}")

# Spearman rank correlation of every test smell against every change metric.
# Results are gathered one row per smell and then turned into three tables
# (rows = smells, columns = change metrics):
#   corr_matrix - correlation coefficient, rounded to 3 decimals
#   pval_matrix - p-value, rounded to 4 decimals
#   sig_matrix  - 1 when the p-value is below 0.05, otherwise 0
rho_rows = []
p_rows = []
flag_rows = []
for smell in test_smell_columns:
    rho_row, p_row, flag_row = [], [], []
    for metric in change_metric_columns:
        # Pairwise deletion: keep only rows where both columns have a value.
        pair = df[[smell, metric]].dropna()
        if len(pair) <= 1:
            # Too few observations to correlate; recorded as undefined.
            rho, p_value = np.nan, np.nan
        else:
            rho, p_value = spearmanr(pair[smell], pair[metric])
        rho_row.append(rho)
        p_row.append(p_value)
        # A NaN p-value compares False here, so undefined results are
        # flagged 0 just like non-significant ones.
        flag_row.append(1 if p_value < 0.05 else 0)
    rho_rows.append(rho_row)
    p_rows.append(p_row)
    flag_rows.append(flag_row)

corr_matrix = pd.DataFrame(
    rho_rows, index=test_smell_columns, columns=change_metric_columns
).astype(float).round(3)
pval_matrix = pd.DataFrame(
    p_rows, index=test_smell_columns, columns=change_metric_columns
).astype(float).round(4)
sig_matrix = pd.DataFrame(
    flag_rows, index=test_smell_columns, columns=change_metric_columns
).astype(int)

# Write the coefficient table and the 0/1 significance table to CSV files
# in the current working directory (the p-value table is not written).
for table, csv_name in (
    (corr_matrix, 'spearman_correlations.csv'),
    (sig_matrix, 'significance_matrix.csv'),
):
    table.to_csv(csv_name)

# Correlation heatmap: diverging palette centred on zero, each cell
# annotated with its coefficient to three decimals. The figure is saved to
# PNG, shown inline, then sent to the browser as a download.
corr_fig, corr_ax = plt.subplots(figsize=(10, 12))
sns.heatmap(
    corr_matrix,
    ax=corr_ax,
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
)
corr_ax.set_title('Spearman Correlation Heatmap')
corr_fig.tight_layout()
corr_fig.savefig('correlation_heatmap.png')
plt.show()  # Render the figure in the notebook output
files.download('correlation_heatmap.png')  # Trigger the browser download

# Significance heatmap: integer 0/1 flags drawn without a colour bar.
# The figure is saved to PNG, shown inline, then sent to the browser as a
# download.
sig_fig, sig_ax = plt.subplots(figsize=(10, 12))
sns.heatmap(
    sig_matrix,
    ax=sig_ax,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
)
sig_ax.set_title('Significance Matrix (1 = Significant, 0 = Not Significant)')
sig_fig.tight_layout()
sig_fig.savefig('significance_heatmap.png')
plt.show()  # Render the figure in the notebook output
files.download('significance_heatmap.png')  # Trigger the browser download

# Send both CSV outputs to the browser via the Colab download helper.
for csv_name in ('spearman_correlations.csv', 'significance_matrix.csv'):
    files.download(csv_name)

# Closing summary of everything this cell produced, one line per print.
summary_lines = (
    "Analysis complete. Generated and downloaded files:",
    "- spearman_correlations.csv: Correlation values",
    "- significance_matrix.csv: Significance indicators (1=significant, 0=not significant)",
    "- correlation_heatmap.png: Correlation heatmap",
    "- significance_heatmap.png: Significance heatmap",
    "Heatmaps displayed above and files automatically downloaded.",
)
for summary_line in summary_lines:
    print(summary_line)