**Pearson's Chi-squared Test**

*Tests of independence between categorical variables: percolation clusters vs. POI clusters / affinity clusters*

# 1 Import packages

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, norm

# 2 Independence tests

In [3]:
# Combine the three clustering outputs into one table keyed by hexagon id.

# Load each result file and keep only the join key plus its cluster label.
percolation_df = pd.read_csv(
    '../2-percolation/hexagon_get_percolation_cluster_id/percolation_cluster_id.csv'
)[['gid_10', 'percolation_cluster_id']]
poi_clustering_df = pd.read_csv(
    '../3-profile_clustering/1-poi_clustering/step4-clustering&optimal_cluster_num/poi_clustering_results.csv'
)[['gid_10', 'poi_cluster_id']]
affinity_clustering_df = pd.read_csv(
    '../3-profile_clustering/2-affinity_clustering/step2-clustering/affinity_clustering_results.csv'
)[['gid_10', 'affinity_cluster_id']]

# Outer joins keep every gid_10 that appears in any of the three tables;
# cluster ids missing for a given gid_10 become NaN.
merged_df = (
    percolation_df
    .merge(poi_clustering_df, on='gid_10', how='outer')
    .merge(affinity_clustering_df, on='gid_10', how='outer')
)

print(merged_df.info())

# Persist the merged table for the cells below.
merged_df.to_csv('./all_clusters.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403921 entries, 0 to 403920
Data columns (total 4 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   gid_10                  403921 non-null  object 
 1   percolation_cluster_id  323256 non-null  float64
 2   poi_cluster_id          63548 non-null   float64
 3   affinity_cluster_id     82471 non-null   float64
dtypes: float64(3), object(1)
memory usage: 12.3+ MB
None


In [4]:
# Preview the first rows of the merged cluster table
merged_df.head()

Unnamed: 0,gid_10,percolation_cluster_id,poi_cluster_id,affinity_cluster_id
0,8a01261212affff,13696.0,,
1,8a012612175ffff,13696.0,,
2,8a0126182447fff,13696.0,,
3,8a0126182467fff,13696.0,,
4,8a012618246ffff,13696.0,,


In [None]:
import pandas as pd
import scipy.stats as stats

# Read the merged cluster table from './all_clusters.csv' (one row per gid_10)
df = pd.read_csv('./all_clusters.csv')

# Calculate chi-squared test results between percolation_cluster_id and another cluster column, excluding rows with missing values and keeping only percolation clusters with a sample size of 10 or more
def calculate_chi2_percolation(var1, var2, df):
    """Run Pearson's chi-squared independence test on two cluster columns.

    Rows with a missing value in ``var1`` or ``var2`` are dropped first. Of the
    remaining rows, only those whose ``percolation_cluster_id`` occurs at least
    10 times are kept. The test result is printed and also returned.

    Args:
        var1: Name of the first categorical column.
        var2: Name of the second categorical column.
        df: DataFrame containing ``var1``, ``var2`` and
            ``percolation_cluster_id``.

    Returns:
        A dict with the keys ``"Test"``, ``"Chi2 Statistic"``, ``"p-value"``,
        ``"Degrees of Freedom"`` and ``"Significance"``, or ``None`` when the
        test cannot be performed (no rows survive the filtering, or one of the
        variables has fewer than two categories left).
    """
    # Drop rows with missing values
    df_clean = df.dropna(subset=[var1, var2])

    # Keep only rows where the sample size for percolation_cluster_id is 10 or more
    cluster_sizes = df_clean.groupby('percolation_cluster_id')['percolation_cluster_id'].transform('count')
    df_filtered = df_clean[cluster_sizes >= 10]

    # If there are not enough samples after filtering, skip this test
    if df_filtered.empty:
        print(f"No percolation_cluster_id has 10 or more instances for the test between {var1} and {var2}. Skipping this test.")
        return None

    # Construct a contingency table
    contingency_table = pd.crosstab(df_filtered[var1], df_filtered[var2])

    # A table with a single row or column has zero degrees of freedom; SciPy
    # then returns chi2 = 0 and p = 1, which would be reported as a genuine
    # "not significant" finding although independence cannot be tested at all.
    if min(contingency_table.shape) < 2:
        print(f"Fewer than two categories remain in {var1} or {var2} after filtering. Skipping this test.")
        return None

    # Perform chi-squared test (the expected-frequency table is not needed)
    chi2, p, dof, _ = stats.chi2_contingency(contingency_table)

    # Determine significance at the 5% level
    is_significant = p < 0.05
    significance = "Significant" if is_significant else "Not Significant"

    # Output the test result
    print(f"Chi-squared test between {var1} and {var2}")
    print(f"Chi2 statistic: {chi2:.4f}, p-value: {p:.4f}, Degrees of Freedom: {dof}")
    if is_significant:
        print(f"The relationship between {var1} and {var2} is statistically significant (p < 0.05).")
    else:
        print(f"The relationship between {var1} and {var2} is not statistically significant (p >= 0.05).")
    print("\n")  # Separate the output of different tests

    # Return the result as a dictionary
    return {
        "Test": f"{var1} vs {var2}",
        "Chi2 Statistic": chi2,
        "p-value": p,
        "Degrees of Freedom": dof,
        "Significance": significance
    }

# Calculate chi-squared test results between poi_cluster_id and affinity_cluster_id
def calculate_chi2_general(var1, var2, df):
    """Run Pearson's chi-squared independence test on two categorical columns.

    Rows with a missing value in ``var1`` or ``var2`` are dropped; no
    sample-size filtering is applied. The test result is printed and also
    returned.

    Args:
        var1: Name of the first categorical column.
        var2: Name of the second categorical column.
        df: DataFrame containing ``var1`` and ``var2``.

    Returns:
        A dict with the keys ``"Test"``, ``"Chi2 Statistic"``, ``"p-value"``,
        ``"Degrees of Freedom"`` and ``"Significance"``, or ``None`` when the
        test cannot be performed (no complete rows, or one of the variables
        has fewer than two categories).
    """
    # Drop rows with missing values
    df_clean = df.dropna(subset=[var1, var2])

    # chi2_contingency raises ValueError on an empty table, so bail out early
    # and let the caller's `is not None` check handle it.
    if df_clean.empty:
        print(f"No rows have both {var1} and {var2}. Skipping this test.")
        return None

    # Construct a contingency table
    contingency_table = pd.crosstab(df_clean[var1], df_clean[var2])

    # A table with a single row or column has zero degrees of freedom; SciPy
    # then returns chi2 = 0 and p = 1, which would be reported as a genuine
    # "not significant" finding although independence cannot be tested at all.
    if min(contingency_table.shape) < 2:
        print(f"Fewer than two categories are present in {var1} or {var2}. Skipping this test.")
        return None

    # Perform chi-squared test (the expected-frequency table is not needed)
    chi2, p, dof, _ = stats.chi2_contingency(contingency_table)

    # Determine significance at the 5% level
    is_significant = p < 0.05
    significance = "Significant" if is_significant else "Not Significant"

    # Output the test result
    print(f"Chi-squared test between {var1} and {var2}")
    print(f"Chi2 statistic: {chi2:.4f}, p-value: {p:.4f}, Degrees of Freedom: {dof}")
    if is_significant:
        print(f"The relationship between {var1} and {var2} is statistically significant (p < 0.05).")
    else:
        print(f"The relationship between {var1} and {var2} is not statistically significant (p >= 0.05).")
    print("\n")  # Separate the output of different tests

    # Return the result as a dictionary
    return {
        "Test": f"{var1} vs {var2}",
        "Chi2 Statistic": chi2,
        "p-value": p,
        "Degrees of Freedom": dof,
        "Significance": significance
    }

# Run the three pairwise independence tests in a fixed order.

# Tests involving percolation_cluster_id (with the per-cluster sample-size filter)
result1 = calculate_chi2_percolation('percolation_cluster_id', 'poi_cluster_id', df)
result2 = calculate_chi2_percolation('percolation_cluster_id', 'affinity_cluster_id', df)

# Test not involving percolation_cluster_id: poi_cluster_id vs affinity_cluster_id
result3 = calculate_chi2_general('poi_cluster_id', 'affinity_cluster_id', df)

# Keep only the tests that actually produced a result
results = [outcome for outcome in (result1, result2, result3) if outcome is not None]

# Save whatever was computed to a CSV file, or report that nothing ran
if not results:
    print("No tests were performed due to insufficient sample sizes.")
else:
    results_df = pd.DataFrame(results)
    results_df.to_csv('./chi-square_test_results_add_volumn_check.csv', index=False)
    print("Chi-square test results have been saved to './chi-square_test_results_add_volumn_check.csv'")

# 3 Correlation index calculation

In [None]:
import pandas as pd
import scipy.stats as stats
import numpy as np

# Read the merged cluster table from './all_clusters.csv' (one row per gid_10)
df = pd.read_csv('./all_clusters.csv')

# Define a function to calculate Cramér's V and only retain rows where percolation_cluster_id has 10 or more samples
def calculate_cramers_v(var1, var2, df):
    """Compute Cramér's V between two categorical cluster columns.

    Rows with a missing value in ``var1`` or ``var2`` are dropped first. Of the
    remaining rows, only those whose ``percolation_cluster_id`` occurs at least
    10 times are kept. The value and a verbal strength label are printed and
    also returned.

    Args:
        var1: Name of the first categorical column.
        var2: Name of the second categorical column.
        df: DataFrame containing ``var1``, ``var2`` and
            ``percolation_cluster_id``.

    Returns:
        A dict with the keys ``"Test"``, ``"Cramér's V"`` and ``"Strength"``,
        or ``None`` when the statistic cannot be computed (no rows survive the
        filtering, or one of the variables has fewer than two categories left).
    """
    # Drop rows with missing values
    df_clean = df.dropna(subset=[var1, var2])

    # Only retain rows where percolation_cluster_id has 10 or more samples
    cluster_sizes = df_clean.groupby('percolation_cluster_id')['percolation_cluster_id'].transform('count')
    df_filtered = df_clean[cluster_sizes >= 10]

    # If there is not enough data after filtering, return None
    if df_filtered.empty:
        print(f"After filtering, there are no {var1} instances with 10 or more records for the test between {var1} and {var2}.")
        return None

    # Construct a contingency table
    contingency_table = pd.crosstab(df_filtered[var1], df_filtered[var2])

    # Use the smaller dimension minus 1 as the normaliser
    min_dim = min(contingency_table.shape) - 1

    # With a single row or column the normaliser is 0 and V is undefined
    # (0 / 0 -> NaN). NaN fails every `<` comparison below, so without this
    # guard the association would be mislabelled as "Strong".
    if min_dim < 1:
        print(f"Cramér's V between {var1} and {var2} is undefined: fewer than two categories remain in one of the variables.")
        return None

    # Calculate chi-squared statistic.
    # NOTE: chi2_contingency applies Yates' continuity correction by default
    # when the table is 2x2, which slightly lowers V in that case.
    chi2, _, _, _ = stats.chi2_contingency(contingency_table)

    # Calculate Cramér's V
    n = contingency_table.sum().sum()  # Total number of samples
    cramers_v = np.sqrt(chi2 / (n * min_dim))

    # Determine the strength of the association
    if cramers_v < 0.1:
        strength = "None or very weak"
    elif cramers_v < 0.3:
        strength = "Weak"
    elif cramers_v < 0.5:
        strength = "Moderate"
    else:
        strength = "Strong"

    print(f"Cramér's V between {var1} and {var2}: {cramers_v:.4f} (Strength: {strength})")
    return {
        "Test": f"{var1} vs {var2}",
        "Cramér's V": cramers_v,
        "Strength": strength
    }

# Compute Cramér's V for both percolation pairings, in a fixed order.

# percolation_cluster_id vs poi_cluster_id
cramers_v1 = calculate_cramers_v('percolation_cluster_id', 'poi_cluster_id', df)

# percolation_cluster_id vs affinity_cluster_id
cramers_v2 = calculate_cramers_v('percolation_cluster_id', 'affinity_cluster_id', df)

# Keep only the pairings that actually produced a value
results = [entry for entry in (cramers_v1, cramers_v2) if entry is not None]

# Save whatever was computed to a CSV file, or report that there is nothing to save
if not results:
    print("No results to save.")
else:
    results_df = pd.DataFrame(results)
    results_df.to_csv('./cramers_v_results.csv', index=False)
    print("Cramér's V results have been saved to './cramers_v_results.csv'")