In [None]:
# Per-sample transcript tables that are combined into one frame below
import pandas as pd

csv_files = [
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen10_3723_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen10_3946_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen12_3990_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen12_KPMP_038_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen7_3916_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen4_3781_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen5_3916_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen6_3781_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen17_3612_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen16_3609_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen13_3782_transcript.csv",
    "/storage2/fs1/sanjayjain/Active/Asmita/Xen21_KPMP_057.csv"
]

# Load every table and tag its rows with the source file name (the last path
# component, extension included) in a "Sample_ID" column.
dataframes = [
    pd.read_csv(csv_path).assign(Sample_ID=csv_path.split("/")[-1])
    for csv_path in csv_files
]

# Stack all samples into one frame with a fresh 0..n-1 index
merged_df = pd.concat(dataframes, ignore_index=True)

# Display first few rows
#print(merged_df.head())

In [None]:
# Inspect the concatenated table of all samples
merged_df

In [None]:
# Strip the literal "_transcript.csv" suffix so Sample_ID is just the sample name.
# NOTE(review): "Xen21_KPMP_057.csv" has no "_transcript" part, so its Sample_ID
# keeps the ".csv" extension; the sample_info tables further down use that same spelling.
merged_df["Sample_ID"] = merged_df["Sample_ID"].str.replace("_transcript.csv", "", regex=False)

In [None]:
# Second name for the same DataFrame object (no copy is made): in-place changes
# made through cell_feature_df are also visible through merged_df.
cell_feature_df = merged_df

In [None]:
# Inspect the working per-cell table
cell_feature_df

In [None]:
# Keep rows whose "group" label contains the substring "PT"; rows with a missing
# label are dropped (na=False). Substring matching also keeps any label that
# merely contains "PT", such as the "PTC" groups removed in the next cell.
pt_df = cell_feature_df[cell_feature_df["group"].str.contains("PT", na=False)] 
pt_df

In [None]:
# Remove groups whose label contains "PTC", which the "PT" substring filter let through
pt_df = pt_df[~pt_df['group'].str.contains('PTC', na=False)]

In [None]:
# Contingency table: number of rows for each (Sample_ID, group) pair
new_pt_df = pd.crosstab(pt_df['Sample_ID'], pt_df['group'])

In [None]:
# Row total across the group columns. An existing "Total_PT" column is left out
# of the sum so that re-running this cell does not add the old total into itself.
new_pt_df["Total_PT"] = new_pt_df.drop(columns="Total_PT", errors="ignore").sum(axis=1)
new_pt_df

In [None]:
import pandas as pd

# Work on the PT-only subset built in the cells above
df = pt_df

# Per-sample total of "feature_count", as a two-column frame
# (Sample_ID, total_feature_count)
df_summary = (
    df.groupby("Sample_ID", as_index=False)["feature_count"]
    .sum()
    .rename(columns={"feature_count": "total_feature_count"})
)

# Display the new DataFrame
#print(df_summary)

## Altered PT

In [None]:
def create_gene_count_df(cell_feature_df, sample_info, subset_pattern="aPT",
                         exclude_pattern="PTC", min_count=2):
    """
    Summarise SOX4- and PCK1-positive cells per sample for a subset of PT cells.

    PT cells are rows whose 'group' label contains "PT" but does not match
    `exclude_pattern` (by default "PTC", the same exclusion the notebook applies
    when it builds its top-level PT table). Within those, the analysed subset is
    the rows whose 'group' matches `subset_pattern` (default "aPT").
    A cell is positive for a gene when the gene occurs more than `min_count`
    times in its 'feature_names' value.

    Parameters:
    - cell_feature_df: DataFrame with columns ['Sample_ID', 'group', 'feature_names', 'cell_id']
    - sample_info: DataFrame with columns ['Sample_ID', 'Category']
    - subset_pattern: regex passed to str.contains to pick the analysed groups
    - exclude_pattern: regex for 'group' labels that contain "PT" but must not be
      treated as PT cells; pass "" or None to disable the exclusion
    - min_count: a gene must occur strictly more often than this to count as positive

    Returns:
    - merged_df: one row per sample present in the subset, with columns
      Sample_ID, total_PCK1_cells, SOX4_cells, SOX4_cell_ids, PCK1_cell_ids,
      total_PT_cells, total_PT_cellids, plus the columns of sample_info
    - altered_df: the cell-level subset, annotated with SOX4_count, PCK1_count,
      SOX4_positive and PCK1_positive
    """
    import pandas as pd

    def _count_gene(features, gene):
        # str.count counts substring hits and list/tuple.count counts equal items;
        # anything else (e.g. a missing value) is treated as zero occurrences.
        if isinstance(features, (str, list, tuple)):
            return features.count(gene)
        return 0

    # 1) PT cells: label contains "PT" and is not an excluded group
    groups = cell_feature_df["group"]
    is_pt = groups.str.contains("PT", na=False)
    if exclude_pattern:
        is_pt &= ~groups.str.contains(exclude_pattern, na=False)
    pt_df = cell_feature_df[is_pt].copy()

    # 2) Subset of PT cells that is analysed
    altered_df = pt_df[pt_df["group"].str.contains(subset_pattern, na=False)].copy()

    # 3) Per-cell gene counts, then positivity flags (count > min_count)
    genes = ("SOX4", "PCK1")
    for gene in genes:
        altered_df[f"{gene}_count"] = altered_df["feature_names"].apply(
            lambda features: _count_gene(features, gene)
        )
    for gene in genes:
        altered_df[f"{gene}_positive"] = altered_df[f"{gene}_count"] > min_count

    def _positive_ids(flag_column):
        # Aggregator: cell_ids of the rows in one sample whose flag is True
        return lambda ids: list(ids[altered_df.loc[ids.index, flag_column]])

    # 4) Per-sample positive-cell counts and the matching cell IDs.
    #    reset_index makes Sample_ID an ordinary column before the merges below.
    summary_df = altered_df.groupby("Sample_ID").agg(
        total_PCK1_cells=("PCK1_positive", "sum"),
        SOX4_cells=("SOX4_positive", "sum"),
        SOX4_cell_ids=("cell_id", _positive_ids("SOX4_positive")),
        PCK1_cell_ids=("cell_id", _positive_ids("PCK1_positive")),
    ).reset_index()

    # 5) Total PT cells per sample
    total_pt_cells_df = pt_df.groupby("Sample_ID").agg(
        total_PT_cells=("Sample_ID", "count")
    ).reset_index()

    # 6) All PT cell IDs per sample
    pt_cell_ids_df = pt_df.groupby("Sample_ID").agg(
        total_PT_cellids=("cell_id", list)
    ).reset_index()

    # 7-9) Attach the PT totals, the PT cell-ID lists and the sample labels
    summary_df = summary_df.merge(total_pt_cells_df, on="Sample_ID", how="left")
    summary_df = summary_df.merge(pt_cell_ids_df, on="Sample_ID", how="left")
    merged_df = summary_df.merge(sample_info, on="Sample_ID", how="left")

    return merged_df, altered_df


In [None]:
# Sample -> condition lookup. "Xen21_KPMP_057.csv" keeps its extension because
# that is the Sample_ID this file ends up with after loading.
sample_categories = {
    'Xen10_3723': 'Ref',
    'Xen10_3946': 'CKD',
    'Xen12_3990': 'CKD',
    'Xen12_KPMP_038': 'Ref',
    'Xen13_3782': 'AKI',
    'Xen16_3609': 'CKD',
    'Xen17_3612': 'CKD',
    'Xen4_3781': 'Ref',
    'Xen5_3916': 'CKD',
    'Xen6_3781': 'Ref',
    'Xen7_3916': 'CKD',
    'Xen21_KPMP_057.csv': 'Ref',
}
sample_info = pd.DataFrame(
    list(sample_categories.items()), columns=['Sample_ID', 'Category']
)

In [None]:
# Build the per-sample summary (new_df) and the annotated cell-level subset (altered_df)
new_df, altered_df = create_gene_count_df(cell_feature_df, sample_info)


In [None]:
import pandas as pd

# Work on the cell-level subset returned by create_gene_count_df
df = altered_df

# Per-sample total of "feature_count", as a two-column frame
# (Sample_ID, total_feature_count)
df_summary = (
    df.groupby("Sample_ID", as_index=False)["feature_count"]
    .sum()
    .rename(columns={"feature_count": "total_feature_count"})
)

# Display the new DataFrame
#print(df_summary)


In [None]:
# List the columns of the per-sample summary
new_df.columns

In [None]:
import pandas as pd

# Make sure feature_count is numeric; values that cannot be parsed become NaN,
# which .sum() skips. This writes into cell_feature_df in place.
cell_feature_df["feature_count"] = pd.to_numeric(cell_feature_df["feature_count"], errors="coerce")

def sum_feature_counts(cell_id_list, sample_id=None):
    """
    Total feature_count over the rows of cell_feature_df whose cell_id is listed.

    Parameters:
    - cell_id_list: list of cell IDs; any non-list value (e.g. NaN) yields 0
    - sample_id: when given, only rows of that Sample_ID are considered. The
      merged table stacks several samples, so a cell_id may occur in more than
      one of them; without this restriction those rows would all be summed.

    Returns:
    - the summed feature_count (0 when nothing matches)
    """
    if not isinstance(cell_id_list, list):
        return 0
    selected = cell_feature_df["cell_id"].isin(cell_id_list)
    if sample_id is not None:
        selected &= cell_feature_df["Sample_ID"] == sample_id
    return cell_feature_df.loc[selected, "feature_count"].sum()

# One total per summary row, restricted to that row's own sample
for col in ["PCK1_cell_ids", "SOX4_cell_ids"]:
    new_df[f"{col}_total_features"] = [
        sum_feature_counts(cell_ids, sample_id)
        for cell_ids, sample_id in zip(new_df[col], new_df["Sample_ID"])
    ]

# Display the updated summary
#print(new_df.head())


In [None]:
# Inspect the per-sample summary
new_df

In [None]:
import pandas as pd
# Exclude samples that are not used in the comparison
# Start from the per-sample summary
df = new_df

# Drop three samples: Xen12_KPMP_038 (Ref), Xen13_3782 (AKI) and Xen16_3609 (CKD)
df_filtered = df[~df["Sample_ID"].isin(["Xen12_KPMP_038", "Xen13_3782" , "Xen16_3609"])]

# Keep every remaining Ref sample and the first three remaining CKD rows.
# NOTE(review): the earlier comment said "any 4 CKD samples", but .iloc[:3] keeps
# three, chosen by row position in df_filtered — confirm the intended number and samples.
ref_samples = df_filtered[df_filtered["Category"] == "Ref"]
ckd_samples = df_filtered[df_filtered["Category"] == "CKD"].iloc[:3]  # first 3 CKD rows by position

In [None]:
# Inspect the selected Ref rows
ref_samples

In [None]:
# Inspect the selected CKD rows
ckd_samples

In [None]:
# Keep the altered-PT selections under their own names; ref_samples and
# ckd_samples are rebound later by the healthy-PT pass.
# NOTE(review): the original note mentioned aPT, dPT and frPT, but the
# create_gene_count_df defined above filters on "aPT" only — confirm which is intended.
ref_altered_samples = ref_samples
ckd_altered_samples = ckd_samples

## Healthy PT

In [None]:
def create_gene_count_df(cell_feature_df, sample_info,
                         healthy_groups=("PT", "PT-S1", "PT-S1/2", "PT-S3"),
                         exclude_pattern="PTC", min_count=2):
    """
    Summarise SOX4- and PCK1-positive cells per sample for healthy PT cells.

    PT cells are rows whose 'group' label contains "PT" but does not match
    `exclude_pattern` (by default "PTC", the same exclusion the notebook applies
    when it builds its top-level PT table). The analysed subset is the PT rows
    whose 'group' label is exactly one of `healthy_groups`. Exact matching is
    used because a substring/regex test for "PT" is also true for labels such
    as "aPT", which would mix altered cells into the healthy subset.
    A cell is positive for a gene when the gene occurs more than `min_count`
    times in its 'feature_names' value.

    Parameters:
    - cell_feature_df: DataFrame with columns ['Sample_ID', 'group', 'feature_names', 'cell_id']
    - sample_info: DataFrame with columns ['Sample_ID', 'Category']
    - healthy_groups: iterable of 'group' labels that make up the healthy subset
      (NOTE(review): defaults are the labels named in the original filter —
      confirm they match the labels present in the data)
    - exclude_pattern: regex for 'group' labels that contain "PT" but must not be
      treated as PT cells; pass "" or None to disable the exclusion
    - min_count: a gene must occur strictly more often than this to count as positive

    Returns:
    - merged_df: one row per sample present in the subset, with columns
      Sample_ID, total_PCK1_cells, SOX4_cells, SOX4_cell_ids, PCK1_cell_ids,
      total_PT_cells, total_PT_cellids, plus the columns of sample_info
    - healthy_df: the cell-level subset, annotated with SOX4_count, PCK1_count,
      SOX4_positive and PCK1_positive
    """
    import pandas as pd

    def _count_gene(features, gene):
        # str.count counts substring hits and list/tuple.count counts equal items;
        # anything else (e.g. a missing value) is treated as zero occurrences.
        if isinstance(features, (str, list, tuple)):
            return features.count(gene)
        return 0

    # 1) PT cells: label contains "PT" and is not an excluded group
    groups = cell_feature_df["group"]
    is_pt = groups.str.contains("PT", na=False)
    if exclude_pattern:
        is_pt &= ~groups.str.contains(exclude_pattern, na=False)
    pt_df = cell_feature_df[is_pt].copy()

    # 2) Healthy subset: exact label match only
    healthy_df = pt_df[pt_df["group"].isin(list(healthy_groups))].copy()

    # 3) Per-cell gene counts, then positivity flags (count > min_count)
    genes = ("SOX4", "PCK1")
    for gene in genes:
        healthy_df[f"{gene}_count"] = healthy_df["feature_names"].apply(
            lambda features: _count_gene(features, gene)
        )
    for gene in genes:
        healthy_df[f"{gene}_positive"] = healthy_df[f"{gene}_count"] > min_count

    def _positive_ids(flag_column):
        # Aggregator: cell_ids of the rows in one sample whose flag is True
        return lambda ids: list(ids[healthy_df.loc[ids.index, flag_column]])

    # 4) Per-sample positive-cell counts and the matching cell IDs.
    #    reset_index makes Sample_ID an ordinary column before the merges below.
    summary_df = healthy_df.groupby("Sample_ID").agg(
        total_PCK1_cells=("PCK1_positive", "sum"),
        SOX4_cells=("SOX4_positive", "sum"),
        SOX4_cell_ids=("cell_id", _positive_ids("SOX4_positive")),
        PCK1_cell_ids=("cell_id", _positive_ids("PCK1_positive")),
    ).reset_index()

    # 5) Total PT cells per sample
    total_pt_cells_df = pt_df.groupby("Sample_ID").agg(
        total_PT_cells=("Sample_ID", "count")
    ).reset_index()

    # 6) All PT cell IDs per sample
    pt_cell_ids_df = pt_df.groupby("Sample_ID").agg(
        total_PT_cellids=("cell_id", list)
    ).reset_index()

    # 7-9) Attach the PT totals, the PT cell-ID lists and the sample labels
    summary_df = summary_df.merge(total_pt_cells_df, on="Sample_ID", how="left")
    summary_df = summary_df.merge(pt_cell_ids_df, on="Sample_ID", how="left")
    merged_df = summary_df.merge(sample_info, on="Sample_ID", how="left")

    return merged_df, healthy_df


In [None]:
# Sample -> condition lookup. "Xen21_KPMP_057.csv" keeps its extension because
# that is the Sample_ID this file ends up with after loading.
sample_categories = {
    'Xen10_3723': 'Ref',
    'Xen10_3946': 'CKD',
    'Xen12_3990': 'CKD',
    'Xen12_KPMP_038': 'Ref',
    'Xen13_3782': 'AKI',
    'Xen16_3609': 'CKD',
    'Xen17_3612': 'CKD',
    'Xen4_3781': 'Ref',
    'Xen5_3916': 'CKD',
    'Xen6_3781': 'Ref',
    'Xen7_3916': 'CKD',
    'Xen21_KPMP_057.csv': 'Ref',
}
sample_info = pd.DataFrame(
    list(sample_categories.items()), columns=['Sample_ID', 'Category']
)

In [None]:
# Re-run the summary with the create_gene_count_df defined for healthy PT.
# This rebinds new_df and altered_df; despite its name, altered_df now holds
# the subset selected by that healthy-PT definition.
new_df, altered_df = create_gene_count_df(cell_feature_df, sample_info)

In [None]:
import pandas as pd

# Work on the cell-level subset returned by create_gene_count_df
df = altered_df

# Per-sample total of "feature_count", as a two-column frame
# (Sample_ID, total_feature_count)
df_summary = (
    df.groupby("Sample_ID", as_index=False)["feature_count"]
    .sum()
    .rename(columns={"feature_count": "total_feature_count"})
)

# Display the new DataFrame
#print(df_summary)

In [None]:
import pandas as pd

# Make sure feature_count is numeric; values that cannot be parsed become NaN,
# which .sum() skips. This writes into cell_feature_df in place.
cell_feature_df["feature_count"] = pd.to_numeric(cell_feature_df["feature_count"], errors="coerce")

def sum_feature_counts(cell_id_list, sample_id=None):
    """
    Total feature_count over the rows of cell_feature_df whose cell_id is listed.

    Parameters:
    - cell_id_list: list of cell IDs; any non-list value (e.g. NaN) yields 0
    - sample_id: when given, only rows of that Sample_ID are considered. The
      merged table stacks several samples, so a cell_id may occur in more than
      one of them; without this restriction those rows would all be summed.

    Returns:
    - the summed feature_count (0 when nothing matches)
    """
    if not isinstance(cell_id_list, list):
        return 0
    selected = cell_feature_df["cell_id"].isin(cell_id_list)
    if sample_id is not None:
        selected &= cell_feature_df["Sample_ID"] == sample_id
    return cell_feature_df.loc[selected, "feature_count"].sum()

# One total per summary row, restricted to that row's own sample
for col in ["PCK1_cell_ids", "SOX4_cell_ids"]:
    new_df[f"{col}_total_features"] = [
        sum_feature_counts(cell_ids, sample_id)
        for cell_ids, sample_id in zip(new_df[col], new_df["Sample_ID"])
    ]

# Display the updated summary
print(new_df.head())

In [None]:
import pandas as pd

# Start from the per-sample summary
df = new_df

# Drop three samples: Xen12_KPMP_038 (Ref), Xen13_3782 (AKI) and Xen16_3609 (CKD)
df_filtered = df[~df["Sample_ID"].isin(["Xen12_KPMP_038", "Xen13_3782" , "Xen16_3609"])]

# Keep every remaining Ref sample and the first three remaining CKD rows.
# NOTE(review): the earlier comment said "any 4 CKD samples", but .iloc[:3] keeps
# three, chosen by row position in df_filtered — confirm the intended number and samples.
ref_samples = df_filtered[df_filtered["Category"] == "Ref"]
ckd_samples = df_filtered[df_filtered["Category"] == "CKD"].iloc[:3]  # first 3 CKD rows by position

In [None]:
# Keep the selections from the healthy-PT pass (groups PT / PT-S1 / PT-S1/2 / PT-S3
# per the filter in create_gene_count_df) under their own names
ref_healthy_samples = ref_samples
ckd_healthy_samples = ckd_samples

In [None]:
# Inspect the CKD rows from the healthy-PT pass
ckd_healthy_samples

In [None]:
# Inspect the CKD rows from the altered-PT pass
ckd_altered_samples

In [None]:
# Inspect the Ref rows from the healthy-PT pass
ref_healthy_samples

In [None]:
# Inspect the Ref rows from the altered-PT pass
ref_altered_samples

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

def plot_per_sample_dots(gene_label,
                         ref_healthy_df, ref_altered_df,
                         ckd_healthy_df, ckd_altered_df,
                         save_path=None, dpi=300,
                         group_labels=('REF', 'CKD')):
    """
    Boxplot + one dot per sample for two groups of samples (REF vs CKD by default).

    The first group is built from ref_healthy_df + ref_altered_df and the second
    from ckd_healthy_df + ckd_altered_df: each pair is concatenated and the
    chosen column is summed per Sample_ID. The two groups are compared with a
    two-sided Welch t-test; the figure is shown and summary statistics are printed.

    gene_label : 'SOX4' or 'PCK1'
    ref_healthy_df, ref_altered_df, ckd_healthy_df, ckd_altered_df : pd.DataFrame
        Must each have columns ['Sample_ID', <gene_col>] where:
          gene_col = 'SOX4_cells' or 'total_PCK1_cells'
    save_path : str or None
        When given, the figure is also written to this path.
    dpi : int
    group_labels : pair of str
        Names used for the x tick labels, legend labels and printed report.

    Raises ValueError for an unknown gene_label or when either group has no samples.
    """
    from scipy.stats import sem  # Standard Error of the Mean

    # pick the right summary column
    col = {'SOX4':'SOX4_cells','PCK1':'total_PCK1_cells'}.get(gene_label)
    if col is None:
        raise ValueError("gene_label must be 'SOX4' or 'PCK1'")
    first_label, second_label = group_labels

    def _per_sample_totals(healthy_df, altered_df):
        # stack the two frames and sum the gene column per sample
        stacked = pd.concat([healthy_df, altered_df])
        return stacked.groupby('Sample_ID')[col].sum().values

    ref_vals = _per_sample_totals(ref_healthy_df, ref_altered_df)
    ckd_vals = _per_sample_totals(ckd_healthy_df, ckd_altered_df)
    if len(ref_vals) == 0 or len(ckd_vals) == 0:
        # fail early with a clear message instead of inside the min/max below
        raise ValueError("each group needs at least one sample to plot")

    # perform two-sided Welch's t-test
    t_stat, pval = ttest_ind(ref_vals, ckd_vals, equal_var=False, nan_policy='omit')

    # begin plotting
    fig, ax = plt.subplots(figsize=(3,6))
    ax.grid(False)
    for spine in ['top','right']:
        ax.spines[spine].set_visible(False)

    # skinny boxplot
    ax.boxplot(
        [ref_vals, ckd_vals],
        widths=0.2,
        showfliers=False,
        patch_artist=True,
        boxprops=dict(facecolor='white', edgecolor='black')
    )

    # one dot per sample; the raw per-sample values are echoed for reference
    ax.scatter(np.ones_like(ref_vals), ref_vals,
               color='Blue', s=30, alpha=0.8, label=f'{first_label} samples')
    print(ref_vals)
    ax.scatter(np.full_like(ckd_vals,2), ckd_vals,
               color='Orange', s=30, alpha=0.8, label=f'{second_label} samples')
    print(ckd_vals)

    # significance bar, placed just above the highest point
    y_max = max(np.nanmax(ref_vals), np.nanmax(ckd_vals))
    y_min = min(np.nanmin(ref_vals), np.nanmin(ckd_vals))
    h = (y_max - y_min) * 0.05
    if not h > 0:
        # all values equal: fall back to a height based on the value itself so
        # the bracket does not collapse onto the data
        h = 0.05 * (abs(y_max) or 1.0)
    y = y_max + h
    ax.plot([1,1,2,2], [y, y+h, y+h, y], lw=1.5, color='black')
    if pval < 0.001: star='***'
    elif pval < 0.01: star='**'
    elif pval < 0.05: star='*'
    else: star='ns'
    ax.text(1.5, y + h, star, ha='center', va='bottom', fontsize=14)

    # labels
    ax.set_xticks([1,2])
    ax.set_xticklabels([first_label, second_label], fontweight='bold')
    ax.set_ylabel(f'{gene_label}+ve PT cells', fontweight='bold')

    plt.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
        print(f"Saved figure to {save_path} at {dpi} dpi")

    plt.show()

    # report stats
    print(f"n_{first_label} = {len(ref_vals)}, n_{second_label} = {len(ckd_vals)}")
    print(f"t-statistic = {t_stat:.3f}, two-tailed p = {pval:.3e}")

    # descriptive stats; NaNs are skipped everywhere, matching nan_policy='omit'
    for label, vals in ((first_label, ref_vals), (second_label, ckd_vals)):
        mean = np.nanmean(vals)
        sd = np.nanstd(vals, ddof=1)          # sample SD
        se = sem(vals, nan_policy='omit')     # SEM
        print(f"{label}: mean = {mean:.2f}, SD = {sd:.2f}, SEM = {se:.2f}")


In [None]:

# SOX4-positive PT cells per sample, Ref vs CKD; the healthy-PT and altered-PT
# summaries of each cohort are summed per sample inside the function.
plot_per_sample_dots(
    gene_label='SOX4',
    ref_healthy_df=ref_healthy_samples,
    ref_altered_df=ref_altered_samples,
    ckd_healthy_df=ckd_healthy_samples,
    ckd_altered_df=ckd_altered_samples,
    save_path="SOX4_final.png",
    dpi=300,
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

def plot_per_sample_dots(gene_label,
                         ref_healthy_df, ref_altered_df,
                         ckd_healthy_df, ckd_altered_df,
                         save_path=None, dpi=300,
                         group_labels=('Healthy', 'Altered')):
    """
    Boxplot + one dot per sample for two groups of samples.

    The first plotted group is built from the first two frames
    (ref_healthy_df + ref_altered_df) and the second from the last two
    (ckd_healthy_df + ckd_altered_df): each pair is concatenated and the chosen
    column is summed per Sample_ID. The groups are compared with a two-sided
    Welch t-test. The axis is labelled with `group_labels`, which default to
    'Healthy' / 'Altered'; the caller decides which frames go into which pair,
    so the parameter names do not by themselves determine what is compared.

    gene_label : 'SOX4' or 'PCK1'
    ref_healthy_df, ref_altered_df, ckd_healthy_df, ckd_altered_df : pd.DataFrame
        Must each have columns ['Sample_ID', <gene_col>] where:
          gene_col = 'SOX4_cells' or 'total_PCK1_cells'
    save_path : str or None
        When given, the figure is also written to this path.
    dpi : int
    group_labels : pair of str
        Names used for the x tick labels, legend labels and printed report, so
        that the report always agrees with the figure.

    Raises ValueError for an unknown gene_label or when either group has no samples.
    """
    # pick the right summary column
    col = {'SOX4':'SOX4_cells','PCK1':'total_PCK1_cells'}.get(gene_label)
    if col is None:
        raise ValueError("gene_label must be 'SOX4' or 'PCK1'")
    first_label, second_label = group_labels

    def _per_sample_totals(frame_a, frame_b):
        # stack the two frames and sum the gene column per sample
        stacked = pd.concat([frame_a, frame_b])
        return stacked.groupby('Sample_ID')[col].sum().values

    first_vals = _per_sample_totals(ref_healthy_df, ref_altered_df)
    second_vals = _per_sample_totals(ckd_healthy_df, ckd_altered_df)
    if len(first_vals) == 0 or len(second_vals) == 0:
        # fail early with a clear message instead of inside the min/max below
        raise ValueError("each group needs at least one sample to plot")

    # perform two-sided Welch's t-test
    t_stat, pval = ttest_ind(first_vals, second_vals, equal_var=False, nan_policy='omit')

    # begin plotting
    fig, ax = plt.subplots(figsize=(4,6))
    ax.grid(False)
    for spine in ['top','right']:
        ax.spines[spine].set_visible(False)

    # skinny boxplot
    ax.boxplot(
        [first_vals, second_vals],
        widths=0.2,
        showfliers=False,
        patch_artist=True,
        boxprops=dict(facecolor='white', edgecolor='black')
    )

    # one dot per sample; the raw per-sample values are echoed for reference
    ax.scatter(np.ones_like(first_vals), first_vals,
               color='Blue', s=30, alpha=0.8, label=f'{first_label} samples')
    print(first_vals)
    ax.scatter(np.full_like(second_vals,2), second_vals,
               color='Orange', s=30, alpha=0.8, label=f'{second_label} samples')
    print(second_vals)

    # significance bar, placed just above the highest point
    y_max = max(np.nanmax(first_vals), np.nanmax(second_vals))
    y_min = min(np.nanmin(first_vals), np.nanmin(second_vals))
    h = (y_max - y_min) * 0.05
    if not h > 0:
        # all values equal: fall back to a height based on the value itself so
        # the bracket does not collapse onto the data
        h = 0.05 * (abs(y_max) or 1.0)
    y = y_max + h
    ax.plot([1,1,2,2], [y, y+h, y+h, y], lw=1.5, color='black')
    if pval < 0.001: star='***'
    elif pval < 0.01: star='**'
    elif pval < 0.05: star='*'
    else: star='ns'
    ax.text(1.5, y + h, star, ha='center', va='bottom', fontsize=14)

    # labels
    ax.set_xticks([1,2])
    ax.set_xticklabels([first_label, second_label], fontweight='bold')
    ax.set_ylabel(f'{gene_label}+ve PT cells', fontweight='bold')

    plt.tight_layout()

    if save_path:
        fig.savefig(save_path, dpi=dpi, bbox_inches='tight')
        print(f"Saved figure to {save_path} at {dpi} dpi")

    plt.show()

    # report stats, using the same group names as the figure
    print(f"n_{first_label} = {len(first_vals)}, n_{second_label} = {len(second_vals)}")
    print(f"t-statistic = {t_stat:.3f}, two-tailed p = {pval:.3e}")


In [None]:
# PCK1-positive PT cells per sample. The two healthy-PT summaries (Ref and CKD)
# fill the first pair of slots and the two altered-PT summaries fill the second
# pair, so the plotted groups are healthy PT vs altered PT, not Ref vs CKD.
# NOTE(review): the output file name still says "ref_vs_ckd" — confirm it is intended.
plot_per_sample_dots(
    gene_label='PCK1',
    ref_healthy_df=ref_healthy_samples,
    ref_altered_df=ckd_healthy_samples,
    ckd_healthy_df=ref_altered_samples,
    ckd_altered_df=ckd_altered_samples,
    save_path='PCK_PT_ref_vs_ckd.png',
    dpi=300,
)