# Global Analysis of Fragment Size

# Peak Fragment Length Comparison between Cancer and Control


# Build one row per sample holding the midpoint of its most populated
# fragment-length bin (the "peak length"), then report the mean peak length
# per group.
tsv_dir = os.path.expanduser('/labmed/workspace/lotta/finaletoolkit/output_workflow')
results = []

for sample in all_samples:
    # The TSV may sit in any sub-directory of tsv_dir, hence the recursive glob.
    tsv_path = os.path.join(tsv_dir, '**', f"{sample}.frag_length_bins.tsv")
    files = glob.glob(tsv_path, recursive=True)
    if not files:
        print(f"TSV file for sample {sample} not found.")
        continue
    df = pd.read_csv(files[0], sep="\t")
    if df.empty:
        # idxmax() raises ValueError on an empty column, so a header-only
        # file is reported and skipped instead of aborting the whole loop.
        print(f"TSV file for sample {sample} contains no bins.")
        continue
    # Row of the bin with the highest count; its midpoint is the peak length.
    peak_index = df['count'].idxmax()
    peak_row = df.loc[peak_index]
    peak_length = (peak_row['min'] + peak_row['max']) / 2
    group = 'cancer' if sample in cancer_samples else 'control'
    results.append({'sample': sample, 'group': group, 'peak_length': peak_length, 'cancer_type': get_cancer_type(sample)})

# Passing the column names explicitly keeps the frame well-formed (and the
# later column lookups valid) even when no sample produced a row.
results_frag_length_df = pd.DataFrame(
    results, columns=['sample', 'group', 'peak_length', 'cancer_type']
)
if results_frag_length_df.empty:
    group_summary = pd.DataFrame(columns=['group', 'peak_length'])
else:
    group_summary = results_frag_length_df.groupby('group')['peak_length'].mean().reset_index()
print("Mean Peak Lengths:")
print(group_summary)

# Visualisation of Results: Cancer vs. Control, Line Plot

In [None]:
# Average the per-bin fragment counts over all samples of each group and plot
# the two averaged distributions, marking each group's mean peak length.
tsv_dir = os.path.expanduser('/labmed/workspace/lotta/finaletoolkit/output_workflow')

groups = ['cancer', 'control']
avg_freq = {}

for group in groups:
    group_samples = results_frag_length_df[results_frag_length_df['group'] == group]['sample'].tolist()

    # Accumulated across all samples of the group, so it must live outside
    # the per-sample loop.
    combined_counts = {}
    # Number of samples whose TSV was actually read; this is the divisor for
    # the average. Using len(group_samples) would bias the average low
    # whenever a file is missing and the sample is skipped.
    loaded_samples = 0

    for sample in group_samples:
        tsv_path = os.path.join(tsv_dir, '**', f"{sample}.frag_length_bins.tsv")
        files = glob.glob(tsv_path, recursive=True)
        if not files:
            continue
        df = pd.read_csv(files[0], sep="\t")
        loaded_samples += 1

        # Bin midpoint is used as the x position of the bin.
        frag_mid = (df['min'] + df['max']) / 2

        for frag, count in zip(frag_mid, df['count']):
            combined_counts[frag] = combined_counts.get(frag, 0) + count

    # Average counts over the number of samples that contributed data.
    if loaded_samples:
        for frag in combined_counts:
            combined_counts[frag] /= loaded_samples
    # dtype=float keeps an empty group usable by the plotting code below.
    avg_freq[group] = pd.Series(combined_counts, dtype=float).sort_index()


plt.figure(figsize=(12,6))

for group, color in zip(groups, ['red', 'blue']):
    x = avg_freq[group].index.to_numpy(dtype=float)
    y = avg_freq[group].astype(float).to_numpy()

    plt.plot(x, y, label=f"{group} avg", color=color)

    # Dashed line: mean of the per-sample peak lengths of this group.
    peak = results_frag_length_df[results_frag_length_df['group'] == group]['peak_length'].mean()
    peak = float(peak)

    plt.axvline(peak, linestyle='--', color=color, alpha=0.7, label=f"{group} peak")


plt.xlabel("Fragment Length (bp)")
plt.ylabel("Average Count")
plt.title("Fragment Length Distribution: Cancer vs Control")
plt.legend()
plt.tight_layout()
plt.show()


# Visualisation of Results: Cancer vs. Control, Boxplot

In [None]:
# Boxplot of per-sample peak fragment length, one box per group.
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=results_frag_length_df,
    x='group',
    y='peak_length',
    # A palette needs a hue variable; mapping hue to the x variable colours
    # each box by group, and legend=False avoids a redundant legend.
    hue='group',
    palette={'cancer': 'red', 'control': 'blue'},
    dodge=False,
    legend=False,
)

plt.xticks(rotation=45)
plt.ylabel("Peak Fragment Length")
# The x axis shows the group column, so label and title refer to groups.
plt.xlabel("Group")
plt.title("Distribution of Peak Fragment Lengths by Group")
# No plt.legend() call: the plot has no labelled artists, so it would only
# emit a warning and draw nothing.
plt.tight_layout()
plt.show()

# Visualisation of Results: Individual Cancer Groups (Pancreatic, Colorectal, ...)


In [None]:
# Boxplot of per-sample peak fragment length, one box per cancer type.
plt.figure(figsize=(12, 6))

sns.boxplot(
    data=results_frag_length_df,
    x='cancer_type',
    y='peak_length',
    # A palette needs a hue variable; hue mirrors x so every cancer type gets
    # its own colour, matching how the other boxplots in this file are built.
    hue='cancer_type',
    palette="Set2",
    dodge=False,
    legend=False,
)

plt.xticks(rotation=45)
plt.ylabel("Peak Fragment Length")
plt.xlabel("Cancer Type")
plt.title("Peak Fragment Length per Cancer Type")
plt.tight_layout()
plt.show()


# Matrix of Statistical Values of Cancer vs. Control Groups


In [None]:
# Per-sample fragment-length metrics, cached as a parquet file.
# If the cache exists it is loaded; otherwise the metrics are computed from
# the per-sample histogram TSVs and the cache is written. One path is used
# for the existence check, the read and the write so they cannot disagree.
metrics_path = "dataframes_for_ba/frag_length_metrics.parquet"


def get_cancer_type_fixed(sample):
    """Return the base name of the sample's folder, or "Unknown" if none is found."""
    folder = find_sample_folder(sample)
    if folder is None:
        return "Unknown"
    return os.path.basename(folder)


def _count_in_range(df, low, high):
    """Sum the counts of all bins lying entirely inside [low, high]."""
    return df[(df['min'] >= low) & (df['max'] <= high)]['count'].sum()


if os.path.exists(metrics_path):
    results_frag_length_df = pd.read_parquet(metrics_path)
else:
    results = []

    for sample in all_samples:
        tsv_path = os.path.join(tsv_dir, '**', f"{sample}.frag_length_bins.tsv")
        files = glob.glob(tsv_path, recursive=True)
        if not files:
            print(f"TSV file for sample {sample} not found.")
            continue
        df = pd.read_csv(files[0], sep="\t")
        vals = (df['min'] + df['max']) / 2  # bin midpoints
        counts = df['count']
        total_counts = counts.sum()
        if total_counts == 0:
            # Every metric below divides by the total count; a histogram
            # without fragments cannot be summarised.
            print(f"TSV file for sample {sample} contains no fragments.")
            continue

        # Count-weighted mean of the bin midpoints.
        mean_val = (vals * counts).sum() / total_counts

        # Median fragment length: midpoint of the first bin at which the
        # cumulative count reaches half of the total.
        cum_count = counts.cumsum()
        median_row = df[cum_count >= total_counts / 2].iloc[0]
        median_val = (median_row['min'] + median_row['max']) / 2

        # Count-weighted (population) standard deviation.
        std_val = ((counts * (vals - mean_val)**2).sum() / total_counts)**0.5
        min_val = df['min'].min()
        max_val = df['max'].max()

        # Mononucleosomal peak area (147-201bp), short fragment ratio (50-150bp),
        # mono/di ratio (147-201bp / 310-370bp)
        mono_peak = _count_in_range(df, 147, 201)
        short_frag = _count_in_range(df, 50, 150)
        di_peak = _count_in_range(df, 310, 370)

        # TODO: still needs to be backed by a citation, but most papers report
        # ~143 bp as the cancer-associated peak and ~167 bp as the healthy peak.
        cancerous_peak_area = _count_in_range(df, 130, 150)
        healthy_peak_area = _count_in_range(df, 160, 170)
        ultra_short_peaks = _count_in_range(df, 90, 120)
        cancerous_peaks_ratio = cancerous_peak_area / total_counts
        healthy_peaks_ratio = healthy_peak_area / total_counts
        ultra_short_peaks_ratio = ultra_short_peaks / total_counts
        cancer_healthy_ratio = cancerous_peak_area / healthy_peak_area if healthy_peak_area > 0 else None

        # ALT fraction: share of counts in bins ending at or below 150 bp
        # relative to those bins plus the bins starting above 150 bp.
        short_alt_area = df[df['max'] <= 150]['count'].sum()
        long_alt_area = df[df['min'] > 150]['count'].sum()
        alt_denominator = short_alt_area + long_alt_area
        alt_ratio = short_alt_area / alt_denominator if alt_denominator > 0 else None

        short_frag_ratio = short_frag / total_counts
        mono_di_ratio = mono_peak / di_peak if di_peak > 0 else None

        group = "cancer" if sample in cancer_samples else "control"

        results.append({
            "sample": sample,
            "group": group,
            "cancer_type": get_cancer_type_fixed(sample),
            "mean_fragment": mean_val,
            "median_fragment": median_val,
            "std_fragment": std_val,
            "min_fragment": min_val,
            "max_fragment": max_val,
            "mono_peak_area": mono_peak,
            "short_fragment_ratio": short_frag_ratio,
            "mono_di_ratio": mono_di_ratio,
            "cancerous_peaks_ratio": cancerous_peaks_ratio,
            "healthy_peaks_ratio": healthy_peaks_ratio,
            "cancer_healthy_ratio": cancer_healthy_ratio,
            "ultra_short_peaks_ratio": ultra_short_peaks_ratio,
            "alt_ratio": alt_ratio
        })

    results_frag_length_df = pd.DataFrame(results)
    # to_parquet does not create missing directories.
    os.makedirs(os.path.dirname(metrics_path), exist_ok=True)
    results_frag_length_df.to_parquet(metrics_path, index=False)
    print(f"Extracted metrics for {len(results_frag_length_df)} samples")

# Mean cancerous and healthy peak ratios per group


In [None]:

# Per-group mean of the two peak-area ratios (shown as the cell output).
ratio_columns = ['cancerous_peaks_ratio', 'healthy_peaks_ratio']
results_frag_length_df.groupby('group')[ratio_columns].mean()

In [None]:
# Print every sample's peak ratios, ordered by group and then by ascending
# cancerous peak ratio, without the DataFrame index.
ratio_table = results_frag_length_df[['sample', 'group', 'cancerous_peaks_ratio', 'healthy_peaks_ratio']]
ratio_table = ratio_table.sort_values(by=['group', 'cancerous_peaks_ratio'])
print(ratio_table.to_string(index=False))


# Plotting Boxplots for Important Fragment Length Metrics


In [None]:
# One row of five boxplots, each comparing cancer and control for one metric.
palette_colors = {'cancer': 'red', 'control': 'blue'}
fig, axes = plt.subplots(1, 5, figsize=(18, 6))

# (metric column, panel title, y-axis label); only the first panel carries a
# y-axis label, the others are blanked.
panels = [
    ("cancer_healthy_ratio", 'Cancer/Healthy Ratio', 'Cancer / Healthy Ratio'),
    ("cancerous_peaks_ratio", 'Cancerous Peaks Ratio', ''),
    ("healthy_peaks_ratio", 'Healthy Peaks Ratio', ''),
    ("ultra_short_peaks_ratio", 'Ultra Short Peaks Ratio', ''),
    ("alt_ratio", 'Alt Ratio', ''),
]

for ax, (metric, panel_title, y_label) in zip(axes, panels):
    sns.boxplot(
        data=results_frag_length_df,
        x="group",
        y=metric,
        ax=ax,
        hue="group",
        palette=palette_colors,
        dodge=False,
        legend=False,
    )
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)

plt.tight_layout()

plt.show()

In [None]:
# Preview the metrics table, then derive a copy indexed by sample name with
# the group label removed, and preview that as well.
print(results_frag_length_df.head())
frag_metrics_df_indexed = results_frag_length_df.set_index('sample').drop(columns=['group'])
print(frag_metrics_df_indexed.head())