In [2]:
import pandas as pd
import numpy as np

# Input locations: the junction table and the subject metadata (both tab-separated).
file_path = "/Users/leazeiberts/Downloads/data/Brain - Amygdala_junctions.tsv"
metadata_file_path = "/Users/leazeiberts/Master/Masterarbeit/patient_age.txt"

df = pd.read_csv(file_path, sep="\t")
metadata = pd.read_csv(metadata_file_path, sep="\t")

# Parse "Name" values of the form chr<id>_<start>_<end> into three columns.
# Names that do not match the pattern produce NaN in all three columns.
name_parts = df['Name'].str.extract(r'(chr\w+)_(\d+)_(\d+)')
df[['Chromosome', 'Start', 'End']] = name_parts

# The extracted coordinates are strings; convert them to numbers
# (anything unparsable becomes NaN).
for coordinate in ('Start', 'End'):
    df[coordinate] = pd.to_numeric(df[coordinate], errors='coerce')

# Keep only metadata rows that have a subject ID, and expose that ID under
# the name "Sample" so it can be used as the merge key later on.
metadata = metadata.loc[metadata['SUBJID'].notna()]
metadata = metadata.rename(columns={'SUBJID': 'Sample'})

# Remove the unused "Description" column, then discard every row that still
# contains a missing value (this includes rows whose "Name" failed to parse).
df = df.drop(columns=['Description'])
df = df.dropna()

# Sample columns are all columns other than the junction name and the three
# columns derived from it. Note: Index.difference returns them sorted.
non_sample_columns = ['Name', 'Chromosome', 'Start', 'End']
sample_columns = df.columns.difference(non_sample_columns)

def _to_group_fractions(group):
    """Divide every sample column of *group* by that column's sum.

    A column whose sum is 0 is divided by 1e-10 instead, which avoids a
    division by zero and leaves all of its fractions at 0.
    """
    column_totals = group.sum(axis=0)
    safe_totals = column_totals.replace(0, 1e-10)
    return group.div(safe_totals, axis=1)


# Junctions that share a chromosome and a start coordinate form one group.
grouped = df.groupby(['Chromosome', 'Start'])

# Per group and per sample: turn the values into fractions of the group total.
normalized_data = grouped[sample_columns].apply(_to_group_fractions)

def shannon_entropy(probs):
    """Return the Shannon entropy, in bits, of a vector of probabilities.

    Args:
        probs: Array-like of probabilities (NumPy array, pandas Series, list
            or tuple). Entries that are not strictly positive (zeros,
            negatives, NaN) are ignored.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the positive entries as a
        built-in ``float``. The result is never negative, and it is ``0.0``
        when there are no positive entries.
    """
    # Coerce first so that plain lists/tuples support the boolean mask below.
    values = np.asarray(probs, dtype=float)
    positive = values[values > 0]
    if positive.size == 0:
        return 0.0
    entropy = -np.sum(positive * np.log2(positive))
    # Clamp at zero: a single probability of 1 gives -0.0, and rounding could
    # otherwise leave a tiny negative number.
    return max(0.0, float(entropy))

def _groups_with_min_reads(group_by, sample_cols, min_total_reads=10):
    """Return the per-sample sums of the groups that pass the read threshold.

    Args:
        group_by: DataFrameGroupBy over the junction table.
        sample_cols: Labels of the per-sample columns to sum.
        min_total_reads: Minimum value that a group's sums, added up over
            ALL samples, must reach for the group to be kept.

    Returns:
        DataFrame indexed by group key with one column per sample,
        restricted to the groups that reach the threshold.
    """
    totals = group_by[sample_cols].sum()
    # NOTE(review): an earlier comment described this as a threshold on the
    # reads *per sample*, but the code has always compared the total across
    # all samples. That behavior is kept here; confirm which one is intended.
    return totals[totals.sum(axis=1) >= min_total_reads]


def _entropy_table(normalized, kept_groups, position_col, sample_cols):
    """Build a long table with one Shannon entropy per (group, sample).

    Args:
        normalized: Normalized values whose first two index levels are the
            group key (chromosome, position).
        kept_groups: Index of group keys to include; other groups are skipped.
        position_col: Name given to the position column ('Start' or 'End').
        sample_cols: Labels of the per-sample columns.

    Returns:
        DataFrame with columns Chromosome, <position_col>, Sample, Entropy
        and Base_ID (the leading ``GTEX-<id>`` part of the sample name).
    """
    records = []
    for key, block in normalized.groupby(level=[0, 1]):
        if key not in kept_groups:
            continue
        chromosome, position = key
        # NOTE(review): a sample whose values in this group are all zero has
        # only zero fractions and therefore gets entropy 0 here.
        records.extend(
            (chromosome, position, sample, shannon_entropy(block[sample].values))
            for sample in sample_cols
        )

    table = pd.DataFrame(
        records, columns=['Chromosome', position_col, 'Sample', 'Entropy']
    )
    # Extract base ID (e.g., GTEX-11ZU8) from the detailed sample IDs.
    table['Base_ID'] = table['Sample'].str.extract(r'(GTEX-\w+)', expand=False)
    return table


def _attach_metadata(entropy_table, sample_metadata):
    """Left-join *sample_metadata* onto *entropy_table* via the base ID.

    Both frames have a "Sample" column, so the merge suffixes them with
    ``_x`` / ``_y``. The detailed sample ID from the entropy table is kept as
    "Sample"; the metadata's key column is dropped.
    """
    merged = entropy_table.merge(
        sample_metadata, left_on='Base_ID', right_on='Sample', how='left'
    )
    merged = merged.rename(columns={'Sample_x': 'Sample'})
    return merged.drop(columns=['Sample_y'])


# --- Entropy per sample for junctions sharing a start coordinate ---
valid_groups = _groups_with_min_reads(grouped, sample_columns)
entropy_df = _entropy_table(
    normalized_data, valid_groups.index, 'Start', sample_columns
)
merged_entropy_df = _attach_metadata(entropy_df, metadata)

result_start = merged_entropy_df.sort_values(by=['Chromosome', 'Start', 'Sample'])
print(result_start.head(30))

# --- Same calculation for junctions sharing an end coordinate ---
grouped_end = df.groupby(['Chromosome', 'End'])

# Per group and per sample: fractions of the group total (a zero total is
# replaced by 1e-10 so the division is defined and the fractions stay 0).
normalized_data_end = grouped_end[sample_columns].apply(
    lambda group: group.div(group.sum(axis=0).replace(0, 1e-10), axis=1)
)

valid_groups_end = _groups_with_min_reads(grouped_end, sample_columns)
entropy_end_df = _entropy_table(
    normalized_data_end, valid_groups_end.index, 'End', sample_columns
)
merged_entropy_end_df = _attach_metadata(entropy_end_df, metadata)

result_end = merged_entropy_end_df.sort_values(by=['Chromosome', 'End', 'Sample'])
print(result_end.head(30))


   Chromosome  Start                        Sample   Entropy     Base_ID  SEX  \
0        chr1  12058  GTEX-11ZU8-0011-R4a-SM-5BC6Y  0.918296  GTEX-11ZU8    1   
1        chr1  12058  GTEX-11ZVC-0011-R4a-SM-5BC6Z  0.000000  GTEX-11ZVC    2   
2        chr1  12058  GTEX-12WSA-0011-R4a-SM-57WB7  1.000000  GTEX-12WSA    1   
3        chr1  12058  GTEX-12WSD-0011-R4b-SM-5LZUA  0.000000  GTEX-12WSD    2   
4        chr1  12058  GTEX-12WSF-0011-R4b-SM-5HL88  0.000000  GTEX-12WSF    1   
5        chr1  12058  GTEX-12WSH-0011-R4a-SM-5GU6K  0.000000  GTEX-12WSH    1   
6        chr1  12058  GTEX-12ZZW-0011-R4a-SM-5DUX9  0.000000  GTEX-12ZZW    1   
7        chr1  12058  GTEX-12ZZZ-0011-R4b-SM-5DUV7  0.000000  GTEX-12ZZZ    1   
8        chr1  12058  GTEX-13112-0011-R4b-SM-5DUXL  0.000000  GTEX-13112    1   
9        chr1  12058  GTEX-1313W-0011-R4b-SM-5KLZV  0.000000  GTEX-1313W    2   
10       chr1  12058  GTEX-131XH-0011-R4b-SM-5DUWB  0.000000  GTEX-131XH    1   
11       chr1  12058  GTEX-1