# CORE

In [3]:
import pandas as pd
import re

def parse_variant(variant):
    """
    Parse a variant string into the list of integers it refers to.

    The substring 'del' is removed first, then every run of digits is
    extracted. Exactly two numbers are treated as an inclusive range
    (start..end); exactly one number yields a single-element list; any
    other count yields an empty list.

    Missing values (None, or the float NaN that pandas uses for empty CSV
    cells) yield an empty list instead of raising AttributeError.
    """
    # NaN is the only float that is not equal to itself.
    if variant is None or (isinstance(variant, float) and variant != variant):
        return []

    # Removing 'del' before matching means digits on either side of it are
    # read as one number; this is kept as-is to preserve existing results.
    cleaned = variant.replace('del', '')
    numbers = [int(token) for token in re.findall(r'\d+', cleaned)]

    if len(numbers) == 2:
        start, end = numbers
        # An inverted range (start > end) produces an empty list.
        return list(range(start, end + 1))
    if len(numbers) == 1:
        return numbers
    return []

def flatten_and_sort(lists):
    """
    Collapse one level of nesting, drop duplicates, and return ascending order.

    Each entry of ``lists`` that is itself a list contributes its elements;
    any other entry contributes itself.
    """
    unique_items = {
        item
        for entry in lists
        for item in (entry if isinstance(entry, list) else [entry])
    }
    return sorted(unique_items)

# Load the data
file_path = 'Figure 7 pheno w-o sg.csv'
df = pd.read_csv(file_path)

# Trait status columns; the loop below only acts on values 1 and 0 in them
traits = ['ALF-sc', 'Optic Atrophy-sc', 'OI-sc', 'Short Stature-sc', 'low-IG-sc', 'low-NK-sc']

# Per-trait accumulators of parsed position lists:
#   homozygous   - trait == 1 and both protein columns parse to the same positions
#   heterozygous - trait == 1 and the two protein columns parse differently
#   no_phenotype - trait == 0 (positions from both columns)
heterozygous = {trait: [] for trait in traits}
homozygous = {trait: [] for trait in traits}
no_phenotype = {trait: [] for trait in traits}

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    protein_1 = row['Protein 1']
    protein_2 = row['Protein 2']

    # The parsed positions depend only on the row, so parse once per row
    # rather than once per trait.
    variant_1 = parse_variant(protein_1)
    variant_2 = parse_variant(protein_2)

    # NOTE(review): equality is checked on parsed positions only, so two
    # different strings with the same numbers count as matching - confirm
    # this is the intended definition.
    alleles_match = variant_1 == variant_2
    parsed = [variant_1] if alleles_match else [variant_1, variant_2]

    for trait in traits:
        status = row[trait]
        if status == 1:
            bucket = homozygous if alleles_match else heterozygous
        elif status == 0:
            bucket = no_phenotype
        else:
            # Any other status value contributes nothing for this trait.
            continue
        bucket[trait].extend(parsed)
# Collapse every per-trait accumulator into a sorted list of unique positions
for trait in traits:
    for table in (heterozygous, homozygous, no_phenotype):
        table[trait] = flatten_and_sort(table[trait])

# Split each trait's flattened position list at a threshold and print both groups.
SPLIT_POSITION = 1185
GROUP_LABELS = [f'<{SPLIT_POSITION}', f'>={SPLIT_POSITION}']


def _split_by_position(positions, threshold):
    """
    Partition ``positions`` into values below ``threshold`` and the rest.

    Returns a dict keyed by '<threshold' and '>=threshold'. Input order is
    kept, so a sorted, duplicate-free input gives sorted, duplicate-free
    groups and no second flatten/sort pass is needed.
    """
    return {
        f'<{threshold}': [pos for pos in positions if pos < threshold],
        f'>={threshold}': [pos for pos in positions if pos >= threshold],
    }


# Created once, outside the loop, so the split results of every trait remain
# available afterwards (previously each iteration replaced these dicts and
# only the last trait survived).
heterozygous_split = {}
homozygous_split = {}
no_phenotype_split = {}

for trait in traits:
    # The per-trait lists hold plain integers at this point (they were
    # flattened earlier), so only the value comparison is needed.
    heterozygous_split[trait] = _split_by_position(heterozygous[trait], SPLIT_POSITION)
    homozygous_split[trait] = _split_by_position(homozygous[trait], SPLIT_POSITION)
    no_phenotype_split[trait] = _split_by_position(no_phenotype[trait], SPLIT_POSITION)

    # Printing the results
    print(f"Trait: {trait}")
    for group in GROUP_LABELS:
        print(f"Group: {group}")
        print(f"With no phenotype: {no_phenotype_split[trait][group]}")
        print(f"Heterozygous variants: {heterozygous_split[trait][group]}")
        print(f"Homozygous variants: {homozygous_split[trait][group]}")
        print()


Trait: ALF-sc
Group: <1185
With no phenotype: [103, 256, 343, 650, 671, 747]
Heterozygous variants: [1, 64, 136, 137, 151, 202, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 253, 340, 373, 396, 422, 517, 568, 845, 855, 856, 857, 858, 859, 892, 984, 1055, 1129]
Homozygous variants: [137, 254, 271, 519, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690