In [None]:
import duckdb
import numpy
import pandas as pd

In [None]:
# Notebook display settings: never truncate cell text, keep wide frames on a
# single line instead of wrapping them, and show every column.
display_settings = {
    'display.max_colwidth': None,
    'display.expand_frame_repr': False,
    'display.max_columns': None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Open the project's DuckDB database file (absolute path on the author's machine).
capstone_db_path = r"C:\Users\vigne\Desktop\Capstone\Datasets\Capstone_data_sql.duckdb"
con = duckdb.connect(capstone_db_path)

In [None]:
# Load the selected columns of the `allele` table into a DataFrame.
# The connection is closed in `finally` so the database file is released even
# when the query raises (previously a failed query left the connection open).
try:
    allele_md = con.execute("select AlleleID,GeneID,ClinicalSignificance,Origin,Chromosome,ReviewStatus,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,Category,MC  from allele").fetchdf()
finally:
    con.close()

In [None]:
# Rename "Category" to "VariantGeneRelation".
allele_md = allele_md.rename(columns={"Category": "VariantGeneRelation"})

# Row filters; the counts in parentheses are the original author's notes on
# how many rows each filter removes.
# PositionVCF == -1 (2)
has_vcf_position = allele_md['PositionVCF'] != -1
# Ambiguous alt alleles, i.e. anything other than a single A/T/G/C (8)
has_single_base_alt = allele_md['AlternateAlleleVCF'].isin(['A', 'T', 'G', 'C'])
# Chromosome not found (only 1)
on_known_chromosome = allele_md['Chromosome'] != 'Un'

allele_md = allele_md[has_vcf_position & has_single_base_alt & on_known_chromosome]

# PositionVCF is not used again in this section once the filter has run.
allele_md = allele_md.drop(columns="PositionVCF")


# Filtering data

### Based on clinical significance.
- Retain only the pathogenic, pathogenic-with-modifier, benign, and benign-with-modifier labels → 39 labels → binary encoding (pathogenic = 1, benign = 0).
- Row count: 3059288 → 1320719

In [None]:

# ClinicalSignificance values treated as the positive (pathogenic) class:
# the plain labels first, then the variants that carry an extra modifier.
pathogenic_labels = [
    "Pathogenic",
    "Likely pathogenic",
    "Pathogenic/Likely pathogenic",
    "Pathogenic/Likely pathogenic/Pathogenic, low penetrance",
    "Pathogenic/Pathogenic, low penetrance; other",
    "Pathogenic/Likely pathogenic/Pathogenic, low penetrance; other",
    "Likely pathogenic, low penetrance",
] + [
    # Pathogenic with modifiers
    "Pathogenic; risk factor",
    "Pathogenic; other",
    "Pathogenic; Affects",
    "Pathogenic; drug response",
    "Pathogenic; association",
    "Pathogenic; confers sensitivity",
    "Pathogenic; association; protective",
    "Likely pathogenic; other",
    "Likely pathogenic; drug response",
    "Likely pathogenic; risk factor",
    "Likely pathogenic; Affects",
    "Likely pathogenic; association",
]

# ClinicalSignificance values treated as the negative (benign) class.
benign_labels = [
    "Benign",
    "Likely benign",
    "Benign/Likely benign",
] + [
    # Benign with modifiers
    "Benign; drug response",
    "Benign; risk factor",
    "Benign; other",
    "Benign; protective",
    "Benign; association",
    "Benign; Affects",
    "Benign; confers sensitivity",
    "Likely benign; drug response",
    "Likely benign; other",
    "Likely benign; protective",
    "Likely benign; risk factor",
    "Likely benign; Affects",
    "Likely benign; association",
    "Benign/Likely benign; other",
    "Benign/Likely benign; drug response",
    "Benign/Likely benign; other; risk factor",
    "Benign/Likely benign; risk factor",
]

# Text label -> binary target: 1 for every pathogenic label, 0 for every benign one.
label_map = dict.fromkeys(pathogenic_labels, 1)
label_map.update(dict.fromkeys(benign_labels, 0))

# Keep only the rows whose ClinicalSignificance is a key of label_map, then
# replace the text label with its mapped integer code.
is_labelled = allele_md['ClinicalSignificance'].isin(list(label_map))
allele_md = allele_md.loc[is_labelled].copy()
allele_md['ClinicalSignificance'] = allele_md['ClinicalSignificance'].map(label_map)

### Based on Review Status:
- 5 unique labels
    ```
    criteria provided, multiple submitters, no conflicts : 288542
    criteria provided, single submitter : 973139
    no assertion criteria provided : 49527
    reviewed by expert panel : 9492
    practice guideline : 19
    ```
- dropping :  no assertion criteria provided
- final length : 1271192

In [None]:
# Remove rows submitted without assertion criteria, then drop the
# ReviewStatus column itself.
has_assertion_criteria = allele_md["ReviewStatus"] != 'no assertion criteria provided'
allele_md = allele_md.loc[has_assertion_criteria].drop(columns=['ReviewStatus'])

### Multi-hot encoding of Molecular consequence column.
**The unique values are:**
- High Impact (Likely Pathogenic):
    -  nonsense - Creates stop codon
    - splice_donor_variant - Disrupts splicing
    - splice_acceptor_variant - Disrupts splicing
    - initiator_codon_variant - Affects translation start
    - stop_lost - Removes natural stop codon

- Moderate Impact:
    - missense_variant - Changes amino acid
    - 5_prime_UTR_variant - Affects regulation/translation
    - 3_prime_UTR_variant - Affects regulation/stability

- Low Impact (Likely Benign):
    - synonymous_variant - Silent change
    - intron_variant - Usually neutral
    - non-coding_transcript_variant - Variable impact
    - genic_upstream_transcript_variant - Distant regulatory
    - genic_downstream_transcript_variant - Distant regulatory
    - no_sequence_alteration - No change

```
array(['nonsense', 'non-coding_transcript_variant', 'missense_variant',
       'intron_variant', '5_prime_UTR_variant', 'splice_donor_variant',
       'synonymous_variant', 'splice_acceptor_variant',
       'initiator_codon_variant', '3_prime_UTR_variant',
       'no_sequence_alteration', 'stop_lost',
       'genic_upstream_transcript_variant',
       'genic_downstream_transcript_variant'], dtype=object)
```

In [None]:
# Strip the "SO:<digits>|" prefix that precedes each consequence term in MC.
mc_terms_only = allele_md['MC'].str.replace(r'SO:\d+\|', '', regex=True)
allele_md['MC'] = mc_terms_only

# Distinct terms across all rows; one entry may hold several comma-separated
# terms, and missing entries are ignored.
mc_tokens = allele_md['MC'].str.split(',').explode()
unique_MC = mc_tokens.dropna().unique()

In [None]:
import re

# Add one 0/1 indicator column per molecular-consequence term.
# The term has to match a whole comma-separated entry of MC: it is
# regex-escaped and anchored between commas / string boundaries. A plain
# `str.contains(term)` would interpret the term as a regex and would also
# fire when the term is merely a substring of a different term.
# Missing MC values yield 0 (na=False).
for variant in unique_MC:
    whole_term = rf'(?:^|,){re.escape(variant)}(?:,|$)'
    allele_md[f'has_MC_{variant}'] = (
        allele_md['MC'].str.contains(whole_term, regex=True, na=False).astype(int)
    )

In [None]:
# Remove the raw MC text column.
allele_md = allele_md.drop(columns=['MC'])

###  Multi-hot encoding of Origin column
**The unique values are:**
- High Clinical Relevance:

    - de novo - New mutation, often more concerning
    - maternal/paternal - Inheritance pattern matters for some conditions

- Moderate Relevance:

    - germline - Constitutional variant
    - inherited - Familial variant
    - biparental - Both parents contribute

- Low/Neutral:

    - unknown, not-reported, tested-inconclusive - Lack of information
    - not applicable - Administrative

array(['germline', 'biparental', 'unknown', 'maternal', 'paternal',
       'inherited', 'de novo', 'not applicable', 'tested-inconclusive',
       'uniparental', 'not-reported'], dtype=object)

In [None]:
# Distinct origin values; a single entry may list several separated by ';',
# and missing entries are ignored.
origin_tokens = allele_md['Origin'].str.split(';').explode()
origin_types = origin_tokens.dropna().unique()

In [None]:
import re

# Add one 0/1 indicator column per origin value.
# The value has to match a whole ';'-separated entry of Origin: it is
# regex-escaped and anchored between semicolons / string boundaries, so it is
# never interpreted as a regex and never matches as a substring of another
# value. Missing Origin values yield 0 (na=False).
for origin in origin_types:
    whole_value = rf'(?:^|;){re.escape(origin)}(?:;|$)'
    allele_md[f'has_Origin_{origin}'] = (
        allele_md['Origin'].str.contains(whole_value, regex=True, na=False).astype(int)
    )

# Drop original column
allele_md = allele_md.drop(['Origin'], axis=1)