In [None]:
import duckdb
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Notebook-wide pandas display settings: never truncate cell text, keep wide
# frames on a single row block instead of wrapping, and show every column.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.expand_frame_repr', False),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the whole `allele` table into a DataFrame.
# NOTE(review): the database path is machine-specific; consider moving it to a
# configurable location before sharing this notebook.
con = duckdb.connect(r"C:\Users\vigne\Desktop\Capstone\datasets\model_train_data.duckdb")
try:
    adf = con.execute("select * from allele").fetch_df()
finally:
    # Always release the database handle, even when the query raises;
    # otherwise the connection stays open for the lifetime of the kernel.
    con.close()

# Majority Undersampling
- We have far more benign samples compared to pathogenic ones.
- We therefore undersample the benign class down to the number of pathogenic samples.

In [None]:
# Undersample the benign class (label 0) to the size of the pathogenic class
# (label 1), then stack both groups into a single balanced frame.
is_pathogenic = adf['ClinicalSignificance'].eq(1)
is_benign = adf['ClinicalSignificance'].eq(0)

pathogenic_df = adf.loc[is_pathogenic]
benign_df = adf.loc[is_benign]

# Fixed seed so the same benign rows are drawn on every run.
benign_sampled = benign_df.sample(n=pathogenic_df.shape[0], random_state=42)
balanced_df = pd.concat([pathogenic_df, benign_sampled], axis=0)

In [None]:
# Identifier and label columns are not model inputs; everything else is a feature.
non_feature_cols = {'AlleleID', 'ClinicalSignificance', 'GeneID'}
feature_cols_allele = [name for name in balanced_df.columns if name not in non_feature_cols]

X = balanced_df[feature_cols_allele]
y = balanced_df['ClinicalSignificance']

# Two-stage stratified split -> 70% train / 15% validation / 15% test.
# Stage 1 holds out 30% of the rows; stage 2 halves that hold-out.
trainx, X_temp, trainy, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
valx, testx, valy, testy = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# SENN

## Model

### Conceptizer : Identity

In [None]:
class IdentityConceptizer(nn.Module):
    """
    Pass-through conceptizer: each input feature is used directly as a concept.

    - ``encode`` appends a trailing singleton axis and ``decode`` strips it again;
      nothing is learned here.
    - It exists only so the model keeps the usual SENN conceptizer interface/flow,
      which leaves room to swap in another conceptizer later.
    - Note: because the decoded output is just the original input, any
      reconstruction loss computed from it is meaningless (always ~0).
    """

    def __init__(self, **kwargs):
        # Keyword arguments are accepted but not used.
        super().__init__()

    def encode(self, x):
        """Add a trailing dummy axis, e.g. (BATCH, FEATURES) -> (BATCH, FEATURES, 1)."""
        return x.unsqueeze(dim=-1)

    def decode(self, z):
        """Drop the trailing dummy axis, e.g. (BATCH, FEATURES, 1) -> (BATCH, FEATURES)."""
        return z.squeeze(dim=-1)

    def forward(self, x):
        """Return the pair ``(encoded, decoded)`` for input ``x``."""
        concepts = self.encode(x)
        reconstruction = self.decode(concepts)
        return concepts, reconstruction


In [None]:
class LinearParameterizer(nn.Module):
    """
        - Hidden layers by default: 128, 64, 32 -> achieved 93% test accuracy
        - Custom hidden_sizes can be provided for experimentation
        - Takes raw input features (not concepts so -> called with x or concepts.squeeze) since IdentityConceptizer makes them equivalent
    """
    def __init__(self, num_features, num_concepts, num_classes, hidden_sizes=None, dropout=0.3):
        super().__init__()
        self.num_concepts = num_concepts
        self.num_classes = num_classes
        
        # Default hidden sizes if not provided
        if hidden_sizes is None:
            hidden_sizes = [num_features, 128, 64, 32, num_concepts * num_classes]
        else:
            hidden_sizes = [num_features] + list(hidden_sizes) + [num_concepts * num_classes]
        
        layers = []
        for h, h_next in zip(hidden_sizes[:-1], hidden_sizes[1:]):
            layers.append(nn.Linear(h, h_next))
            if h_next != hidden_sizes[-1]:  
                layers.append(nn.Dropout(dropout))
                layers.append(nn.ReLU())
        
        self.layers = nn.Sequential(*layers)
    