In [1]:
# %% Import required libraries and modules
import logging
import os
import sys
import warnings

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join("..", "src")))

# Suppress all FutureWarning messages
warnings.simplefilter(action="ignore", category=FutureWarning)

import decoupler as dc
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

from evaluation import evaluate_model
from models import FlexibleFCNN
from preprocess import split_data
from training import train_model
from utils import create_dataloader, load_config, load_sampled_data


# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# Load Config
config = load_config("../config.yaml")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

# %% Load and Preprocess Datasets with TF Activity Inference
logging.info("Loading datasets and running TF activity inference...")

2025-02-03 08:19:03 [INFO] - Using device: cuda
2025-02-03 08:19:03 [INFO] - Loading datasets and running TF activity inference...


In [None]:
preprocess_gene_data(config, standardize=True, feature_space="best inferred", chunk_size=2500)

In [2]:
X = load_sampled_data(config["data_paths"]["preprocessed_best_inferred_file"])
X.shape

(5243, 10177)

In [None]:
import numpy as np

feature_space = "best inferred"

logging.debug("Loading datasets.")
X_rna = np.fromfile(config["data_paths"]["rna_file"], dtype=np.float32).reshape(
    -1, 12328
)
geneinfo = pd.read_csv(config["data_paths"]["geneinfo_file"], sep="\t")
y_df = pd.read_csv(config["data_paths"]["y_file"], delimiter="\t")
print(geneinfo.shape), print(y_df.shape)

# Select genes based on user's choice
if feature_space == "landmark":
    logging.debug("Using landmark genes.")
    selected_genes = geneinfo[geneinfo.feature_space == "landmark"]
elif feature_space == "best inferred":
    logging.debug("Using best inferred genes (including landmark genes).")
    selected_genes = geneinfo[
        geneinfo.feature_space.isin(["landmark", "best inferred"])
    ]
elif feature_space == "all":
    logging.debug("Using all genes.")
    selected_genes = geneinfo
else:
    raise ValueError("Invalid feature space selected.")

In [4]:
import numpy as np
X_rna = np.fromfile(config["data_paths"]["rna_file"], dtype=np.float32).reshape(
    -1, 12328
)

In [5]:
X_rna.shape


(63134, 12328)

In [6]:
# Turn X_rna into a pandas dataframe
X_rna_df = pd.DataFrame(X_rna, columns=geneinfo["gene_symbol"])

NameError: name 'geneinfo' is not defined

In [16]:
X_rna_df

gene_symbol,GAS8-AS1,ATXN8OS,XIST,INE1,FAM30A,LINC01587,HCG9,DSCR4,LINC01565,FAM215A,...,CCR2,TMEM242,SMIM27,ARMCX4,NBPF10,TIMM23,ZNF783,MICA,TMEM257,C10orf12
0,0.0,0.00,0.0,2.000,0.0,0.000,0.0,0.000,0.0,2.3125,...,0.0,0.000,0.0,1.875,0.0,0.000,0.0,0.000,0.0,2.000
1,0.0,2.25,0.0,0.000,0.0,2.125,0.0,1.875,0.0,0.0000,...,0.0,2.250,0.0,1.875,0.0,2.000,0.0,-1.875,0.0,-1.875
2,0.0,0.00,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.0000,...,0.0,0.000,0.0,-1.875,0.0,-1.875,0.0,0.000,0.0,0.000
3,0.0,0.00,0.0,1.875,0.0,0.000,0.0,0.000,0.0,0.0000,...,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000,0.0,-1.875
4,0.0,0.00,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.0000,...,0.0,0.000,0.0,0.000,0.0,1.875,0.0,0.000,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63129,0.0,0.00,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.0000,...,0.0,1.875,0.0,0.000,0.0,0.000,0.0,2.000,0.0,0.000
63130,0.0,0.00,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.0000,...,0.0,1.875,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000
63131,0.0,0.00,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.0000,...,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000
63132,0.0,0.00,0.0,0.000,0.0,0.000,0.0,1.875,0.0,0.0000,...,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000,0.0,0.000
