In [None]:
import numpy as np
import pandas as pd

# Load per-sample experiment details (including the viability column); output Y
siginfo = pd.read_csv("../data/raw/Y.tsv", sep="\t", index_col=0)

# Boolean mask with one entry per row of Y.tsv, True where viability is missing.
# It is computed BEFORE filtering so it still describes every raw sample.
na_idxs = siginfo.viability.isna()
n_samples = len(na_idxs)

# Drop the samples that have no viability measurement
siginfo = siginfo.loc[~na_idxs]

# Load gene expressions; input X_gene.
# The file is read as a flat float64 buffer and is assumed to hold one row per
# row of Y.tsv, so the row count is taken from the sample table instead of
# being hard-coded (it was previously the literal 31567).
X_rna = np.fromfile("../data/raw/X_RNA.bin", dtype=np.float64)
if n_samples == 0 or X_rna.size % n_samples != 0:
    raise ValueError(
        f"X_RNA.bin holds {X_rna.size} values, which cannot be split evenly "
        f"into {n_samples} sample rows from Y.tsv"
    )
X_rna = X_rna.reshape(n_samples, -1)  # (samples, features)

print(X_rna.shape)


(31567, 24656)


In [8]:
# Load gene annotations
geneinfo = pd.read_csv("../data/raw/geneinfo_beta.txt", sep="\t")

# Select genes whose feature space is "landmark", "best inferred" or "inferred"
# (note: despite the variable names below, this is not restricted to landmark genes)
selected_genes = geneinfo.feature_space.isin(["landmark", "best inferred", "inferred"])

# X_rna is a plain NumPy array, so the mask is applied purely by position:
# row i of geneinfo is taken to describe column i of X_rna. Convert the pandas
# mask explicitly and check the lengths so a mismatch is reported clearly.
gene_mask = selected_genes.to_numpy()
if gene_mask.shape[0] != X_rna.shape[1]:
    raise ValueError(
        f"geneinfo has {gene_mask.shape[0]} rows but X_rna has "
        f"{X_rna.shape[1]} columns; cannot map gene annotations to columns"
    )

# Filter the X_rna array to include only the selected genes
X_landmark = X_rna[:, gene_mask]

# Create DataFrame for selected genes: rows are labelled with the full
# (unfiltered) sample index, columns with the selected gene symbols
X_landmark_df = pd.DataFrame(
    X_landmark, index=na_idxs.index, columns=geneinfo.loc[selected_genes, "gene_symbol"]
)

# Remove the samples whose viability is NaN (na_idxs is True for those rows)
X_landmark_df = X_landmark_df.loc[~na_idxs]

In [9]:
# Report the dimensions of the gene-filtered frame
print(f"Shape of the DataFrame with selected genes: {X_landmark_df.shape}")

# Sanity check: count the entries that are exactly 0.0.
# First tally them per column, then add the per-column tallies together.
total_elements = X_landmark_df.size
zeros_per_column = X_landmark_df.eq(0.0).sum()
num_zeros = zeros_per_column.sum()

# Share of entries that are exactly 0.0, as a percentage
percentage_zeros = num_zeros / total_elements * 100

print(f"Number of 0.0 values: {num_zeros}")
print(f"Total number of elements: {total_elements}")
print(f"Percentage of 0.0 values: {percentage_zeros:.2f}%")

Shape of the DataFrame with selected genes: (31159, 12328)
Number of 0.0 values: 299807961
Total number of elements: 384128152
Percentage of 0.0 values: 78.05%


In [10]:
df = X_landmark_df

# One flag per column: True when every entry in that column equals 0.0
all_zeros_columns = df.eq(0.0).all(axis=0)

# Labels of the columns that consist solely of 0.0 values
columns_all_zeros = all_zeros_columns.loc[all_zeros_columns].index.tolist()

# How many such columns there are, and their share of all columns (in percent)
num_all_zeros_columns = all_zeros_columns.sum()
percentage = num_all_zeros_columns / df.shape[1] * 100

print(f"Number of columns with only 0.0 values: {num_all_zeros_columns}")
print(f"Columns with only 0.0 values: {columns_all_zeros}")
print(f"Percentage of columns with only 0.0 values: {percentage:.2f}%")

Number of columns with only 0.0 values: 0
Columns with only 0.0 values: []
Percentage of columns with only 0.0 values: 0.00%


In [11]:
df = X_landmark_df

# One flag per row: True when every entry in that row equals 0.0
all_zeros_rows = df.eq(0.0).all(axis=1)

# Index labels of the rows that consist solely of 0.0 values
rows_all_zeros = all_zeros_rows.loc[all_zeros_rows].index.tolist()

# How many such rows there are, and their share of all rows (in percent)
num_all_zeros_rows = all_zeros_rows.sum()
percentage_rows = num_all_zeros_rows / df.shape[0] * 100

print(f"Number of rows with only 0.0 values: {num_all_zeros_rows}")
print(f"Rows with only 0.0 values: {rows_all_zeros}")
print(f"Percentage of rows with only 0.0 values: {percentage_rows:.2f}%")

Number of rows with only 0.0 values: 0
Rows with only 0.0 values: []
Percentage of rows with only 0.0 values: 0.00%
