In [4]:
import loompy
import pandas as pd

# Export a per-gene length table from the loom file's row attributes.
loom_file_path = 'stem.loom'

# Read the row attributes inside a context manager so the file is closed
# even if one of the attribute lookups raises. The arrays returned by
# ds.ra[...] stay usable after the connection is closed.
with loompy.connect(loom_file_path) as ds:
    genes = ds.ra['Gene']
    start_positions = ds.ra['Start']
    end_positions = ds.ra['End']

# Gene length is taken as the difference between the End and Start attributes.
# NOTE(review): this is the genomic span between the two coordinates; if the
# annotation uses 1-based inclusive coordinates it is one base short, and it
# is not an exonic/transcript length — confirm this is the length wanted for
# the downstream FPKM calculation.
gene_lengths = end_positions - start_positions

# Two-column table: gene identifier and its length
gene_length_df = pd.DataFrame({
    'Gene': genes,
    'Length': gene_lengths
})

# Write the table without the positional index so 'Gene' is the first column
gene_length_csv_path = 'stem_gene_lengths.csv'
gene_length_df.to_csv(gene_length_csv_path, index=False)


In [5]:
import loompy
import pandas as pd

# Loom file that holds the matrix to export
loom_file_path = 'stem.loom'

# Pull the labels and the full matrix while the connection is open; the
# resulting arrays are used after the file has been closed.
with loompy.connect(loom_file_path) as ds:
    gene_names = ds.ra['Gene']        # row labels
    cell_barcodes = ds.ca['CellID']   # column labels
    count_matrix = ds[:, :]           # entire matrix, read in one slice

# Report the matrix dimensions
print(f"Count matrix shape: {count_matrix.shape}")

# Label the matrix (rows = 'Gene' values, columns = 'CellID' values) ...
df = pd.DataFrame(data=count_matrix, index=gene_names, columns=cell_barcodes)

# ... and write it out, keeping the gene labels as the first CSV column
df.to_csv('stem_count_matrix.csv')

Count matrix shape: (43682, 27)


In [6]:
import pandas as pd
import numpy as np

# Compute an FPKM matrix from the exported count matrix and gene lengths.

# Load the count matrix (first CSV column becomes the row index)
count_matrix_path = 'stem_count_matrix.csv'
count_matrix = pd.read_csv(count_matrix_path, index_col=0)

# Load gene lengths
gene_lengths_path = 'stem_gene_lengths.csv'  # CSV with 'Gene' and 'Length' columns
gene_lengths = pd.read_csv(gene_lengths_path, index_col='Gene')

# Keep only the genes present in both tables, in the same order
common_genes = count_matrix.index.intersection(gene_lengths.index)
count_matrix = count_matrix.loc[common_genes]
gene_lengths = gene_lengths.loc[common_genes]

# A length of zero (Start == End) would turn the division below into inf, and
# a negative length would flip the sign of the result. Mask such lengths (and
# missing ones) to NaN so the affected genes are clearly marked as undefined.
valid_lengths = gene_lengths['Length'].where(gene_lengths['Length'] > 0, np.nan)
n_invalid_lengths = int(valid_lengths.isna().sum())
if n_invalid_lengths:
    print(f"Warning: {n_invalid_lengths} gene(s) have a non-positive or missing length; "
          "their FPKM values are set to NaN")

# Per-sample library size: column sums of the counts over the retained genes
total_mapped_reads = count_matrix.sum(axis=0)

# FPKM = count * 1e9 / (gene length in bp * total reads in the sample)
fpkm_matrix = count_matrix.div(valid_lengths, axis=0) * 1e9
fpkm_matrix = fpkm_matrix.div(total_mapped_reads, axis=1)

# Save the FPKM matrix to a new CSV file
fpkm_matrix.to_csv('stem_fpkm_matrix.csv')


In [None]:
import loompy
import pandas as pd

# End-to-end pipeline: loom file -> gene length CSV + count matrix CSV -> FPKM CSV.
loom_file_path = 'all_lib.loom'

# Steps 1 and 2: read everything needed from the loom file in a single
# connection instead of opening the same file twice.
with loompy.connect(loom_file_path) as ds:
    genes = ds.ra['Gene']
    start_positions = ds.ra['Start']
    end_positions = ds.ra['End']
    cell_barcodes = ds.ca['CellID']
    count_matrix = ds[:, :]

# The count matrix rows are labelled with the same 'Gene' attribute
gene_names = genes

# Gene length in base pairs, taken as End - Start.
# NOTE(review): this is the genomic span between the two coordinates; if the
# annotation uses 1-based inclusive coordinates it is one base short, and it
# is not an exonic/transcript length — confirm this is the intended length.
gene_lengths = end_positions - start_positions

# Save the gene length table (no positional index, 'Gene' is the first column)
gene_length_df = pd.DataFrame({
    'Gene': genes,
    'Length (bp)': gene_lengths
})
gene_length_csv_path = 'meso_gene_lengths.csv'
gene_length_df.to_csv(gene_length_csv_path, index=False)

# Save the labelled count matrix (rows = genes, columns = cell IDs)
count_df = pd.DataFrame(count_matrix, index=gene_names, columns=cell_barcodes)
count_csv_path = 'meso_count_matrix.csv'
count_df.to_csv(count_csv_path)

# Step 3: Calculate and Save FPKM Values
# Reload both tables from the CSV files written above
count_matrix = pd.read_csv('meso_count_matrix.csv', index_col=0)
gene_lengths = pd.read_csv('meso_gene_lengths.csv', index_col='Gene')

# Keep only the genes present in both tables, in the same order
common_genes = count_matrix.index.intersection(gene_lengths.index)
count_matrix = count_matrix.loc[common_genes]
gene_lengths = gene_lengths.loc[common_genes]

# Convert gene lengths from base pairs (bp) to kilobases (Kb)
gene_lengths_kb = gene_lengths['Length (bp)'] / 1000

# A zero length (Start == End) would make the division below produce inf and a
# negative length would flip the sign; mask those (and missing lengths) to NaN
# for the calculation. gene_lengths_kb itself is left unchanged for the
# diagnostic print at the end of the cell.
valid_lengths_kb = gene_lengths_kb.where(gene_lengths_kb > 0)
n_invalid_lengths = int(valid_lengths_kb.isna().sum())
if n_invalid_lengths:
    print(f"Warning: {n_invalid_lengths} gene(s) have a non-positive or missing length; "
          "their FPKM values are set to NaN")

# Per-sample library size: column sums of the counts over the retained genes
total_mapped_reads = count_matrix.sum(axis=0)

# FPKM Calculation
fpkm_matrix = count_matrix.div(valid_lengths_kb, axis=0)  # Normalize by gene length in Kb
fpkm_matrix = fpkm_matrix.div(total_mapped_reads / 1e6, axis=1)  # Normalize by total reads in millions

# Save the FPKM matrix to a CSV file
fpkm_matrix.to_csv('new_meso_fpkm_matrix.csv')

# Outputs for checking
print("Gene lengths (in Kb, first few rows):")
print(gene_lengths_kb.head())

print("Total mapped reads (first few samples):")
print(total_mapped_reads.head())

In [None]:
import loompy
import pandas as pd

# End-to-end pipeline: loom file -> gene length CSV + count matrix CSV -> FPKM CSV.
# NOTE(review): this cell repeats the pipeline of the previous cell on the same
# input file and the same output files; consider keeping only one of the two.
loom_file_path = 'all_lib.loom'

# Steps 1 and 2: a single connection supplies both the row/column attributes
# and the matrix, so the loom file is not opened twice.
with loompy.connect(loom_file_path) as ds:
    genes = ds.ra['Gene']
    start_positions = ds.ra['Start']
    end_positions = ds.ra['End']
    cell_barcodes = ds.ca['CellID']
    count_matrix = ds[:, :]

# Rows of the count matrix carry the same 'Gene' labels
gene_names = genes

# Gene length in base pairs, taken as End - Start.
# NOTE(review): this is the genomic span between the two coordinates; if the
# annotation uses 1-based inclusive coordinates it is one base short, and it
# is not an exonic/transcript length — confirm this is the intended length.
gene_lengths = end_positions - start_positions

# Write the gene length table ('Gene' is the first column, no positional index)
gene_length_df = pd.DataFrame({
    'Gene': genes,
    'Length (bp)': gene_lengths
})
gene_length_csv_path = 'meso_gene_lengths.csv'
gene_length_df.to_csv(gene_length_csv_path, index=False)

# Write the labelled count matrix (rows = genes, columns = cell IDs)
count_df = pd.DataFrame(count_matrix, index=gene_names, columns=cell_barcodes)
count_csv_path = 'meso_count_matrix.csv'
count_df.to_csv(count_csv_path)

# Step 3: Calculate and Save FPKM Values
# Reload both tables from the CSV files written above
count_matrix = pd.read_csv('meso_count_matrix.csv', index_col=0)
gene_lengths = pd.read_csv('meso_gene_lengths.csv', index_col='Gene')

# Restrict both tables to their shared genes so rows line up one-to-one
common_genes = count_matrix.index.intersection(gene_lengths.index)
count_matrix = count_matrix.loc[common_genes]
gene_lengths = gene_lengths.loc[common_genes]

# Convert gene lengths from base pairs (bp) to kilobases (Kb)
gene_lengths_kb = gene_lengths['Length (bp)'] / 1000

# Zero lengths (Start == End) would yield inf and negative lengths would flip
# the sign of the result, so non-positive and missing lengths are masked to
# NaN for the calculation only; gene_lengths_kb is printed unmodified below.
valid_lengths_kb = gene_lengths_kb.where(gene_lengths_kb > 0)
n_invalid_lengths = int(valid_lengths_kb.isna().sum())
if n_invalid_lengths:
    print(f"Warning: {n_invalid_lengths} gene(s) have a non-positive or missing length; "
          "their FPKM values are set to NaN")

# Per-sample library size: column sums of the counts over the retained genes
total_mapped_reads = count_matrix.sum(axis=0)

# FPKM Calculation
fpkm_matrix = count_matrix.div(valid_lengths_kb, axis=0)  # Normalize by gene length in Kb
fpkm_matrix = fpkm_matrix.div(total_mapped_reads / 1e6, axis=1)  # Normalize by total reads in millions

# Save the FPKM matrix to a CSV file
fpkm_matrix.to_csv('new_meso_fpkm_matrix.csv')

# Outputs for checking
print("Gene lengths (in Kb, first few rows):")
print(gene_lengths_kb.head())

print("Total mapped reads (first few samples):")
print(total_mapped_reads.head())