In [None]:
import pandas as pd
import scipy.io
import numpy as np

# Load expression matrix
matrix = scipy.io.mmread('../data/PBMC/PBMC3K_hg19/matrix.mtx')

# Load genes
genes = pd.read_csv('../data/PBMC/PBMC3K_hg19/genes.tsv', 
                    header=None, sep='\t', names=['gene_id', 'gene_symbol'])

# Load barcodes
barcodes = pd.read_csv('../data/PBMC/PBMC3K_hg19/barcodes.tsv', 
                       header=None, sep='\t', names=['barcode'])

# Transform sparse matrix into pandas sparse dataframe
expression_matrix = pd.DataFrame.sparse.from_spmatrix(matrix)
expression_matrix.index = genes['gene_symbol']
expression_matrix.columns = barcodes['barcode']

# Check basic information about the DataFrame
print(expression_matrix.info())

In [None]:
# Calculate the mean expression of each gene
mean_expression = expression_matrix.mean(axis=1)  # Mean per row (genes)

# Calculate the total expression of each cell
total_expression_per_cell = expression_matrix.sum(axis=0)  # Sum per column (cells)

# Count non-zero values
non_zero_counts = (expression_matrix != 0).sum(axis=1)  # Count of non-zero values per gene

print(mean_expression.head())
print(total_expression_per_cell.head())
print(non_zero_counts.head())

In [None]:
dense_matrix = expression_matrix.sparse.to_dense()

# For example, to calculate the standard deviation
std_dev_per_gene = dense_matrix.std(axis=1)

print(std_dev_per_gene.head())

In [9]:
import pandas as pd

# Leer el archivo TSV
df = pd.read_csv('../data/PBMC/new/pbmc_cell_type_signature_continuous_profile.tsv', sep='\t')

# Mostrar las primeras 5 filas
print("Primeras 5 filas del archivo:")
print(df.head(10))

# Mostrar información sobre las columnas
print("\nInformación de las columnas:")
print(df.info())

# Mostrar los nombres de las columnas
print("\nNombres de las columnas:")
print(df.columns.tolist())

Primeras 5 filas del archivo:
  Gene_symbol  B_CELLS_NAIVE  B_CELLS_MEMORY  PLASMA_CELLS  T_CELLS_CD8  \
0       ABCB4     555.713449       10.744235      7.225819     4.311280   
1       ABCB9      15.603544       22.094787    653.392328    24.223723   
2       ACAP1     215.305951      321.621021     38.616872  1055.613378   
3        ACHE      15.117949       16.648847     22.123737    13.428288   
4        ACP5     605.897384     1935.201479   1120.104684   306.312519   
5      ADAM28    1943.742699     1148.120138    324.780800    22.689718   
6    ADAMDEC1     371.033593      318.478799    127.967448    44.616287   
7     ADAMTS3     146.195568      106.052311     74.339169    42.390416   
8       ADRB2     486.343816      510.081340    289.798450   899.648501   
9        AIF1      24.074298       20.321859     21.969573   742.815819   

   T_CELLS_CD4_NAIVE  T_CELLS_CD4_MEMORY_RESTING  \
0           4.605860                    7.406442   
1          35.671507                   3

In [11]:
# Abrir el archivo GMT
with open('../data/PBMC/new/pbmc_cell_type_signature_gene_sets.gmt', 'r') as file:
    # Leer las primeras 5 líneas
    for i, line in enumerate(file):
        if i < 5:
            # Dividir la línea en sus componentes
            parts = line.strip().split('\t')
            
            # Mostrar el nombre del conjunto de genes y los primeros 5 genes
            print(f"Conjunto de genes: {parts[0]}")
            print(f"Descripción: {parts[1]}")
            print(f"Primeros 5 genes: {', '.join(parts[2:7])}")
            print()
        else:
            break

# Contar el número total de conjuntos de genes
with open('../data/PBMC/new/pbmc_cell_type_signature_gene_sets.gmt', 'r') as file:
    num_sets = sum(1 for line in file)

print(f"Número total de conjuntos de genes: {num_sets}")

Conjunto de genes: B_CELLS_MEMORY
Descripción: B_CELLS_MEMORY
Primeros 5 genes: AIM2, BANK1, BLK, CD19, CD27

Conjunto de genes: B_CELLS_NAIVE
Descripción: B_CELLS_NAIVE
Primeros 5 genes: BACH2, BANK1, CCR7, CD19, CD22

Conjunto de genes: DENDRITIC_CELLS_ACTIVATED
Descripción: DENDRITIC_CELLS_ACTIVATED
Primeros 5 genes: ACP5, BCL2A1, BIRC3, CCL13, CCL17

Conjunto de genes: DENDRITIC_CELLS_RESTING
Descripción: DENDRITIC_CELLS_RESTING
Primeros 5 genes: ACP5, AIF1, C1ORF54, CCL13, CCL22

Conjunto de genes: EOSINOPHILS
Descripción: EOSINOPHILS
Primeros 5 genes: BCL2A1, C3AR1, CCL4, CCR3, CD69

Número total de conjuntos de genes: 22


In [None]:
import pandas as pd
import scipy.sparse
import joblib

# Ruta al archivo CSV
csv_path = "../data/Neuronal/M1/matrix.csv"

def read_and_process_csv_in_chunks(csv_path, chunk_size=10000, output_file='../data/Neuronal/M1/sparse_dataframe.joblib'):
    """
    Read a large CSV file in chunks, transpose, convert to sparse, and concatenate.

    Parameters:
    - csv_path: str, path to the CSV file.
    - chunk_size: int, number of rows to read in each chunk.

    Returns:
    - sparse_expression_matrix: Sparse DataFrame containing the full gene expression data.
    """
    # Initialize a list to store processed chunks
    processed_chunks = []
    genes = None
    barcodes = []
    print(f"Empezamos", flush=True)
    
    # Read the CSV in chunks
    for chunk in pd.read_csv(csv_path, sep=',', chunksize=chunk_size,  header=0, index_col=0):
        print(f"Processing chunk rows {chunk.index[0]} to {chunk.index[-1]}...", flush=True)

        # Keep genes
        if genes is None:
            genes = chunk.columns.tolist()
        
        # Keep barcodes
        barcodes.extend(chunk.index.tolist())
        
        # Transpose the chunk so rows are genes and columns are barcodes
        chunk = chunk.T
        
        # Convert to Sparse DataFrame
        sparse_chunk = scipy.sparse.csr_matrix(chunk)
        
        # Add the processed chunk to the list
        processed_chunks.append(sparse_chunk)
        
        # Free memory from the current chunk
        del chunk  # Optional, to ensure the original chunk memory is freed
        
    # Concatenate all processed chunks into a single Sparse DataFrame
    full_sparse_matrix = scipy.sparse.vstack(processed_chunks)
    sparse_expression_matrix = pd.DataFrame.sparse.from_spmatrix(
        full_sparse_matrix,
        index=genes,
        columns=barcodes
    )

    joblib.dump(sparse_expression_matrix, output_file)
    
    # Return the complete Sparse DataFrame
    return sparse_expression_matrix

def load_sparse_dataframe(input_file='../data/Neuronal/M1/sparse_dataframe.joblib'):
    return joblib.load(input_file)


# Read and process the CSV in chunks
sparse_matrix = read_and_process_csv_in_chunks(csv_path)

sparse_dataframe = load_sparse_dataframe('../data/Neuronal/M1/sparse_dataframe.joblib')

# Show column names (First 10)
print("Column Names Matrix (First 10):", sparse_matrix.columns[:10])

# Show head of the Sparse Expression Matrix
print("\nHead of the Sparse Expression Matrix:")
print(sparse_matrix.head())

# Show data types of the Sparse Expression Matrix
print("\nData Types of the Sparse Expression Matrix:")
print(sparse_matrix.dtypes)

# Show shape of the Sparse Expression Matrix
print("\nShape of the Sparse Expression Matrix:", sparse_matrix.shape)

# Mostrar información
print("Dimensiones de la matriz:", sparse_dataframe.shape)
print("\nPrimeros 5 genes:", sparse_dataframe.index[:5])
print("\nPrimeros 5 barcodes:", sparse_dataframe.columns[:5])
print("\nMuestra de la matriz:")
print(sparse_dataframe.iloc[:5, :5])

Empezamos
Processing chunk rows AAACCCAAGGATTTCC-LKTX_190129_01_A01 to ATCGCCTTCACTGGGC-LKTX_190129_01_C01...
Processing chunk rows ATCGCCTTCGAGAAAT-LKTX_190129_01_C01 to CATCCACAGGCCGCTT-LKTX_190129_01_E01...
Processing chunk rows CATCCACCAAATGCGG-LKTX_190129_01_E01 to TTTCGATCACCATTCC-LKTX_190129_01_F01...
Processing chunk rows TTTCGATCATACATCG-LKTX_190129_01_F01 to TCGCTTGGTTTACGTG-LKTX_190130_01_A01...
