# Intro

This script serves two functions:

1.) Given two csv files (count matrix and metadata), combine them into a single Scanpy/AnnData object.

2.) Extract all the data (count matrix and metadata) from an existing Scanpy/AnnData object to two csv files,
which can then be imported into the script "Convert_seurat_csv.R". In that R script,
the two csv files can be combined to form a single Seurat/rds object.

# Load Packages

In [1]:
# Import packages
import pandas as pd
import numpy as np
import scanpy as sc
import anndata

# First Function: CSV to Scanpy

In [2]:
# Build a function that takes in two csv files and makes a Scanpy object
def csv_to_scanpy(mtx_path, meta_path, transpose = False, umap = False, cols_to_factorize = None,
                  save_name = None):
    """
    Build a Scanpy/AnnData object (Adata) from a count matrix csv and a metadata csv.

    Both files are read with their first column as the row index. After the optional
    transpose, the count matrix index and the metadata index must match element by element.

    Inputs
    ------
    mtx_path: A string value that provides the file path and name of the count matrix.
    meta_path: A string value that provides the file path and name of the metadata.
    transpose: A boolean (True/False) value. If True, then transpose the count matrix after loading.
    umap: A boolean (True/False) value. If True, the metadata columns 'UMAP_1' and 'UMAP_2' are
          stored in Adata.obsm['X_umap'].
    cols_to_factorize: A Python list of metadata columns. If provided, these columns will become
                       categorical data. A single column name given as a string is also accepted.
    save_name: The file name to save the Adata object to disk (written with write_h5ad).
               Nothing is saved when None.

    Outputs
    -------
    Adata: The Scanpy/AnnData object that was created.

    Raises
    ------
    AssertionError: If the count matrix and the metadata do not describe the same cells.
    KeyError: If umap is True and 'UMAP_1'/'UMAP_2' are missing from the metadata, or if a
              column listed in cols_to_factorize is missing from the metadata.
    """

    # Load the count matrix, with cells as rows and genes as columns
    print('Loading count matrix...')
    mtx = pd.read_csv(mtx_path, index_col = 0)
    print('Loaded count matrix.\n')

    # If rows are actually genes, then transpose the matrix
    if transpose:
        mtx = mtx.T
        print('Transposed count matrix.\n')

    # Load the metadata
    print('Loading metadata...')
    meta = pd.read_csv(meta_path, index_col = 0)
    print('Loaded metadata.\n')

    # Validate with explicit raises rather than `assert`, so the checks still run under
    # `python -O`. AssertionError is kept so existing callers see the same exception type.
    if len(mtx) != len(meta):
        raise AssertionError("Number of cells in matrix is not equal to number of cells in metadata.")
    if not np.all(mtx.index == meta.index):
        raise AssertionError("Cells in matrix are not equal to cells in the metadata.")

    # Check the requested metadata columns before building the AnnData object, so a
    # missing column fails fast with a descriptive message.
    umap_cols = ['UMAP_1', 'UMAP_2']
    if umap:
        missing = [col for col in umap_cols if col not in meta.columns]
        if missing:
            raise KeyError("UMAP column(s) missing from metadata: {}".format(missing))

    # Normalize cols_to_factorize to a list; a bare string would otherwise be
    # iterated character by character.
    if cols_to_factorize is None:
        factor_cols = []
    elif isinstance(cols_to_factorize, str):
        factor_cols = [cols_to_factorize]
    else:
        factor_cols = list(cols_to_factorize)

    missing = [col for col in factor_cols if col not in meta.columns]
    if missing:
        raise KeyError("Column(s) to factorize missing from metadata: {}".format(missing))

    # All checks passed: create the Scanpy object (called Adata)
    Adata = anndata.AnnData(mtx)
    print('Created AnnData object.\n')

    # Add metadata, one column at a time
    for col in meta.columns:
        Adata.obs[col] = meta[col]

    # Add UMAP coordinates, if specified
    if umap:
        Adata.obsm['X_umap'] = np.array(Adata.obs[umap_cols])
        print('Added UMAP coordinates.\n')

    # Turn specified columns into categorical data
    for col in factor_cols:
        Adata.obs[col] = Adata.obs[col].astype('category')

    # Save the Adata object to disk, if specified
    if save_name is not None:
        Adata.write_h5ad(save_name)
        print('Saved Adata to {}'.format(save_name))

    return Adata

In [None]:
# Example of how to use csv_to_scanpy

# Define variables (placeholder paths; replace with real file locations)
mtx_path = 'path/to/mtx.csv'
meta_path = 'path/to/meta.csv'
cols_to_factorize = ['Sample', 'Matching', 'Seurat_clusters']
# Output file for the AnnData object, written with write_h5ad
save_name = 'path/to/Adata.h5ad'

# Run function: build the Adata object, attach UMAP coordinates from the
# metadata, make the listed columns categorical, and save the result to disk
Adata = csv_to_scanpy(mtx_path  = mtx_path,
                      meta_path = meta_path,
                      transpose = False,
                      umap = True,
                      cols_to_factorize = cols_to_factorize,
                      save_name = save_name)

# Second Function: Scanpy to CSV

In [3]:
# Build a function that takes a Scanpy object and outputs two csv files
def scanpy_to_csv(Adata, mtx_path, meta_path, transpose = False, umap = False, return_matrices = False):
    """
    Write the count matrix and the metadata of a Scanpy/AnnData object to two csv files.

    The count matrix is built from Adata.X with Adata.obs_names as rows and Adata.var_names
    as columns. The metadata is a copy of Adata.obs; the Adata object itself is not modified.

    Inputs
    ------
    Adata: The Scanpy/AnnData object of interest.
    mtx_path: A string value of the path/file name to store the count matrix.
    meta_path: A string value of the path/file name to store the metadata.
    transpose: A boolean (True/False) value. If True, the count matrix will be transposed before saving.
    umap: A boolean (True/False) value. If True, the first two columns of Adata.obsm['X_umap']
          are added to the metadata as 'UMAP_1' and 'UMAP_2'.
    return_matrices: A boolean (True/False) value. If True, return both count matrix and metadata.

    Outputs
    -------
    A tuple (mtx, meta) of the written count matrix and metadata if return_matrices is True,
    otherwise None.
    """

    # Adata.X may be stored as a sparse matrix, which the DataFrame constructor does not
    # accept as a 2D table; anything exposing toarray() is converted to a dense array first.
    counts = Adata.X
    if hasattr(counts, 'toarray'):
        counts = counts.toarray()

    # Create count matrix as pandas dataframe
    mtx = pd.DataFrame(data = counts, index = Adata.obs_names, columns = Adata.var_names)

    # Transpose count matrix, if specified
    if transpose:
        mtx = mtx.T
        print('Transposed count matrix.\n')

    # Work on a copy so the caller's Adata.obs does not gain UMAP columns as a side effect
    meta = Adata.obs.copy()

    # Add UMAP to metadata, if specified
    if umap:
        coords = np.asarray(Adata.obsm['X_umap'])
        meta['UMAP_1'] = coords[:, 0]
        meta['UMAP_2'] = coords[:, 1]
        print('Added UMAP coordinates to metadata.\n')

    # Write out the data
    print('Saving count matrix...')
    mtx.to_csv(mtx_path)
    print('Saved count matrix to {}\n'.format(mtx_path))

    # Write out the metadata
    print('Saving metadata...')
    meta.to_csv(meta_path)
    print('Saved metadata to {}\n'.format(meta_path))

    # Return matrices if specified
    if return_matrices:
        print('Returning count matrix and metadata.')
        return mtx, meta

    # Otherwise, nothing to return
    print('Return nothing.')
    return None

In [None]:
# Usage example for scanpy_to_csv: export an existing Adata object to two csv files

# Output locations (placeholders) for the count matrix and the metadata
mtx_path, meta_path = 'path/to/mtx.csv', 'path/to/meta.csv'

# Write both files and also get the count matrix and metadata back
mtx, meta = scanpy_to_csv(
    Adata,
    mtx_path = mtx_path,
    meta_path = meta_path,
    transpose = False,
    return_matrices = True,
)

# Pipeline: Create Scanpy Objects for Analysis

### Define Samples

In [4]:
# Names of all samples to process; each name is used to build the input and output file paths
samples = [
    # M-prefixed samples
    'M1_Blood', 'M2_Blood', 'M3_Blood', 'M4_Blood', 'M5_Blood',
    'M1_Tumor', 'M2_Tumor', 'M3_Tumor', 'M4_Tumor', 'M5_Tumor',
    # Integrated mouse objects
    'MouseIntegratedBlood',
    'MouseIntegratedTumor',
    # K-prefixed samples
    'K409_Blood', 'K409_Tumor', 'K409_LN',
    'K411_Blood', 'K411_Tumor', 'K411_Blood_Longitudinal',
    'K468_Blood', 'K468_Tumor', 'K468_Blood_Longitudinal',
    'K484_Blood', 'K484_Tumor',
    # Integrated human objects
    'HumanIntegratedBlood',
    'HumanIntegratedTumor',
    'HumanIntegratedBlood_wLongitudinal',
]

### Loop Over Samples, Make Scanpy Objects

In [5]:
# Candidate columns to turn into categorical data (same for every sample, so defined once).
# NOTE(review): this list is not used: the call below passes cols_to_factorize = None,
# so no column is converted. Confirm whether that is intentional before passing it.
cols_to_factorize = ['Sample', 'Matching', 'Seurat_clusters', 'TCR', 'Matching_pre_filter']

# Build and save one Scanpy object per sample
for sample in samples:

    print(sample)

    # Input files for this sample: count matrix and metadata
    mtx_path = 'CountMatrices/' + sample + '_mtx.csv'
    meta_path = 'Metadata/' + sample + '_meta.csv'

    # Output file for the resulting Scanpy object
    save_name = 'Scanpyobjects/' + sample + '.h5ad'

    # Create the Adata object with UMAP coordinates and write it to save_name
    Adata = csv_to_scanpy(mtx_path  = mtx_path,
                          meta_path = meta_path,
                          transpose = False,
                          umap = True,
                          cols_to_factorize = None,
                          save_name = save_name)

M1_Blood
Loading count matrix...
Loaded count matrix.

Loading metadata...
Loaded metadata.

Created AnnData object.

Added UMAP coordinates.

Saved Adata to Scanpyobjects/M1_Blood.h5ad
M2_Blood
Loading count matrix...
Loaded count matrix.

Loading metadata...
Loaded metadata.

Created AnnData object.

Added UMAP coordinates.

Saved Adata to Scanpyobjects/M2_Blood.h5ad
M3_Blood
Loading count matrix...
Loaded count matrix.

Loading metadata...
Loaded metadata.

Created AnnData object.

Added UMAP coordinates.

Saved Adata to Scanpyobjects/M3_Blood.h5ad
M4_Blood
Loading count matrix...
Loaded count matrix.

Loading metadata...
Loaded metadata.

Created AnnData object.

Added UMAP coordinates.

Saved Adata to Scanpyobjects/M4_Blood.h5ad
M5_Blood
Loading count matrix...
Loaded count matrix.

Loading metadata...
Loaded metadata.

Created AnnData object.

Added UMAP coordinates.

Saved Adata to Scanpyobjects/M5_Blood.h5ad
M1_Tumor
Loading count matrix...
Loaded count matrix.

Loading metadat