In [17]:
#Adopting smartseq3 data (MoldEtAl, "Clonally heritable gene expression imparts a layer of diversity within cell types",bioRxiv 2022) data for ZhangMelzerEtAl 2023
#Last updated 20231205 by Madeline E Melzer


In [1]:
#import csv

#import gzip
import os
import pandas as pd

#import scipy.io
#import pyreadr

In [2]:
inputDirectory = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/umi_count/"
outputDirectory = inputDirectory
outputDirectory2 = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/read_count_matrices/formatted/"

#format_umi_count_matrices(inputDirectory, outputDirectory)
expand_rows_based_on_count(outputDirectory, outputDirectory)

#append_csv_files(outputDirectory, os.path.join(outputDirectory2, "all_brains_read_count_matrix.csv"))



NameError: name 'expand_rows_based_on_count' is not defined

In [14]:
#filtering barcodes with "0" (deleted bases) or "-" (missing bases)

dataframe = pd.read_csv(os.path.join(outputDirectory2, "all_brains_read_count_matrix.csv"))
filtered_df = dataframe[~dataframe['barcode'].str.contains('0') & ~dataframe['barcode'].str.contains('-')]
filtered_df.to_csv(os.path.join(outputDirectory2, "all_brains_read_count_matrix_filtered.csv"), index=False)

In [25]:
# combining all brains into one file

dfs = []

for filename in os.listdir(outputDirectory2):
    if filename.startswith("brain") and filename.endswith(".csv"):
        file_path = os.path.join(outputDirectory2, filename)

        # Extract sample number from filename
        sample_number = filename.split("_")[0].replace("brain", "")

        # Read the .csv file into a DataFrame
        df = pd.read_csv(file_path)

        # Add the sample column
        df['sample'] = sample_number

        # Append to the list of DataFrames
        dfs.append(df)

# Concatenate all DataFrames together
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new .csv file
combined_output_path_script = os.path.join(outputDirectory2, "all_brains_umi_count_matrix.csv")
combined_df.to_csv(combined_output_path_script, index=False)

# Display the first few rows of the combined data for verification
combined_df.head()

Unnamed: 0,cellID,barcode,sample
0,10x52_AAACCCAGTAGCCAGA,GTGTGCAGCTTTGAAGGGTGATGTGGGGGG,3
1,10x52_AAACCCAGTAGCCAGA,GTGTGCAGCTTTGAAGGGTGATGTGGGGGG,3
2,10x52_AAACCCAGTAGCCAGA,TTGGCGGAGCGGGGGGAGGGAGGGCGATAC,3
3,10x52_AAACGCTAGCATGATA,TATTGTGCCGCCCGAGAGTAGCGTGGGGGC,3
4,10x52_AAACGCTAGCATGATA,TATTGTGCCGCCCGAGAGTAGCGTGGGGGC,3


In [2]:
def format_umi_count_matrices(input_folder, output_folder):
    """
    Transform UMI count matrices from all CSV files in a given folder and save the transformed data to new CSV files in the output folder.

    Parameters:
    - input_folder: Path to the input folder containing CSV files.
    - output_folder: Path where the transformed CSV files will be saved.

    Returns:
    - None
    """

    # Ensure the output directory exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate over each file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".csv"):
            # Construct full paths for input and output files
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename.replace(".csv", "_formatted.csv"))

            # Load the CSV file into a DataFrame
            df = pd.read_csv(input_path, index_col=0)

            # Melt the DataFrame to reshape it
            melted_df = df.reset_index().melt(id_vars='index', var_name='cellID', value_name='count')

            # Rename columns and reorder them
            melted_df = melted_df.rename(columns={'index': 'barcode'})
            melted_df = melted_df[['cellID', 'barcode', 'count']]

            # Remove rows where count is 0
            filtered_df = melted_df[melted_df['count'] != 0].reset_index(drop=True)

            # Save the filtered data to the specified output path
            filtered_df.to_csv(output_path, index=False)

In [11]:
def append_csv_files(input_folder, output_path):
    """
    Append all .csv files in a given folder into one .csv file.

    Parameters:
    - input_folder: Path to the input folder containing the .csv files.
    - output_path: Path where the combined .csv file will be saved.

    Returns:
    - None
    """

    # List to hold individual DataFrames
    dfs = []

    # Iterate over each file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith("_formatted.csv"):
            file_path = os.path.join(input_folder, filename)

            # Read the .csv file into a DataFrame and append to the list
            df = pd.read_csv(file_path)
            # Extract the sample name from the filename
            sample_name = filename.split('_')[3]
            df['sample'] = sample_name
            dfs.append(df)

    # Concatenate all DataFrames together
    combined_df = pd.concat(dfs, ignore_index=True)

    # Save the combined DataFrame to the specified output path
    combined_df.to_csv(output_path, index=False)

In [7]:
def expand_rows_based_on_count(input_folder, output_folder):
    """
    For each .csv file in the input folder ending with "_formatted.csv", duplicate rows based on the "count" value
    and then drop the "count" column. The processed data is saved to the specified output folder with the same filename.

    Parameters:
    - input_folder: Path to the folder containing the .csv files.
    - output_folder: Path to the folder where the processed files will be saved.

    Returns:
    - None
    """

    # Ensure the output directory exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate over each file in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith("_formatted.csv"):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)

            # Read the .csv file into a DataFrame
            df = pd.read_csv(input_path)

            # Duplicate rows based on the "count" column
            df = df.loc[df.index.repeat(df['count'])]

            # Drop the "count" column
            df = df.drop(columns=['count']).reset_index(drop=True)

            # Save the processed DataFrame to the specified output path
            df.to_csv(output_path, index=False)


In [6]:
# mapping cellID for the UMI count file using the cellID in the read count file

read_count_matrix = pd.read_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/read_count_matrices/formatted/all_brains_read_count_matrix_filtered.csv")
umi_count_matrix = pd.read_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/umi_count/lineageBC_results_UMIonly_formatted.csv")

# Create a mapping from cellID to sample
cellID_to_sample_mapping = read_count_matrix[['cellID', 'sample']].drop_duplicates()

# Merge the mapping with the lineageBC_df dataframe
merged_df = umi_count_matrix.merge(cellID_to_sample_mapping, on='cellID', how='left')
merged_df.head()

# Filter merged_df to exclude rows where 'sample' is missing
merged_df = merged_df[merged_df['sample'].notnull()]


merged_df.to_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/umi_count/lineageBC_results_UMIonly_formatted_mapped.csv", index=False)

In [7]:
# checking that all cellIDs from umi_count_matrix are present in merged_df

# Get unique cellIDs in umi_count_matrix and merged_df
umi_cellIDs = set(umi_count_matrix['cellID'].unique())
merged_cellIDs = set(merged_df['cellID'].unique())

# Check if all cellIDs in umi_count_matrix are present in merged_df
if umi_cellIDs.issubset(merged_cellIDs):
    print("All cellIDs in umi_count_matrix are present in merged_df.")
else:
    missing_cellIDs = umi_cellIDs - merged_cellIDs
    print(f"There are {len(missing_cellIDs)} cellIDs in umi_count_matrix that are not present in merged_df.")


# Create a subset of merged_df where 'sample' is missing
missing_samples_df = merged_df[merged_df['sample'].isnull()]

# Count the number of unique cellIDs in this subset
num_unique_missing_samples = missing_samples_df['cellID'].nunique()


print(f"There are {num_unique_missing_samples} unique cellIDs without a sample.")


There are 158 cellIDs in umi_count_matrix that are not present in merged_df.
There are 0 unique cellIDs without a sample.


In [8]:
brain1_only = merged_df[merged_df['sample'] == 'brain1']
brain2_only = merged_df[merged_df['sample'] == 'brain2']

brain1_only.to_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/umi_count/brain1_only.csv", index=False)
brain2_only.to_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/ZhangMelzerEtAl/data/CJ/umi_count/brain2_only.csv", index=False)