In [None]:
### Comparing NMD-regulator and COMPASS component expression vs. %paralog upregulation for Mellis et al. 2023
### Created by Madeline E Melzer on 20231121, Last edit by Madeline E Melzer on 20231122

In [1]:
import os
import pandas as pd
import numpy as np

In [13]:
# Common root of the grn_nitc RNA-seq analysis; the three directories below all hang off it.
rnaseq_root = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/transcriptionalCompensation/grn_nitc/rnaseq/"

# One sub-folder per dataset, each holding that dataset's DESeq .csv
deseq_files = rnaseq_root + "deseq_files/"
# Location of combinedPctUpregs.csv (the %paralog upregulation table)
lengthDir = rnaseq_root + "supp_analyses/length/data/"
# Where mechanismComponents_baseMeans.csv is written
outputDir = rnaseq_root + "supp_analyses/nitc_components/data/"

# Fix the global NumPy seed for the session
np.random.seed(23)

In [16]:
# these genes are 12 mechanistically-significant genes for the NITC pathway. The first 8 are COMPASS components, the last 4 are NMD regulators. Case-sensitive for species.
genes_of_interest = ["SETD1A", "SETD1B", "ASH2L", "RBBP5", "CXXC1", "WDR82", "WDR5", "DPY30", "UPF1", "UPF2", "UPF3A", "UPF3B", 
                     "Setd1a", "Setd1b", "Ash2l", "Rbbp5", "Cxxc1", "Wdr82", "Wdr5", "Dpy30", "Upf1", "Upf2", "Upf3a", "Upf3b"]

# Datasets whose DESeq table is too large of a file to be put on github; they are read
# from their own folder under this root instead of from deseq_files.
large_dataset_root = "/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/transcriptionalCompensation"
large_datasets = ["GSE151825", "GSE145653-1"]


def load_component_rows(csv_path, dataset):
    """Read one DESeq results CSV and keep only the rows for genes_of_interest.

    Parameters
    ----------
    csv_path : str
        Path to a CSV that has a 'gene_name' column.
    dataset : str
        Label written into a new 'dataset' column on every returned row.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows whose 'gene_name' is in genes_of_interest, with 'dataset' added.
    """
    deseq_results = pd.read_csv(csv_path)
    component_rows = deseq_results[deseq_results['gene_name'].isin(genes_of_interest)].copy()
    component_rows['dataset'] = dataset
    return component_rows


dfs = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# final_df (and of the CSV written from it) the same from run to run.
for folder in sorted(os.listdir(deseq_files)):
    folder_path = os.path.join(deseq_files, folder)
    if not os.path.isdir(folder_path):
        continue
    # Assuming there is only one .csv file per folder; if there are several, all are loaded
    # under the same dataset label.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.csv'):
            dfs.append(load_component_rows(os.path.join(folder_path, file), folder))

# adding in the large datasets, in the same order as before (GSE151825, then GSE145653-1)
for dataset in large_datasets:
    large_csv_path = os.path.join(large_dataset_root, dataset, "differentialExpression_DESeq_allTargets.csv")
    dfs.append(load_component_rows(large_csv_path, dataset))

final_df = pd.concat(dfs, ignore_index=True)

# Rename 'sampleKO' to 'ko_gene' and 'gene_name' to 'component_gene' for clarity
final_df = final_df.rename(columns={'sampleKO': 'ko_gene', 'gene_name': 'component_gene'})

  df = pd.read_csv("/Users/mem3579/Library/CloudStorage/OneDrive-NorthwesternUniversity/Arispe and Goyal Labs/transcriptionalCompensation/GSE151825/differentialExpression_DESeq_allTargets.csv")


In [17]:
# adding in the %paralog upregulation data
pctUpregs = pd.read_csv(os.path.join(lengthDir, 'combinedPctUpregs.csv'))

# Keep only the columns needed downstream, and rename 'Gene' to 'ko_gene' so the
# join key has the same name as in final_df
pctUpregs_subset = (
    pctUpregs[['Gene', 'Percent Upregulated', 'p_value', 'dataset']]
    .rename(columns={'Gene': 'ko_gene'})
)

# Perform a LEFT join on the "ko_gene" and "dataset" columns: every row of final_df is kept,
# and rows with no matching (ko_gene, dataset) in pctUpregs_subset get NaN in the added columns.
components = pd.merge(final_df, pctUpregs_subset, how='left', on=['ko_gene', 'dataset'])

# Save the joined data to a new CSV file (create the output directory first if it is missing,
# otherwise to_csv fails on a fresh checkout)
os.makedirs(outputDir, exist_ok=True)
combined_file_path = os.path.join(outputDir, 'mechanismComponents_baseMeans.csv')
components.to_csv(combined_file_path, index=False)