In [1]:
%load_ext autoreload
%autoreload 2
from bm_tools.utils import Utils

import os
import openpyxl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from bm_tools.data import Data
from bm_tools.stats import Stats

In [2]:
# Directory passed as `directory` to Data.save_dataframes() by the dataset cells below.
data_dir = "./data/datasets"

In [3]:
# Proteomics: read the filtered protein-group table (tab-separated) and reshape it into
# a samples x features matrix plus per-sample and per-feature metadata frames.
data_path = "./data_Proteome/proteomics_filtered.txt"
df = pd.read_csv(data_path, sep="\t")

# sample data: key rows by protein group, remove the three annotation columns and
# transpose so that every row is one sample and every column one protein group
sample_data = (
    df.set_index("ProteinGroups", drop = False)
    .drop(['ProteinGroups', 'Genes', 'ProteinDescriptions'], axis = 1)
    .T
)
# append '_' to every 'ifng' / 'control' occurrence in the sample names
sample_data.index = (
    sample_data.index
    .str.replace('ifng', 'ifng_')
    .str.replace('control', 'control_')
)

# sample metadata: a single 'sample' column, also used as the index
sample_metadata = pd.DataFrame({"sample" : sample_data.index}).set_index("sample", drop = False)

# feature metadata: the three annotation columns, indexed by protein group
feature_metadata = (
    df[['ProteinGroups', 'Genes', 'ProteinDescriptions']]
    .set_index("ProteinGroups", drop = False)
)

print(f"Data shapes: sample_data : {sample_data.shape}, sample_metadata : {sample_metadata.shape}, feature_metadata : {feature_metadata.shape}")

# bundle matrix and metadata in a Data instance (used to check alignment,
# apply filtering, track metadata etc.)
data = Data()
data.add_anndata(sample_data)
data.add_metadata(sample_metadata, axis = 0)   # axis 0: samples
data.add_metadata(feature_metadata, axis = 1)  # axis 1: features

# persist the frames under data_dir with the "islets_proteomics" name
data.save_dataframes(directory = data_dir, name = "islets_proteomics", pickle = True)

Data shapes: sample_data : (18, 7285), sample_metadata : (18, 1), feature_metadata : (7285, 3)


In [9]:
# Transcriptomics: read the DESeq2 expression workbook, key genes by UniProt id and
# reshape into a samples x features matrix plus per-sample and per-feature metadata.
# data_path = "./data_RNAseq/MARINE_ilots_ifng_expression_DeSeq2.xlsx" # old dataset
data_path = "./data_RNAseq/ilots_ifng_expression_DeSeq2_Marine_20220610.xlsx"
df = pd.read_excel(data_path, engine = 'openpyxl')
print(df.shape)
#TODO: figure out strange openpyxl/pandas warning for old dataset

# drop the per-cell-type summary columns (<cell type>.<statistic>); they are not used here
mean_ratio_columns = [
    f"{cell_type}.{stat}"
    for cell_type in ('alpha', 'beta', 'delta')
    for stat in ('ctrl', 'IFNg', 'log2fc', 'p.value', 'p.adj')
]
df = df.drop(columns = mean_ratio_columns)

# How the ENSEMBL -> UniProt lookup loaded below was originally produced
# (kept for provenance, not executed):
# # add uniprot id column to ENSEMBL id column
# mmus_ref_dict = Utils.map_ensembl_to_uniprot('mmusculus', True)

# # Save the mmus_ref_dict to a pickle file
# NOW = pd.Timestamp.now().strftime("%Y-%m-%d_%H_%M_%S")
# mmus_ref_dict_df = pd.DataFrame.from_dict(mmus_ref_dict, orient = 'index', columns = ['uniprot_id'])
# mmus_ref_dict_df.to_pickle(f"./{NOW}_mmus_ref_dict.pkl")

# load the stored lookup and turn its 'uniprot_id' column into a plain dict
mmus_ref_dict = pd.read_pickle("./2025-04-23_12_01_01_mmus_ref_dict.pkl")['uniprot_id'].to_dict()

# attach the UniProt id, use it as index (column kept) and discard genes without a mapping
df = (
    df.assign(uniprot_id = df['ensembl.id'].map(mmus_ref_dict))
    .set_index('uniprot_id', drop = False)
    .dropna(subset = ['uniprot_id'])
)

# collapse rows that share a UniProt id by summing them ('sum'); per the helper's
# description string columns are concatenated with a semicolon
df = Utils.deduplicate_alphanumeric_dataframe(df, 'sum')
# NOTE(review): the frames below are re-indexed from the 'uniprot_id' column after
# aggregation - confirm the helper leaves a single id in that column for merged rows.

# sample data: index by UniProt id, remove the identifier columns and transpose so that
# every row is one sample; sample names are lower-cased
sample_data = (
    df.set_index('uniprot_id', drop = False)
    .drop(['ensembl.id', 'gene.symbol', 'uniprot_id'], axis = 1)
    .T
)
sample_data.index = sample_data.index.str.lower()

# sample metadata: a single 'sample' column, also used as the index
sample_metadata = pd.DataFrame({"sample" : sample_data.index}).set_index("sample", drop = False)

# feature metadata: the identifier columns, indexed by UniProt id
feature_metadata = (
    df[['ensembl.id', 'gene.symbol', 'uniprot_id']]
    .set_index('uniprot_id', drop = False)
)

print(f"Data shapes: sample_data : {sample_data.shape}, sample_metadata : {sample_metadata.shape}, feature_metadata : {feature_metadata.shape}")

# bundle matrix and metadata in a Data instance (used to check alignment,
# apply filtering, track metadata etc.)
data = Data()
data.add_anndata(sample_data)
data.add_metadata(sample_metadata, axis = 0)   # axis 0: samples
data.add_metadata(feature_metadata, axis = 1)  # axis 1: features

# log2-transform the transcriptomics values before saving
data.log_transform(log = 2)

# persist the frames under data_dir with the "islets_transcriptomics" name
data.save_dataframes(directory = data_dir, name = "islets_transcriptomics", pickle = True)


(17230, 35)
Shape before deduplication: (13098, 21), shape after deduplication: (12993, 21), aggregated 105 rows.
Data shapes: sample_data : (18, 12993), sample_metadata : (18, 1), feature_metadata : (12993, 3)


In [5]:
# Load the phospho-site table (tab-separated) and reshape it into a runs x sites matrix
# plus per-sample and per-feature metadata frames.
data_path = "./data_Phospho/celltype_psite.txt"
df = pd.read_csv(data_path, sep="\t")

# index: key every row by its PTM_collapse_key (the column itself is kept)
sample_data = df.copy()
sample_data.set_index('PTM_collapse_key', inplace = True, drop = False)

# sample data: keep only the nine run columns (alpha, beta, delta - three runs each)
# and transpose so that every row is one run and every column one PTM_collapse_key
sample_data = sample_data[[
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_07_alpha_01_S3-G1_1_2886',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_08_alpha_02_S3-H1_1_2887',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_09_alpha_03_S3-A2_1_2888',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_10_beta_01_S3-B2_1_2889',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_11_beta_02_S3-C2_1_2890',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_12_beta_03_S3-D2_1_2891',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_13_delta_01_S3-E2_1_2883',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_14_delta_02_S3-F2_1_2884',
    '20230426_TIMS01scp_MCT_SA_whi20_AID12_DBS08_Phospho_15_delta_03_S3-G2_1_2885',]].T

# sample_metadata: starts as one row per raw run name, indexed by that name
sample_metadata = pd.DataFrame({"sample" : sample_data.index})
sample_metadata.set_index("sample", inplace = True, drop = False)
# every run in this table is labelled as 'control'
sample_metadata['treat'] = 'control'
# the 'sample' column is overwritten with the 5th-from-last '_' token of the run name
# (e.g. 'alpha'); the index still holds the full raw run name
sample_metadata['sample'] = sample_metadata.index.str.split('_').str[-5]
# replicate number: 4th-from-last token, int round-trip strips the leading zero ('01' -> '1')
sample_metadata['rep'] = sample_metadata.index.str.split('_').str[-4].astype(int).astype(str)
sample_metadata['readout'] = 'phosphoproteomics'
# composite labels: <sample>_<treat>_<readout> and <sample>_<treat>_<rep>_<readout>
sample_metadata['sample_treat_readout'] = sample_metadata['sample'] + '_' + sample_metadata['treat'] + '_' + sample_metadata['readout']
sample_metadata['sample_treat_rep_readout'] = sample_metadata['sample'] + '_' + sample_metadata['treat'] + '_' + sample_metadata['rep'] + '_' + sample_metadata['readout']

# replace data index with matched metadata sample_treat_rep_readout
# (the join aligns on the raw run names, which both frames still use as index here)
sample_data = sample_data.join(sample_metadata['sample_treat_rep_readout'])
sample_data.set_index('sample_treat_rep_readout', inplace = True, drop = True)

# replace sample metadata index with sample_treat_rep_readout
# NOTE(review): the column created here is called 'PTM_collapse_key' but it stores the
# previous index, i.e. the raw run names - confirm the column name is intended.
sample_metadata['PTM_collapse_key'] = sample_metadata.index
sample_metadata.set_index('sample_treat_rep_readout', inplace = True, drop = False)

# feature metadata: the annotation columns of the input table, indexed by PTM_collapse_key
feature_metadata = df[['R.Condition', 'PG.Genes', 'PG.Organisms', 'PG.ProteinDescriptions',
       'PG.ProteinGroups', 'PG.ProteinNames', 'PG.UniProtIds',
       'PEP.PeptidePosition', 'EG.IsDecoy', 'EG.PrecursorId',
       'EG.PTMPositions..Phospho..STY..', 'EG.ApexRT',
       'EG.PTMProbabilities..Phospho..STY..', 'EG.PTMSites..Phospho..STY..',
       'EG.PTMAssayCandidateScore', 'EG.PTMAssayProbability',
       'EG.PTMLocalizationProbabilities', 'EG.ProteinPTMLocations',
       'EG.NormalizationFactor', 'PTM_0_num', 'PTM_group', 'PTM_collapse_key',
       'PTM_collapse_key_num', 'PTM_localization', 'PTM_0_aa']].copy()
feature_metadata.set_index('PTM_collapse_key', inplace = True, drop = False)

print(f"Data shapes: sample_data : {sample_data.shape}, sample_metadata : {sample_metadata.shape}, feature_metadata : {feature_metadata.shape}")

# create Data instance to check alignment, apply filtering, track metadata etc.
data = Data()
data.add_anndata(sample_data)
data.add_metadata(sample_metadata, axis = 0)
data.add_metadata(feature_metadata, axis = 1)

# save data under data_dir with the "islets_phospho" name
data.save_dataframes(
    directory = data_dir,
    name = "islets_phospho",
    pickle = True
)

Data shapes: sample_data : (9, 7356), sample_metadata : (9, 7), feature_metadata : (7356, 25)


### Combine the proteomics and transcriptomics data into one dataset and save it

In [6]:
# Merge the saved proteomics and transcriptomics matrices into a single samples x features
# table restricted to shared features, derive sample metadata from the row names and
# write the result to disk.
data_dir = "./data/datasets"
subframes = ['alpha_control', 'beta_control', 'delta_control', 'alpha_ifng', 'beta_ifng', 'delta_ifng']

# read back the two pickled data matrices
pdata = pd.read_pickle(os.path.join(data_dir, 'islets_proteomics_data.pkl'))
tdata = pd.read_pickle(os.path.join(data_dir, "islets_transcriptomics_data.pkl"))

# experiment with aggregating the data and transporting it around with a Data object:
# tag every sample name with its readout so rows from the two sources stay distinct
pdata.index = [f"{name}_proteomics" for name in pdata.index]
tdata.index = [f"{name}_transcriptomics" for name in tdata.index]

# inner join on the feature axis keeps only features present in both datasets;
# transposing back gives one row per sample/readout
data = pdata.T.join(tdata.T, how = 'inner').T
print(data.shape)

# row names are read as <sample>_<treat>_<rep>_<readout>: split once, pick the four tokens
sample_treat_rep_readout = data.index
parts = sample_treat_rep_readout.str.split("_")
sample, treat, rep, readout = (parts.str[pos] for pos in range(4))

# sample metadata, one row per row of the combined matrix
md = pd.DataFrame(
    {
        "sample": sample,
        "treat" : treat,
        "rep": rep,
        "readout": readout,
        "sample_treat_readout": ["_".join(tokens) for tokens in zip(sample, treat, readout)],
    },
    index = data.index,
)

# wrap matrix and sample metadata in a Data object
data_obj = Data()
data_obj.add_anndata(data)
data_obj.add_metadata(md, 0)

# write the matrix and both metadata frames next to the notebook
data_obj.get_data().to_pickle("./rna_prot_dataset.pkl")
data_obj.get_metadata(0).to_pickle("./rna_prot_sample_metadata.pkl")
data_obj.get_metadata(1).to_pickle("./rna_prot_feature_metadata.pkl")


(36, 4481)
