In [1]:
import pandas as pd



## Data Preparation

In [5]:
# Directory that holds the EukProt taxonomy table and one sub-directory per station
EUKPROT_DIR = '../data/annotation/taxonomy_eukprot'

# Load eukprot taxonomy with only necessary columns
taxonomy_cols = ['EukProt_ID', 'Name_to_Use', 'Taxogroup2_UniEuk', 'Genus_UniEuk']  # Adjust columns as necessary
eukprot_taxonomy = pd.read_table(f'{EUKPROT_DIR}/EukProt_included_data_sets.v03.2021_11_22.txt', usecols=taxonomy_cols)
# Literal replacement: underscores in the display name become spaces
eukprot_taxonomy['Name_to_Use'] = eukprot_taxonomy['Name_to_Use'].str.replace('_', ' ', regex=False)

# Column names assigned to the headerless, tab-separated .m8 annotation files
eukprot_annotation_cols = ['query_id', 'target_id', 'p_ident', 'alnlen', 'mismatch',
                           'gapopen', 'qstart', 'qend', 'tstart', 'tend', 'evalue', 'bits']


def load_eukprot_annotation(station, taxonomy):
    """Load one station's EukProt annotation table and attach taxonomy columns.

    Reads ``{EUKPROT_DIR}/{station}/eukprot_annotation.m8`` (no header row; columns
    are named from ``eukprot_annotation_cols``), shortens the identifiers and
    left-joins the taxonomy table on ``target_id`` == ``EukProt_ID``.

    Parameters
    ----------
    station : int or str
        Name of the station sub-directory under ``EUKPROT_DIR`` (e.g. ``130``).
    taxonomy : pandas.DataFrame
        Taxonomy table; must contain an ``EukProt_ID`` column.

    Returns
    -------
    pandas.DataFrame
        The annotation rows with the taxonomy columns appended. Rows whose
        ``target_id`` has no match in ``taxonomy`` are kept, with missing values
        in the taxonomy columns (left join).

    Notes
    -----
    The row count is printed before and after the merge. A larger count after
    the merge would mean that ``EukProt_ID`` is duplicated in ``taxonomy``.
    """
    annotation = pd.read_table(f'{EUKPROT_DIR}/{station}/eukprot_annotation.m8',
                               header=None, names=eukprot_annotation_cols)

    # query_id: keep only the part before the first '.'
    annotation['query_id'] = annotation['query_id'].str.split(".", n=1, expand=True)[0]
    # target_id: keep only the part before the first '_' so it can be joined on EukProt_ID
    annotation['target_id'] = annotation['target_id'].str.split("_", n=1, expand=True)[0]
    print(f'The eukprot annotation file contains {len(annotation)} rows')

    # Left join keeps every annotation row, matched or not
    merged = annotation.merge(taxonomy, left_on='target_id', right_on='EukProt_ID', how='left')
    print(f'The merged annotation and taxonomy file contains {len(merged)} rows')
    return merged


# Load and annotate eukprot hits for station 130, then station 51
eukprot_annotation_130 = load_eukprot_annotation(130, eukprot_taxonomy)
eukprot_annotation_51 = load_eukprot_annotation(51, eukprot_taxonomy)

The eukprot annotation file contains 768760 rows
The merged annotation and taxonomy file contains 768760 rows
The eukprot annotation file contains 239749 rows
The merged annotation and taxonomy file contains 239749 rows


In [None]:
# Load tpm data and reshape it to long format (one row per transcript/sample pair)
# NOTE(review): this path starts at '../../data', while the annotation files in this
# notebook are read from '../data' -- confirm which root is correct before running.

# Transcripts whose TPM summed over all columns is below this value are dropped
MIN_TOTAL_TPM = 20

tpm = (
    pd.read_csv('../../data/quantification/tpm.csv', engine='pyarrow')
    .rename(columns={'target_id': 'transcript_id'})
    .set_index('transcript_id')
    # Optional: Remove rows with row sums < MIN_TOTAL_TPM
    # (assumes every remaining column is a numeric per-sample TPM column -- TODO confirm)
    .loc[lambda wide: wide.sum(axis=1) >= MIN_TOTAL_TPM]
    .reset_index()
    # Wide -> long: former column names go to 'sample', their values to 'TPM'
    .melt(id_vars=['transcript_id'], var_name='sample', value_name='TPM')
    # Compact dtypes for the long table
    .astype({'sample': 'category', 'TPM': 'float32'})
)