# Housekeeping

## Library imports

In [7]:
import pandas as pd

import warnings

## Settings

In [8]:
warnings.filterwarnings("ignore")

## Data imports
The raw `mpa411.txt` file is tab-separated (TSV) and was manually converted to CSV, because pandas otherwise loaded it as a single column (`pd.read_csv` splits on commas by default unless `sep='\t'` is passed). The first row, which contained only "#mpa_vJun23_CHOCOPhlAnSGB_202403", was removed.

In [47]:
# Read the manually converted mpa411 table and the sample metadata from the raw-data folder.
data = pd.read_csv(filepath_or_buffer='../data/raw/MAI3004_lucki_mpa411.csv')
metadata = pd.read_csv(filepath_or_buffer='../data/raw/MAI3004_lucki_metadata_safe.csv')

# Report the (rows, columns) of both frames before validating them.
print(f"Data successfully imported. \n shape of data: {data.shape} \n Shape of metadata: {metadata.shape}")

# Guard against a badly converted CSV: both frames must have exactly the
# (rows, columns) dimensions hard-coded below.
assert (6903, 932) == data.shape, "Data has the wrong shape. Check the CSV formatting."
assert (930, 6) == metadata.shape, "Metadata has the wrong shape. Check the CSV formatting."

Data successfully imported. 
 shape of data: (6903, 932) 
 Shape of metadata: (930, 6)


## Function definitions
| Function Name | Description | Parameters |
|---------------|-------------|------------|


# Data preprocessing

## Merge data and metadata

In [None]:
# Columns of `data` whose name carries the "mpa411_" prefix; these are treated as samples below.
sample_cols = [name for name in data.columns if name.startswith("mpa411_")]

# Pivot the table so each sample becomes a row and each clade_name a column,
# then expose the sample label as a regular 'sample_id' column without its prefix.
sample_abundances = (
    data.loc[:, ['clade_name', *sample_cols]]
    .set_index('clade_name')
    .T
    .rename_axis(index='original_sample_id')  # name the index so reset_index yields a labelled column
    .reset_index()
    .rename(columns={'original_sample_id': 'sample_id'})
    .assign(sample_id=lambda frame: frame['sample_id'].str.removeprefix('mpa411_'))
)

# Metadata rows whose sample_id also appears among the transposed samples.
metadata_common = metadata.loc[
    lambda frame: frame['sample_id'].isin(sample_abundances['sample_id'])
].copy()

# Inner join on sample_id: metadata columns first, followed by one column per clade_name.
merged_samples = pd.merge(metadata_common, sample_abundances, on='sample_id', how='inner')

# Summarise how many metadata rows survived the match and the size of the result.
print(f"Metadata rows (original): {metadata.shape[0]}")
print(f"Metadata rows with matching samples: {metadata_common.shape[0]}")
print(f"Merged dataframe shape: {merged_samples.shape}")

## Missing check

## Outlier check