In [1]:
import pandas as pd

def preprocess_metabolites(file_path, sep='\t', index_col='Compound'):
    """
    Load a metabolite table and orient it so that each row is one sample.

    The file is read with ``pandas.read_csv``. The column named ``index_col``
    is used as the row index and the table is then transposed, so the former
    column headers (the Sample IDs) become the row index and the values of
    ``index_col`` become the column labels.

    Parameters
    ----------
    file_path : str or path-like
        Location of the delimited text file.
    sep : str, optional
        Field delimiter handed to ``pandas.read_csv``. Defaults to a tab,
        which matches the tab-separated .txt export this notebook reads.
    index_col : str, optional
        Name of the column holding the compound names. Defaults to
        'Compound'.

    Returns
    -------
    pandas.DataFrame
        The transposed table. Its column axis carries the name ``index_col``.
        To turn the Sample IDs into a regular column, call ``.reset_index()``
        on the result and rename the new 'index' column.

    Raises
    ------
    KeyError
        If ``index_col`` is not one of the columns in the file.
    """
    raw = pd.read_csv(file_path, sep=sep)

    # Fail early with a readable message; a wrong delimiter typically shows up
    # here as a single merged column.
    if index_col not in raw.columns:
        raise KeyError(
            f"Column {index_col!r} not found in {file_path!r}; "
            f"first columns seen: {list(raw.columns[:5])}"
        )

    # set_index returns a new frame (nothing is mutated in place); .T then
    # swaps the axes so that Sample IDs end up in the row index.
    return raw.set_index(index_col).T

# Load the metabolite table. The location is an absolute path on the
# author's machine, so it has to be adjusted when running elsewhere.
metabolites_path = '/home/pintokf/Projects/Microbium/Mouses/mouses_2_data/meatabolites.txt'
df = preprocess_metabolites(metabolites_path)

# Preview the first 5 rows to confirm that rows are now Sample IDs
# and columns are compounds.
print(df.head())

Compound   303.2925@9.854003  LysoPC(18:0)  \
18-1_5-20       9.201675e+05  2.482322e+07   
18-2_5-20       4.924285e+07  1.985832e+07   
20-0_5-20       5.563117e+07  1.198559e+07   
20-1_5-20       6.809295e+05  1.270308e+07   
21-1_5-20       7.940316e+06  5.943284e+06   

Compound   1-Palmitoyllysophosphatidylcholine  102.0467@1.030999  \
18-1_5-20                         19210210.61       1.281915e+07   
18-2_5-20                         21473648.52       1.972746e+07   
20-0_5-20                         18643611.25       9.666800e+06   
20-1_5-20                         16540001.16       4.491578e+06   
21-1_5-20                         14601952.94       8.293789e+06   

Compound   331.3236@10.99799  378.2891@7.2789965  \
18-1_5-20       1.321896e+06        6.886311e+06   
18-2_5-20       2.223548e+07        1.014112e+07   
20-0_5-20       2.757008e+07        7.622998e+06   
20-1_5-20       1.161169e+06        6.392904e+06   
21-1_5-20       5.461554e+06        8.205137e+06   

C

In [2]:
import numpy as np

# Step 1 - relative abundance
# Total signal per sample: one sum for every row.
row_sums = df.sum(axis='columns')

# Scale each sample by its own total, so a row with a non-zero total adds up to 1.
df_normalized = df.divide(row_sums, axis='index')

# Step 2 - log transform
# Tiny offset so that a relative abundance of exactly 0 does not evaluate log(0).
epsilon = 1e-10

# Natural logarithm (base e) of the offset relative abundances.
df_log = np.log(df_normalized.add(epsilon))

# Preview the transformed table.
print("Transformed data (Log of Relative Abundance):")
print(df_log.head())

# Sanity check: per-sample totals before the log, expected to be very close to 1.0.
print(df_normalized.sum(axis='columns').head())

Transformed data (Log of Relative Abundance):
Compound   303.2925@9.854003  LysoPC(18:0)  \
18-1_5-20          -7.787398     -4.492419   
18-2_5-20          -3.996977     -4.905119   
20-0_5-20          -4.007231     -5.542270   
20-1_5-20          -8.150380     -5.224239   
21-1_5-20          -5.721256     -6.010948   

Compound   1-Palmitoyllysophosphatidylcholine  102.0467@1.030999  \
18-1_5-20                           -4.748757          -5.153259   
18-2_5-20                           -4.826915          -4.911730   
20-0_5-20                           -5.100471          -5.757277   
20-1_5-20                           -4.960302          -6.263879   
21-1_5-20                           -5.112054          -5.677703   

Compound   331.3236@10.99799  378.2891@7.2789965  \
18-1_5-20          -7.425132           -5.774663   
18-2_5-20          -4.792053           -5.577143   
20-0_5-20          -4.709243           -5.994805   
20-1_5-20          -7.616656           -5.910895   
21-1_5-2

In [3]:
# Standardise every metabolite (column) with z = (x - mean) / std so that all
# metabolites share one scale. mean() and std() work per column by default, and
# pandas' std() is the sample standard deviation (ddof=1).
# NOTE(review): a metabolite whose log values are constant across samples has
# std == 0 and would come out as NaN here - confirm no such column exists.
df_zscore = df_log.sub(df_log.mean(), axis='columns').div(df_log.std(), axis='columns')

# Preview the standardised table.
print("Data after Z-score normalization:")
print(df_zscore.head())

# Sanity check: column means should be practically 0 and column stds 1.
print("\nVerification (Means should be ~0, Stds should be ~1):")
print(df_zscore.mean().head())
print(df_zscore.std().head())

Data after Z-score normalization:
Compound   303.2925@9.854003  LysoPC(18:0)  \
18-1_5-20          -0.217534      2.405604   
18-2_5-20           0.873443      1.628247   
20-0_5-20           0.870492      0.428116   
20-1_5-20          -0.322009      1.027155   
21-1_5-20           0.377153     -0.454681   

Compound   1-Palmitoyllysophosphatidylcholine  102.0467@1.030999  \
18-1_5-20                            1.264986           0.747120   
18-2_5-20                            1.083734           1.149494   
20-0_5-20                            0.449349          -0.259143   
20-1_5-20                            0.774406          -1.103117   
21-1_5-20                            0.422486          -0.126576   

Compound   331.3236@10.99799  378.2891@7.2789965  \
18-1_5-20          -0.080480            0.487793   
18-2_5-20           0.623764            1.145846   
20-0_5-20           0.645913           -0.245624   
20-1_5-20          -0.131705            0.033927   
21-1_5-20           

In [10]:
# 1. Label the row index as 'SampleID'. In the printed tables 'Compound' is the
#    name of the column axis (the header row), not of the row index, so this
#    adds a row-index label rather than replacing one.
#    rename_axis returns a new frame instead of setting .index.name in place;
#    NOTE(review): the Index object may be shared with df_log / df_normalized,
#    in which case an in-place rename would relabel those frames as well.
df_zscore = df_zscore.rename_axis(index='SampleID')

# 2. Check the head again - 'SampleID' now shows at the top left, on the line
#    below the 'Compound' header.
print(df_zscore.head())

# 3. Write the standardised table to disk; index=True keeps the Sample IDs as
#    the first CSV column, labelled 'SampleID'.
df_zscore.to_csv('/home/pintokf/Projects/Microbium/Mouses/preprocess_metabolits/preprocessed_metabolites_normalized_z_score.csv', index=True)

Compound   303.2925@9.854003  LysoPC(18:0)  \
SampleID                                     
18-1_5-20          -0.217534      2.405604   
18-2_5-20           0.873443      1.628247   
20-0_5-20           0.870492      0.428116   
20-1_5-20          -0.322009      1.027155   
21-1_5-20           0.377153     -0.454681   

Compound   1-Palmitoyllysophosphatidylcholine  102.0467@1.030999  \
SampleID                                                           
18-1_5-20                            1.264986           0.747120   
18-2_5-20                            1.083734           1.149494   
20-0_5-20                            0.449349          -0.259143   
20-1_5-20                            0.774406          -1.103117   
21-1_5-20                            0.422486          -0.126576   

Compound   331.3236@10.99799  378.2891@7.2789965  \
SampleID                                           
18-1_5-20          -0.080480            0.487793   
18-2_5-20           0.623764            1.14

In [20]:
# Dimensions of the standardised table as (rows, columns); rows are Sample IDs
# and columns are compounds after the transpose done when loading the data.
df_zscore.shape

(72, 1889)

In [22]:
# True when some column is labelled exactly "ID" or "Sample" (whole-label
# match, not a substring search); False means neither label is a column.
any(label in df_zscore.columns for label in ("ID", "Sample"))

False