In [8]:
import pandas as pd
import os

# === Paths ===
# Input: the preprocessed metabolites table (Z-scored), one row per sample.
metabolites_path = '/home/pintokf/Projects/Microbium/Mouses/preprocess_metabolits/preprocessed_metabolites_normalized_z_score.csv'

# Input: the tab-separated sample metadata file.
metadata_path = "/home/pintokf/Projects/Microbium/Mouses/mouses_2_data/metadata.txt"

# Output: location for the merged file.
# NOTE(review): nothing in this cell writes to this path.
output_path = "/home/pintokf/Projects/Microbium/Mouses/preprocess_metabolits/preprocessed_metabolites_metadata.csv"

print("--- Starting Merge Process ---")

# 1. Load Metabolites Data
try:
    df_metabolites = pd.read_csv(metabolites_path)
    print(f"Loaded Metabolites Data. Shape: {df_metabolites.shape}")

    # The sample identifier is expected in a 'SampleID' column; it is renamed
    # to 'ID' so both tables share the same merge key.
    if 'SampleID' in df_metabolites.columns:
        df_metabolites = df_metabolites.rename(columns={'SampleID': 'ID'})
    elif 'ID' not in df_metabolites.columns:
        # Fallback: treat the first column as the identifier.
        first_column = df_metabolites.columns[0]
        print(f"Warning: 'SampleID' column not found. Using first column as ID: {first_column}")
        df_metabolites = df_metabolites.rename(columns={first_column: 'ID'})

    print("Metabolites ID column verified.")

except Exception as e:
    print(f"Error loading metabolites data: {e}")
    # Re-raise instead of calling exit(1): inside IPython/Jupyter `exit` is not
    # sys.exit, so it is not a reliable way to stop the cell, and the merge
    # below must never run with a missing or partially prepared frame.
    raise

# 2. Load Metadata
try:
    # Metadata is tab-separated
    df_meta = pd.read_csv(metadata_path, sep='\t')
    print(f"Loaded Metadata. Shape: {df_meta.shape}")

    # Standardize ID column name in Metadata
    if '#SampleID' in df_meta.columns:
        df_meta = df_meta.rename(columns={'#SampleID': 'ID'})
    elif 'SampleID' in df_meta.columns:
        df_meta = df_meta.rename(columns={'SampleID': 'ID'})

    # Verify ID exists
    if 'ID' not in df_meta.columns:
        raise ValueError("Could not find ID column in metadata (expected '#SampleID' or 'SampleID')")

except Exception as e:
    print(f"Error loading metadata: {e}")
    raise

# 3. Merge Tables
# 'inner' keeps only samples present in BOTH files. validate="one_to_one"
# makes pandas raise a MergeError if an ID is duplicated on either side,
# instead of silently multiplying rows for that sample.
merged_df = pd.merge(df_metabolites, df_meta, on='ID', how='inner', validate='one_to_one')

# 4. Use the sample ID as the index of the merged table.
merged_df = merged_df.set_index('ID')

print("--- Merge Complete ---")
print(f"Original Metabolites Samples: {len(df_metabolites)}")
print(f"Original Metadata Samples: {len(df_meta)}")
print(f"Merged Samples (Intersection): {len(merged_df)}")

--- Starting Merge Process ---
Loaded Metabolites Data. Shape: (72, 1890)
Metabolites ID column verified.
Loaded Metadata. Shape: (72, 11)
--- Merge Complete ---
Original Metabolites Samples: 72
Original Metadata Samples: 72
Merged Samples (Intersection): 72


In [9]:
# Write the merged table to disk. 'ID' is the index of merged_df, so the index
# is written explicitly to keep the sample identifiers in the CSV.
output_path = (
    "/home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/"
    "preprocces_ratio_metabolites/merged_data_metabolites.csv"
)
merged_df.to_csv(path_or_buf=output_path, index=True)
print(f"✅ Saved merged file to: {output_path}")

✅ Saved merged file to: /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/preprocces_ratio_metabolites/merged_data_metabolites.csv


In [10]:
# === Split Data based on 'Death' column ===

# 1. Create the Uncensored group (Death = yes)
df_uncensored = merged_df[merged_df['Death'] == 'yes'].copy()

# 2. Create the Censored group (Death = no)
df_censored = merged_df[merged_df['Death'] == 'no'].copy()

# === Verification ===
print(f"Original Total: {len(merged_df)}")
print(f"Uncensored (Dead/Events): {len(df_uncensored)}")
print(f"Censored (Alive/No Event): {len(df_censored)}")

# Rows whose 'Death' value is neither 'yes' nor 'no' (missing, different
# casing, stray whitespace) land in neither group; report them rather than
# dropping them silently.
n_unassigned = len(merged_df) - len(df_uncensored) - len(df_censored)
if n_unassigned:
    print(f"⚠️ Warning: {n_unassigned} rows have a 'Death' value other than 'yes'/'no' and are in neither group.")

# === Save to files ===
output_dir = "/home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/preprocces_ratio_metabolites"

path_uncensored = f"{output_dir}/metabolites_uncensored.csv"
path_censored = f"{output_dir}/metabolites_censored.csv"

# 'ID' is the index of both frames here, so it has to be written with
# index=True; index=False would produce files without sample identifiers.
df_uncensored.to_csv(path_uncensored, index=True)
df_censored.to_csv(path_censored, index=True)

print(f"\n✅ Files saved successfully:")
print(f"1. {path_uncensored}")
print(f"2. {path_censored}")

Original Total: 72
Uncensored (Dead/Events): 22
Censored (Alive/No Event): 50

✅ Files saved successfully:
1. /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/preprocces_ratio_metabolites/metabolites_uncensored.csv
2. /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/preprocces_ratio_metabolites/metabolites_censored.csv


In [11]:
import pandas as pd

def process_mouse_data(df_input):
    """Return a cleaned copy of a merged sample table.

    The input frame is not modified. On the copy, the function:
      * moves an index named 'ID' back into a regular column,
      * strips surrounding whitespace from column names,
      * drops 'barcode', 'LinkerPrimerSequence', 'ReversePrimer' and 'Death'
        when present,
      * turns 'AgeMonths' values such as '4_months' into numbers
        (unparseable values become NaN),
      * rewrites 'SamplingDate' values of the form '<YY>-<Mon>' as
        '01-<Mon>-20<YY>' and parses them with the '%d-%b-%Y' format.

    If the date parsing raises, a warning is printed and the column is left
    holding the rewritten strings.
    """

    def _to_first_of_month(raw_value):
        # Values containing '-' are split on it: the first piece is used as a
        # two-digit year and the second as the month name. Anything else is
        # returned untouched.
        text = str(raw_value).strip()
        if '-' not in text:
            return raw_value
        pieces = text.split('-')
        return f"01-{pieces[1]}-20{pieces[0]}"

    cleaned = df_input.copy()

    # 'ID' must be an ordinary column so it survives CSV export with index=False.
    if cleaned.index.name == 'ID':
        cleaned = cleaned.reset_index()

    # Normalise header whitespace before any column lookups.
    cleaned.columns = cleaned.columns.str.strip()

    # Remove columns that are not needed downstream (missing ones are ignored).
    cleaned = cleaned.drop(
        columns=['barcode', 'LinkerPrimerSequence', 'ReversePrimer', 'Death'],
        errors='ignore',
    )

    # AgeMonths: strip the '_months' suffix, then convert to a numeric dtype.
    if 'AgeMonths' in cleaned.columns:
        age_text = cleaned['AgeMonths'].astype(str).str.replace('_months', '', regex=False)
        cleaned['AgeMonths'] = pd.to_numeric(age_text, errors='coerce')

    # SamplingDate: nothing more to do when the column is absent.
    if 'SamplingDate' not in cleaned.columns:
        return cleaned

    try:
        cleaned['SamplingDate'] = cleaned['SamplingDate'].apply(_to_first_of_month)
        cleaned['SamplingDate'] = pd.to_datetime(cleaned['SamplingDate'], format='%d-%b-%Y')
    except Exception as e:
        print(f"⚠️ Warning: Date conversion failed: {e}")

    return cleaned

# === Execution ===
# Clean both groups with process_mouse_data, show what is left, then overwrite
# the two CSV files with the cleaned versions.

print("--- Processing Uncensored (Dead) Data ---")
df_uncensored_clean = process_mouse_data(df_uncensored)

# 'ID' is selectable here because process_mouse_data turned it back into a column.
print("Columns remaining:", df_uncensored_clean.columns.tolist())
print("Date Preview:")
print(df_uncensored_clean.loc[:, ['ID', 'SamplingDate']].head())

print("\n--- Processing Censored (Alive) Data ---")
df_censored_clean = process_mouse_data(df_censored)
print("Columns remaining:", df_censored_clean.columns.tolist())

# === Save Files ===
# ID is a regular column now, so the positional index is left out of the CSV.
for cleaned_frame, csv_path in (
    (df_uncensored_clean, path_uncensored),
    (df_censored_clean, path_censored),
):
    cleaned_frame.to_csv(csv_path, index=False)

print("\n✅ Files saved successfully.")

--- Processing Uncensored (Dead) Data ---
Columns remaining: ['ID', '303.2925@9.854003', 'LysoPC(18:0)', '1-Palmitoyllysophosphatidylcholine', '102.0467@1.030999', '331.3236@10.99799', '378.2891@7.2789965', '2-Hydroxy-4-imino-2,5-cyclohexadienone', 'Mesobilirubinogen', '816.575@8.012004', '1-Linoleoylglycerophosphocholine', '664.5249@17.176014', 'Picolinic acid', '784.5861@9.4509945', '(4Z,7Z,10Z,13Z,16Z,19Z)-Docosahexaenoic acid ethyl ester', '3-Hydroxy-19-norpregna-1,3,5(10)-trien-20-one', '5-(3-Pyridyl)-2-hydroxytetrahydrofuran', '3?,12?-Dihydroxy-5?-chol-8(14)-en-24-oic Acid', '3beta-hydroxy-9beta-pimara-7,15-dien-19,6beta-olide', 'Linoleoyl Ethanolamide', 'Bufadienolide', 'Merphalan', 'Stigmatellin Y', '11-Oxo-beta-amyrin', 'Bufadienolide Esi+7.161006', "5'-Methylthioadenosine", '540.3309@4.365998', '257.1991@4.2709966', 'Choline', 'Stearoylethanolamide', 'Adenine', '(3b,4b,11b,14b)-11-Ethoxy-3,4-epoxy-14-hydroxy-12-cyathen-15-al 14-xyloside', 'LysoPC(15:0)', '692.5561@13.096989',

In [12]:
import pandas as pd

# === Logic to fix DeathDate specific format ===
# Input examples: "05_21_b", "06_21"
# Output: 2021-05-01 (pandas Timestamp, always the first day of the month)
def fix_death_date_format(val):
    """Convert a 'MM_YY[_suffix]' death-date value into a pandas Timestamp.

    Parameters
    ----------
    val : object
        Raw cell value. The string form is split on '_'; the first piece is
        used as the month and the second as a two-digit year (prefixed with
        '20'). Any further pieces (e.g. the 'b' in '05_21_b') are ignored.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The first day of the given month, or NaT when the value is empty,
        'nan', has fewer than two '_'-separated pieces, or cannot be parsed
        (in which case a warning is printed).
        A value that is already a Timestamp is returned unchanged.
    """
    # Already converted: return as is. Without this guard, re-applying the
    # function to a parsed column would wipe every date, because the string
    # form of a Timestamp contains no '_' and would fall through to NaT.
    if isinstance(val, pd.Timestamp):
        return val

    val_str = str(val).strip()

    if val_str == 'nan' or val_str == '':
        return pd.NaT

    try:
        # Split by underscore '_'
        parts = val_str.split('_')

        # We need at least the first two parts (Month and Year)
        if len(parts) < 2:
            return pd.NaT

        month = parts[0]       # e.g., "05"
        year_short = parts[1]  # e.g., "21"

        # Construct the string "01-MM-20YY" and parse it with an explicit format
        date_str = f"01-{month}-20{year_short}"
        return pd.to_datetime(date_str, format='%d-%m-%Y')

    except Exception as e:
        print(f"⚠️ Error parsing date: {val} -> {e}")
        return pd.NaT

# === Main Processing for Uncensored Data ===
# Parses 'DeathDate', adds a 'diff' column (days between sampling age and
# death age, using a fixed month length), drops 'DeathAgeMonths' and re-saves
# the uncensored table.
print("--- Processing Uncensored Data (Dates & Diff) ---")

# Fixed month length used to express a difference in months as days.
DAYS_PER_MONTH = 30

# Ensure the dataframe exists
if 'df_uncensored_clean' in locals():

    # 1. Apply DeathDate Fix
    if 'DeathDate' in df_uncensored_clean.columns:
        df_uncensored_clean['DeathDate'] = df_uncensored_clean['DeathDate'].apply(fix_death_date_format)

    # 2. Calculate 'diff' Column
    # Formula: (DeathAgeMonths - AgeMonths) * 30
    if 'DeathAgeMonths' in df_uncensored_clean.columns and 'AgeMonths' in df_uncensored_clean.columns:

        # Ensure columns are numeric (unparseable values become NaN)
        df_uncensored_clean['DeathAgeMonths'] = pd.to_numeric(df_uncensored_clean['DeathAgeMonths'], errors='coerce')
        df_uncensored_clean['AgeMonths'] = pd.to_numeric(df_uncensored_clean['AgeMonths'], errors='coerce')

        # Perform calculation
        df_uncensored_clean['diff'] = (
            df_uncensored_clean['DeathAgeMonths'] - df_uncensored_clean['AgeMonths']
        ) * DAYS_PER_MONTH

        print("✅ Added 'diff' column.")
        # Preview the new column
        print(df_uncensored_clean[['ID', 'AgeMonths', 'DeathAgeMonths', 'diff']].head())
    elif 'diff' in df_uncensored_clean.columns:
        # The cell was already run: 'DeathAgeMonths' has been dropped and
        # 'diff' is in place, so there is nothing to recompute.
        print("ℹ️ 'diff' column already present; skipping recalculation.")
    else:
        print("❌ Error: Missing 'DeathAgeMonths' or 'AgeMonths' columns.")

    # 3. Save to file
    # errors='ignore' keeps this step from raising KeyError when
    # 'DeathAgeMonths' is absent (missing input column, or the cell is re-run
    # after the column was already dropped).
    df_uncensored_clean = df_uncensored_clean.drop(columns=['DeathAgeMonths'], errors='ignore')
    df_uncensored_clean.to_csv(path_uncensored, index=False)
    print(f"✅ Saved updated file to: {path_uncensored}")

else:
    print("❌ Error: df_uncensored_clean is not defined. Please run previous steps.")

--- Processing Uncensored Data (Dates & Diff) ---
✅ Added 'diff' column.
          ID  AgeMonths  DeathAgeMonths  diff
0  20-1_5-20          4              12   240
1  21-1_5-20          4              15   330
2  24-0_5-20          4              17   390
3  24-2_5-20          4              17   390
4  29-2_5-20          4              15   330
✅ Saved updated file to: /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/preprocces_ratio_metabolites/metabolites_uncensored.csv


In [13]:
import pandas as pd
from pandas.tseries.offsets import DateOffset

# Adds 'diff' (days from sampling age to the censoring age) and 'DateEnd'
# (sampling date shifted forward to the censoring age) to the censored table,
# then re-saves it.
print("--- Processing Censored Data (Columns & Dates) ---")

# Age (in months) up to which censored animals are counted, and the fixed
# month length used to express the remaining months as days.
CENSOR_AGE_MONTHS = 18
DAYS_PER_MONTH = 30

# Ensure the dataframe exists
if 'df_censored_clean' in locals():

    # 1. Drop irrelevant columns for censored data
    df_censored_clean = df_censored_clean.drop(columns=['DeathDate', 'DeathAgeMonths'], errors='ignore')

    # Ensure numeric types for calculation
    if 'AgeMonths' in df_censored_clean.columns:
        df_censored_clean['AgeMonths'] = pd.to_numeric(df_censored_clean['AgeMonths'], errors='coerce')

        # 2. Calculate 'diff' column
        # Formula: (18 - AgeMonths) * 30
        df_censored_clean['diff'] = (CENSOR_AGE_MONTHS - df_censored_clean['AgeMonths']) * DAYS_PER_MONTH

        # 3. Calculate 'DateEnd' column
        # Logic: SamplingDate + (18 - AgeMonths) months
        if 'SamplingDate' in df_censored_clean.columns:
            # Ensure SamplingDate is datetime
            df_censored_clean['SamplingDate'] = pd.to_datetime(df_censored_clean['SamplingDate'])

            def add_months_to_reach_18(row):
                """Return row['SamplingDate'] shifted by the whole months left until the censoring age.

                Best-effort: any failure (e.g. a NaN age that cannot be cast
                to int) yields NaT instead of raising.
                """
                try:
                    months_to_add = int(CENSOR_AGE_MONTHS - row['AgeMonths'])
                    return row['SamplingDate'] + DateOffset(months=months_to_add)
                except Exception:
                    return pd.NaT

            # Only two columns are needed, so iterate over that narrow slice
            # instead of applying row-wise over the whole (very wide) frame.
            # A list comprehension also behaves on an empty frame, where
            # DataFrame.apply(axis=1) would hand back a DataFrame.
            date_inputs = df_censored_clean[['AgeMonths', 'SamplingDate']]
            end_dates = [add_months_to_reach_18(row) for _, row in date_inputs.iterrows()]

            # Ensure final format is datetime
            df_censored_clean['DateEnd'] = pd.to_datetime(
                pd.Series(end_dates, index=df_censored_clean.index, dtype='object')
            )

            print("✅ Added 'diff' and 'DateEnd' columns.")
            print(df_censored_clean[['ID', 'AgeMonths', 'SamplingDate', 'DateEnd', 'diff']].head())

        else:
            print("❌ Error: 'SamplingDate' column missing.")
    else:
        print("❌ Error: 'AgeMonths' column missing.")

    # 4. Save to file
    df_censored_clean.to_csv(path_censored, index=False)
    print(f"✅ Saved updated censored file to: {path_censored}")

else:
    print("❌ Error: df_censored_clean is not defined.")

--- Processing Censored Data (Columns & Dates) ---
✅ Added 'diff' and 'DateEnd' columns.
          ID  AgeMonths SamplingDate    DateEnd  diff
0  18-1_5-20          4   2020-05-01 2021-07-01   420
1  18-2_5-20          4   2020-05-01 2021-07-01   420
2  20-0_5-20          4   2020-05-01 2021-07-01   420
3  21-2_5-20          4   2020-05-01 2021-07-01   420
4  22-0_5-20          4   2020-05-01 2021-07-01   420
✅ Saved updated censored file to: /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/preprocces_ratio_metabolites/metabolites_censored.csv


In [14]:
# Rename 'SamplingDate' to 'Date' in both cleaned tables (in place), then
# overwrite the two CSV files so they carry the new column name.
for frame in (df_uncensored_clean, df_censored_clean):
    frame.rename(columns={'SamplingDate': 'Date'}, inplace=True)

for frame, csv_path in (
    (df_uncensored_clean, path_uncensored),
    (df_censored_clean, path_censored),
):
    frame.to_csv(csv_path, index=False)