In [7]:
import pandas as pd
import os

# === Paths ===
# Input 1: The Z-score test file from LOCATE
z_test_path = '/home/pintokf/Projects/Microbium/Mouses/Locate_model/Whole_data/locate_Z_test_level_6.csv'
# Input 2: The Z-score train file from LOCATE
z_train_path = '/home/pintokf/Projects/Microbium/Mouses/Locate_model/Whole_data/locate_Z_train_level_6.csv'

# Input 3: The metadata file
metadata_path = "/home/pintokf/Projects/Microbium/Mouses/mouses_2_data/metadata_ok173_time_series_all.txt"

# Output: The final merged file containing Z-features and Metadata
output_path = "/home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate/merged_Z_level6_metadata.csv"

print("--- Starting Merge Process ---")

# 1. Load and Combine Z Data (Train + Test)
try:
    print(f"Loading Z-Train from: {z_train_path}")
    df_train = pd.read_csv(z_train_path)

    print(f"Loading Z-Test from: {z_test_path}")
    df_test = pd.read_csv(z_test_path)

    # Stack train and test rows into one frame with a fresh 0..n-1 index.
    # NOTE(review): assumes both files share the same columns; concat would
    # otherwise fill the non-shared columns with NaN.
    df_z_all = pd.concat([df_train, df_test], ignore_index=True)
    print(f"Combined Z Data Shape: {df_z_all.shape}")

    # Make sure the sample identifier column is called 'ID' in the Z data
    if 'ID' in df_z_all.columns:
        print("ID column verified in Z data.")
    elif 'SampleID' in df_z_all.columns:
        print("Renaming 'SampleID' to 'ID' in Z data.")
        df_z_all.rename(columns={'SampleID': 'ID'}, inplace=True)
    else:
        # Fallback: assuming first column is ID if not named explicitly
        print(f"Warning: 'ID' column not found. Renaming first column '{df_z_all.columns[0]}' to 'ID'.")
        df_z_all.rename(columns={df_z_all.columns[0]: 'ID'}, inplace=True)

except Exception as e:
    print(f"Error loading Z data: {e}")
    # Re-raise instead of calling exit(): in an IPython/Jupyter kernel `exit`
    # requests kernel shutdown, which throws away the notebook state and hides
    # the traceback. Re-raising stops the cell and keeps the kernel alive.
    raise

# 2. Load Metadata
try:
    print(f"Loading Metadata from: {metadata_path}")
    # Metadata is read as a tab-separated table
    df_meta = pd.read_csv(metadata_path, sep='\t')
    print(f"Loaded Metadata. Shape: {df_meta.shape}")

    # Standardize ID column name in Metadata
    if '#SampleID' in df_meta.columns:
        df_meta.rename(columns={'#SampleID': 'ID'}, inplace=True)
    elif 'SampleID' in df_meta.columns:
        df_meta.rename(columns={'SampleID': 'ID'}, inplace=True)

    # Without an ID column the merge below cannot work, so fail here
    if 'ID' not in df_meta.columns:
        raise ValueError("Could not find ID column in metadata (expected '#SampleID' or 'SampleID')")

except Exception as e:
    print(f"Error loading metadata: {e}")
    # Same reasoning as above: surface the error, do not shut the kernel down.
    raise

# 3. Merge Tables
# We use 'inner' merge to keep only samples that exist in BOTH (Z-features and Metadata)
print("Merging Z features with Metadata...")
merged_df = pd.merge(df_z_all, df_meta, on='ID', how='inner')

# The inner join silently discards Z samples that have no metadata row;
# report them so a shrinking sample count is never a surprise downstream.
n_unmatched = int((~df_z_all['ID'].isin(df_meta['ID'])).sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} Z sample(s) have no matching metadata ID and were dropped by the merge.")

# 4. Save
merged_df.to_csv(output_path, index=False)

print("--- Merge Complete ---")
print(f"Combined Z Samples: {len(df_z_all)}")
print(f"Metadata Samples: {len(df_meta)}")
print(f"Final Merged Samples (Intersection): {len(merged_df)}")
print(f"✅ Saved merged file to: {output_path}")

--- Starting Merge Process ---
Loading Z-Train from: /home/pintokf/Projects/Microbium/Mouses/Locate_model/Whole_data/locate_Z_train_level_6.csv
Loading Z-Test from: /home/pintokf/Projects/Microbium/Mouses/Locate_model/Whole_data/locate_Z_test_level_6.csv
Combined Z Data Shape: (72, 11)
ID column verified in Z data.
Loading Metadata from: /home/pintokf/Projects/Microbium/Mouses/mouses_2_data/metadata_ok173_time_series_all.txt
Loaded Metadata. Shape: (170, 18)
Merging Z features with Metadata...
--- Merge Complete ---
Combined Z Samples: 72
Metadata Samples: 170
Final Merged Samples (Intersection): 72
✅ Saved merged file to: /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate/merged_Z_level6_metadata.csv


In [8]:
# === Split Data based on 'Death' column ===
# Rows are assigned by exact string match on 'death'; a row whose value is
# neither 'yes' nor 'no' ends up in neither group.

# Uncensored group: the event (death) was observed
df_uncensored = merged_df.loc[merged_df['death'].eq('yes')].copy()

# Censored group: no event observed
df_censored = merged_df.loc[merged_df['death'].eq('no')].copy()

# === Verification ===
print(f"Original Total: {merged_df.shape[0]}")
print(f"Uncensored (Dead/Events): {df_uncensored.shape[0]}")
print(f"Censored (Alive/No Event): {df_censored.shape[0]}")

# === Save to files ===
output_dir = "/home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate"

# Destination files, one per group (later cells overwrite these same paths)
path_uncensored = f"{output_dir}/locate_uncensored_level_6.csv"
path_censored = f"{output_dir}/locate_censored_level_6.csv"

df_uncensored.to_csv(path_uncensored, index=False)
df_censored.to_csv(path_censored, index=False)

print(f"\n✅ Files saved successfully:")
print(f"1. {path_uncensored}")
print(f"2. {path_censored}")

Original Total: 72
Uncensored (Dead/Events): 22
Censored (Alive/No Event): 50

✅ Files saved successfully:
1. /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate/locate_uncensored_level_6.csv
2. /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate/locate_censored_level_6.csv


In [9]:
import pandas as pd
import numpy as np

def process_mouse_data(df_input: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a merged Z-feature / metadata frame.

    The input frame is not modified. Every step is skipped when the column it
    needs is absent. Steps, in order:

    1. Strip whitespace from column names; rename 'age (weeks)' to
       'AgeMonths' and 'date_month' to 'SamplingDate'.
    2. From 'mice_name' derive 'Cage' (text before the first '-') and
       'MiceName' ('Agf' removed case-insensitively, '-m' replaced by '-').
    3. Convert the age from weeks to months as ceil(weeks / 4), stored as
       nullable 'Int64'. This is done only when the age arrived in the
       'age (weeks)' column, so a frame that already carries 'AgeMonths'
       (e.g. the output of a previous call) is not divided a second time.
    4. Turn 'SamplingDate' values of the form 'M_YY' into the datetime for the
       first day of that month (year is assumed to be 20YY); anything that
       does not parse as '%d-%m-%Y' afterwards becomes NaT.
    5. Drop the bookkeeping columns listed in ``cols_to_drop``.

    Parameters
    ----------
    df_input : pd.DataFrame
        Frame to clean.

    Returns
    -------
    pd.DataFrame
        The cleaned copy.
    """
    df = df_input.copy()

    # === 1. Standardize Column Names ===
    df.columns = df.columns.str.strip()

    # Record the unit before renaming: only an 'age (weeks)' column needs the
    # weeks -> months conversion in step 3.
    age_is_in_weeks = 'age (weeks)' in df.columns

    # rename() ignores keys that are not present, so no existence checks needed
    df = df.rename(columns={'age (weeks)': 'AgeMonths', 'date_month': 'SamplingDate'})

    # === 2. Extract 'Cage' and 'MiceName' from 'mice_name' ===
    if 'mice_name' in df.columns:
        # split('-')[0] returns the whole string when there is no '-'
        df['Cage'] = df['mice_name'].apply(lambda x: str(x).split('-')[0])

        df['MiceName'] = df['mice_name'].astype(str).str.replace('Agf', '', case=False, regex=False)
        df['MiceName'] = df['MiceName'].str.replace('-m', '-', regex=False)

        print("✅ Extracted 'Cage' and 'MiceName' from 'mice_name'")

    # === 3. Convert Weeks to Months (Using CEIL / Round UP) ===
    # 7 weeks / 4 = 1.75 -> 2, 15 weeks / 4 = 3.75 -> 4, 23 weeks / 4 = 5.75 -> 6
    if age_is_in_weeks and 'AgeMonths' in df.columns:
        age_weeks = pd.to_numeric(df['AgeMonths'], errors='coerce')
        df['AgeMonths'] = np.ceil(age_weeks / 4).astype('Int64')

    # === 4. Fix SamplingDate Format ===
    if 'SamplingDate' in df.columns:
        def fix_date_format(val):
            """Map 'M_YY' to '01-M-20YY'; return any other value unchanged."""
            val_str = str(val).strip()
            if '_' in val_str:
                parts = val_str.split('_')
                if len(parts) == 2:
                    return f"01-{parts[0]}-20{parts[1]}"
            return val

        df['SamplingDate'] = df['SamplingDate'].apply(fix_date_format)
        df['SamplingDate'] = pd.to_datetime(df['SamplingDate'], format='%d-%m-%Y', errors='coerce')

    # === 5. Drop unnecessary columns ===
    cols_to_drop = ['barcode', 'Plate', 'LinkerPrimerSequence', 'ReversePrimer',
                    'death', 'WellPosition', 'metabolomics', 'date', 'number',
                    'ignore_kit', 'group', 'death_age_week',
                    'mice_name']

    # errors='ignore': not every listed column has to exist in the input
    df = df.drop(columns=cols_to_drop, errors='ignore')

    return df

# === Execution ===
# Each stage runs only if the variable it needs is present in the current
# namespace; the cleaned frames are then written over the split files.

print("--- Processing Uncensored (Dead) Data ---")
if 'df_uncensored' in locals():
    df_uncensored_clean = process_mouse_data(df_uncensored)
    print("Preview (should be 2, 4, 6):")
    print(df_uncensored_clean.loc[:, ['ID', 'AgeMonths']].head())

print("\n--- Processing Censored (Alive) Data ---")
if 'df_censored' in locals():
    df_censored_clean = process_mouse_data(df_censored)
    print("Preview (should be 2, 4, 6):")
    print(df_censored_clean.loc[:, ['ID', 'AgeMonths']].head())

# === Save Files ===
# Both destination paths must be defined before anything is written
if {'path_uncensored', 'path_censored'}.issubset(locals()):
    df_uncensored_clean.to_csv(path_uncensored, index=False)
    df_censored_clean.to_csv(path_censored, index=False)
    print(f"\n✅ Files saved successfully.")

--- Processing Uncensored (Dead) Data ---
✅ Extracted 'Cage' and 'MiceName' from 'mice_name'
Preview (should be 2, 4, 6):
           ID  AgeMonths
4   34-0_7-20          4
8   36-2_7-20          4
10  20-1_5-20          4
15  47-1_7-20          4
17  41-1_7-20          4

--- Processing Censored (Alive) Data ---
✅ Extracted 'Cage' and 'MiceName' from 'mice_name'
Preview (should be 2, 4, 6):
          ID  AgeMonths
0  29-1_5-20          4
1  41-0_7-20          4
2  38-2_7-20          4
3  32-1_7-20          4
5  42-1_5-20          2

✅ Files saved successfully.


In [10]:
import pandas as pd

# === Logic to fix DeathDate specific format ===
# Input examples: "05_21_b", "06_21"
# Output: 01-05-2021 (datetime object)
def fix_death_date_format(val):
    """Convert a death-date token such as '05_21' or '05_21_b' to a Timestamp.

    The first two '_'-separated parts are read as month and two-digit year
    (the year is assumed to be 20YY); any further parts are ignored. The
    result is the first day of that month.

    Parameters
    ----------
    val : object
        Raw cell value. A value that is already a ``pd.Timestamp`` is returned
        unchanged, so applying the function twice does not erase dates.

    Returns
    -------
    pd.Timestamp or pd.NaT
        NaT for missing/empty input, for input with fewer than two parts, and
        for input that fails to parse (a warning is printed in that case).
    """
    # Already converted (e.g. the cell was re-run): keep the value. Stringifying
    # a Timestamp yields no '_' and would otherwise fall through to NaT.
    if isinstance(val, pd.Timestamp):
        return val

    val_str = str(val).strip()

    # str(float('nan')) is 'nan' - this is how missing cells show up here
    if val_str == 'nan' or val_str == '':
        return pd.NaT

    try:
        # Split by underscore '_'
        parts = val_str.split('_')

        # We need at least the first two parts (Month and Year)
        if len(parts) >= 2:
            month = parts[0]      # e.g., "05"
            year_short = parts[1] # e.g., "21"

            # Construct the string "01-MM-20YY"
            date_str = f"01-{month}-20{year_short}"

            # Convert to datetime
            return pd.to_datetime(date_str, format='%d-%m-%Y')
        else:
            return pd.NaT

    except Exception as e:
        # Best effort: report the unparseable value and carry on with NaT
        print(f"⚠️ Error parsing date: {val} -> {e}")
        return pd.NaT

# === Main Processing for Uncensored Data ===
# Parses 'death_date', adds the time-to-event column 'diff' (in days, using
# 30 days per month), removes 'death_age_month' and rewrites the uncensored file.
print("--- Processing Uncensored Data (Dates & Diff) ---")

# Ensure the dataframe exists
if 'df_uncensored_clean' in locals():

    # 1. Apply DeathDate Fix
    if 'death_date' in df_uncensored_clean.columns:
        df_uncensored_clean['death_date'] = df_uncensored_clean['death_date'].apply(fix_death_date_format)

    # 2. Calculate 'diff' Column
    # Formula: (DeathAgeMonths - AgeMonths) * 30
    if 'death_age_month' in df_uncensored_clean.columns and 'AgeMonths' in df_uncensored_clean.columns:

        # Ensure columns are numeric (unparseable values become NaN)
        df_uncensored_clean['death_age_month'] = pd.to_numeric(df_uncensored_clean['death_age_month'], errors='coerce')
        df_uncensored_clean['AgeMonths'] = pd.to_numeric(df_uncensored_clean['AgeMonths'], errors='coerce')

        # Perform calculation
        df_uncensored_clean['diff'] = (df_uncensored_clean['death_age_month'] - df_uncensored_clean['AgeMonths']) * 30

        print("✅ Added 'diff' column.")
        # Preview the new column
        print(df_uncensored_clean[['ID', 'AgeMonths', 'death_age_month', 'diff']].head())
    else:
        print("❌ Error: Missing 'death_age_month' or 'AgeMonths' columns.")

    # 3. Save to file
    # errors='ignore': the column is absent when the branch above reported it
    # missing, and when this cell is re-run after an earlier run dropped it.
    # An unconditional drop would raise KeyError in both situations.
    df_uncensored_clean = df_uncensored_clean.drop(columns=['death_age_month'], errors='ignore')
    df_uncensored_clean.to_csv(path_uncensored, index=False)
    print(f"✅ Saved updated file to: {path_uncensored}")

else:
    print("❌ Error: df_uncensored_clean is not defined. Please run previous steps.")

--- Processing Uncensored Data (Dates & Diff) ---
✅ Added 'diff' column.
           ID  AgeMonths  death_age_month  diff
4   34-0_7-20          4               13   270
8   36-2_7-20          4               17   390
10  20-1_5-20          4               12   240
15  47-1_7-20          4               13   270
17  41-1_7-20          4               16   360
✅ Saved updated file to: /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate/locate_uncensored_level_6.csv


In [11]:
import pandas as pd
from pandas.tseries.offsets import DateOffset

print("--- Processing Censored Data (Columns & Dates) ---")

# Nothing to do unless the cleaned censored frame exists in this namespace
if 'df_censored_clean' not in locals():
    print("❌ Error: df_censored_clean is not defined.")
else:
    # Step 1: death-related columns are not used for the censored group
    cols_to_drop = ['death_date', 'death_age_month']
    df_censored_clean = df_censored_clean.drop(columns=cols_to_drop, errors='ignore')

    if 'AgeMonths' not in df_censored_clean.columns:
        print("❌ Error: 'AgeMonths' column missing.")
    else:
        # Age must be numeric for the arithmetic below
        df_censored_clean['AgeMonths'] = pd.to_numeric(df_censored_clean['AgeMonths'], errors='coerce')

        # Step 2: 'diff' = (18 - AgeMonths) * 30
        df_censored_clean['diff'] = (18 - df_censored_clean['AgeMonths']) * 30

        if 'SamplingDate' not in df_censored_clean.columns:
            print("❌ Error: 'SamplingDate' column missing.")
        else:
            # Step 3: 'DateEnd' = SamplingDate shifted by (18 - AgeMonths) calendar months
            df_censored_clean['SamplingDate'] = pd.to_datetime(df_censored_clean['SamplingDate'])

            def add_months_to_reach_18(row):
                """Return the row's SamplingDate plus (18 - AgeMonths) months, or NaT on any failure."""
                try:
                    remaining_months = int(18 - row['AgeMonths'])
                    end_date = row['SamplingDate'] + DateOffset(months=remaining_months)
                except Exception as e:
                    return pd.NaT
                return end_date

            # Row-wise computation, then coerce the resulting column to datetime
            df_censored_clean['DateEnd'] = df_censored_clean.apply(add_months_to_reach_18, axis=1)
            df_censored_clean['DateEnd'] = pd.to_datetime(df_censored_clean['DateEnd'])

            print("✅ Added 'diff' and 'DateEnd' columns.")
            print(df_censored_clean[['ID', 'AgeMonths', 'SamplingDate', 'DateEnd', 'diff']].head())

    # Step 4: write the file even if one of the steps above was skipped
    df_censored_clean.to_csv(path_censored, index=False)
    print(f"✅ Saved updated censored file to: {path_censored}")

--- Processing Censored Data (Columns & Dates) ---
✅ Added 'diff' and 'DateEnd' columns.
          ID  AgeMonths SamplingDate    DateEnd  diff
0  29-1_5-20          4   2020-05-01 2021-07-01   420
1  41-0_7-20          4   2020-07-01 2021-09-01   420
2  38-2_7-20          4   2020-07-01 2021-09-01   420
3  32-1_7-20          4   2020-07-01 2021-09-01   420
5  42-1_5-20          2   2020-05-01 2021-09-01   480
✅ Saved updated censored file to: /home/pintokf/Projects/Microbium/Mouses/Preprocess_ratio/Whole_data/preprocces_ratio_locate/locate_censored_level_6.csv


In [12]:
# Give both cohorts the same date column names: 'SamplingDate' becomes 'Date',
# and the uncensored 'death_date' becomes 'DateEnd'.
df_uncensored_clean.rename(columns={'SamplingDate': 'Date', 'death_date': 'DateEnd'}, inplace=True)
df_censored_clean.rename(columns={'SamplingDate': 'Date'}, inplace=True)

# Write the renamed frames back over the existing files
df_uncensored_clean.to_csv(path_uncensored, index=False)
df_censored_clean.to_csv(path_censored, index=False)