In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# ====================================================================
# DATA PREPROCESSING: Weight% → Atomic% → Formula (CBFV Format)
# ====================================================================
# Atomic weight of each supported element in g/mol, keyed by element symbol
ATOMIC_WEIGHTS = dict(
    Fe=55.845,
    Si=28.0855,
    Al=26.9815385,
)

# Sequence in which element symbols are written into generated formulas
ELEMENT_ORDER = ['Fe', 'Si', 'Al']


# ====================================================================
# STEP 1: Weight% → Atomic%
# ====================================================================

def weight_to_atomic_percent(df, elements=ELEMENT_ORDER):
    """Return a copy of ``df`` with the element columns converted from wt% to at%.

    Every column named in ``elements`` is divided by its entry in
    ATOMIC_WEIGHTS to obtain relative mole amounts; each amount is then
    expressed as a percentage of the row's total moles. The input frame is
    not modified.
    """
    converted = df.copy()

    # Relative moles per element: wt% / atomic weight
    moles = {}
    for symbol in elements:
        moles[symbol] = converted[symbol] / ATOMIC_WEIGHTS[symbol]

    # Element-wise sum of the per-element series gives the total per row
    total_moles = sum(moles.values())

    # Share of each element in the row total, as a percentage
    for symbol, amount in moles.items():
        converted[symbol] = (amount / total_moles) * 100

    return converted


def normalize_to_100(df, elements=ELEMENT_ORDER, primary='Fe'):
    """Round atomic percentages to 0.1 and force each row to total exactly 100.0.

    The rounding is done on integer tenths of a percent so the row sum can
    be compared exactly; whatever is missing from (or exceeds) 1000 tenths
    is added to the ``primary`` column. Returns a modified copy of ``df``.
    """
    result = df.copy()

    # Work in whole tenths of a percent to avoid float drift in the sum
    scaled = result[elements].mul(10).round().astype(int)

    # Put the rounding remainder on the primary element (1000 tenths = 100.0%)
    remainder = 1000 - scaled.sum(axis=1)
    scaled[primary] = scaled[primary] + remainder

    # Back from tenths to percentages
    result[elements] = scaled.div(10.0)

    return result


# ====================================================================
# STEP 2: Atomic% → Formula (CBFV Format with Fractions)
# ====================================================================

def create_formula_fractions(row, elements=ELEMENT_ORDER):
    """Build a formula string of atomic fractions, e.g. Fe0.800Si0.100Al0.100.

    ``row`` holds atomic percentages keyed by element symbol. Each value is
    divided by 100 and written with three decimals; elements whose value is
    not greater than zero are left out of the formula.
    """
    amounts = ((symbol, row[symbol]) for symbol in elements)
    return ''.join(
        f"{symbol}{percent/100:.3f}"
        for symbol, percent in amounts
        if percent > 0
    )


# ====================================================================
# MAIN PIPELINE
# ====================================================================

def preprocess_for_cbfv(input_csv, download=True,
                        output_file="Dataset_formula_fractions.csv"):
    """
    Complete preprocessing pipeline: wt% → at% → formula (CBFV format)

    Reads a CSV of weight-percent compositions, converts the element
    columns to atomic percent (rounded to 0.1 and adjusted on Fe so each
    row totals 100.0), builds one fractional formula string per row and
    writes the formula together with every non-element column to
    ``output_file``. Progress and sample rows are printed along the way.

    Args:
        input_csv: CSV path (anything ``pd.read_csv`` accepts) with the
            columns listed in ELEMENT_ORDER plus any property columns.
        download: If True, try to hand the written file to
            ``google.colab.files.download``. This step is best-effort:
            failures are reported with a printed message, never raised.
        output_file: Path of the CSV to write. Defaults to
            "Dataset_formula_fractions.csv" in the working directory.

    Returns:
        DataFrame with the 'formula' column followed by the property
        columns, exactly as written to ``output_file``.

    Raises:
        ValueError: If an element column is missing from the input, or if
            any row cannot be converted to atomic percent (for example a
            missing value or a row whose element columns are all zero).
    """

    # Load data
    df = pd.read_csv(input_csv)
    print(f"Loaded {len(df)} samples from {input_csv}")

    # Validate columns
    missing = [col for col in ELEMENT_ORDER if col not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Identify property columns (everything except elements)
    property_cols = [col for col in df.columns if col not in ELEMENT_ORDER]

    # Step 1: Convert wt% to at%
    df_atomic = weight_to_atomic_percent(df[ELEMENT_ORDER], ELEMENT_ORDER)

    # Missing values or all-zero rows turn into NaN/inf here and would
    # otherwise fail later in the integer rounding step with an opaque
    # casting error, so report them now with the offending rows.
    finite_rows = np.isfinite(
        df_atomic[ELEMENT_ORDER].to_numpy(dtype=float, na_value=np.nan)
    ).all(axis=1)
    if not finite_rows.all():
        bad_rows = df_atomic.index[~finite_rows].tolist()
        raise ValueError(
            f"Cannot convert {len(bad_rows)} row(s) to atomic %: element "
            f"columns {ELEMENT_ORDER} must be non-missing and not all zero "
            f"(first offending row indices: {bad_rows[:5]})"
        )

    df_atomic = normalize_to_100(df_atomic, ELEMENT_ORDER, primary='Fe')

    print("\nSample conversion (first 3 rows):")
    print("Weight% → Atomic%")
    comparison = pd.DataFrame({
        'Fe_wt%': df['Fe'].head(3).values,
        'Fe_at%': df_atomic['Fe'].head(3).values,
        'Si_wt%': df['Si'].head(3).values,
        'Si_at%': df_atomic['Si'].head(3).values,
        'Al_wt%': df['Al'].head(3).values,
        'Al_at%': df_atomic['Al'].head(3).values,
    })
    print(comparison.to_string(index=False))

    # Step 2: Generate formula with fractions (CBFV format)
    df_atomic['formula'] = df_atomic.apply(create_formula_fractions, axis=1)

    # Step 3: Add properties back (positional copy; row order is unchanged)
    for prop in property_cols:
        df_atomic[prop] = df[prop].values

    # Save: formula + properties (no atomic percentages)
    output_cols = ['formula'] + property_cols
    df_output = df_atomic[output_cols]
    df_output.to_csv(output_file, index=False)

    print(f"\n✓ Saved: {output_file}")
    print(f"✓ Columns: {output_cols}")

    # Download file in Colab (best-effort)
    if download:
        try:
            # Imported here so the function does not depend on a module-level
            # `files` name that only exists when run as a Colab script.
            from google.colab import files
        except ImportError:
            print("ℹ Download skipped: google.colab is not available.")
        else:
            try:
                files.download(output_file)
            except Exception as exc:
                # The CSV is already on disk; a failed download is not fatal.
                print(f"⚠ Download failed: {exc}")

    # Display sample
    print("\nSample output (first 5 rows):")
    print(df_output.head())

    return df_output


# ====================================================================
# EXECUTION
# ====================================================================

if __name__ == "__main__":
    # Only the import decides between Colab and local execution. Keeping it
    # alone in the try block means a genuine processing error is raised to
    # the user instead of being mistaken for "not running in Colab".
    try:
        from google.colab import files
    except ImportError:
        files = None

    if files is not None:
        # For Google Colab: upload file
        print("Upload CSV file with weight percentages (columns: Fe, Si, Al, ...properties...):")
        uploaded = files.upload()

        if uploaded:
            # Process the first uploaded file
            filename = next(iter(uploaded))
            df_output = preprocess_for_cbfv(filename, download=True)
        else:
            print("No file uploaded.")

    else:
        # For local execution: use predefined path
        filename = '../data/training/Composition_dataset_wt%.csv'

        if os.path.exists(filename):
            print(f"Processing local file: {filename}")
            df_output = preprocess_for_cbfv(filename, download=False)
        else:
            print(f"⚠ File not found: {filename}")
            # Show the resolved location that was actually checked
            print(f"Please ensure the file exists at: {os.path.abspath(filename)}")