In [None]:
import numpy as np
import pandas as pd

In [None]:
# Install packages for Google Colab
try:
    import google.colab
    !pip install -q CBFV
except:
    pass  # Packages already installed locally

In [None]:
# Load the raw dataset: uploaded interactively on Colab, read from the
# predefined relative path everywhere else.
LOCAL_DATA_PATH = '../data/raw/Dataset_formula_fractions.csv'

try:
    from google.colab import files
except ImportError:
    # For local execution: use predefined path
    filename = LOCAL_DATA_PATH
else:
    # For Colab: upload file. The first key of the returned mapping is used as
    # the path of the uploaded file.
    uploaded = files.upload()
    # If nothing was uploaded, fall back to the predefined path instead of
    # failing on an empty mapping.
    filename = next(iter(uploaded), LOCAL_DATA_PATH)

# Only a missing Colab module is handled above; read/parse errors are left to
# propagate so a broken upload is never silently replaced by another file.
df = pd.read_csv(filename)

print(f"Loaded {len(df)} samples with {len(df.columns)} columns")
display(df.head())

In [None]:
# Keep only the rows that have a Resistivity value
has_resistivity = df['Resistivity'].notna()
df_drop = df[has_resistivity]

# Renumber the surviving rows from 0 so the index has no gaps
df_index = df_drop.reset_index(drop=True)

# Keep just the formula and the measured property, exposing the property
# under the column name "target" for CBFV
df_final = (
    df_index[['formula', 'Resistivity']]
    .rename(columns={"Resistivity": "target"})
)

In [None]:
%%capture
from CBFV import composition

# Generate CBFV features from composition formulas
features, target, formulae, skipped = composition.generate_features(
    df_final,
    elem_prop='oliynyk',           # Use Oliynyk element property set
    drop_duplicates=True,          # Remove duplicate compositions
    extend_features=False,         # Don't generate extended feature set
    sum_feat=True                  # Include sum statistics (avg, dev, sum)
)

In [None]:
# Drop near-constant feature columns: a column is kept only if it takes
# at least 8 distinct values across the dataset
distinct_counts = features.nunique()
keep_column = distinct_counts.ge(8)
features = features.loc[:, keep_column]

# Place the target next to the retained features in one table
resulting_df = pd.concat((features, target), axis='columns')

# Write the unscaled CBFV feature table to disk, without the row index
resulting_df.to_csv('oliynyk_rho_raw.csv', index=False)

# Download file in Colab environment.
# The helper is imported here so this step does not depend on a name left
# behind by an earlier cell, and only a missing Colab module is treated as
# "running locally"; real download errors are no longer hidden.
try:
    from google.colab import files
except ImportError:
    pass  # Skip download for local execution
else:
    files.download('oliynyk_rho_raw.csv')