df_physicochemical comprises the selected features phychem_features and the whole set of parameters if necesary for plotting purposes.
The df_micro is the df of protein-genus features selected from the notebook 7_visual_proteins_ipnyb
micro_usuals is a dictionary with the list of proven bacteria influencing corrosion and could serve as label for plotting purposes
micro_markers is the dictionary with the list of bacteria belonging to the df_micro dataframe.

In [8]:
#import os
import sys
from pathlib import Path
# Data processing and analysis
import pandas as pd
import numpy as np
import openpyxl
import seaborn as sns
import networkx as nx
import community as community_louvain
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import matplotlib
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [9]:
# For Local
base_dir = Path("/home/beatriz/MIC/3_combined/data")
base_dir.mkdir(parents=True, exist_ok=True)

markers_path = base_dir /"combined_markers.xlsx"


In [10]:
# Deciding which environment
if Path("/kaggle").exists():
    print("Running in Kaggle environment")
    # For Kaggle work# Input datasets (read-only in Kaggle) # Files in small input directory
    base_dir = Path("/kaggle/input")  
    abundance_excel = base_dir / "new-picrust/merged_to_sequence.xlsx" # inside input small sizes input
    #Input market groups
    market_dir = base_dir / "markers"
    # Output dirs
    output_base = Path("/kaggle/working/")
    # Save the present working data
    combined_path = output_base /"combined_markers.xlsx"
    #Directory to keep  Results
    share_dir = output_base/"Visualisations"
    shared_dir.mkdir(parents=True, exist_ok=True)

else:
    print("Running in local (VSCode) environment")
    base_dir = Path("data")
    base_dir.mkdir(parents=True, exist_ok=True)
    # Base Paths for local environment
    abundance_excel = base_dir / "merged_to_sequence.xlsx"
    # This files are too large for github and are store on Kaggle for educational purposes
    output_large = Path("/home/beatriz/MIC/output_large")
    #Input market groups
    market_dir = output_large / "markers.parquet/"  # Directory
    output_base = base_dir 
    #Directory to keep some Results
    shared_dir= Path("/home/beatriz/SharedFolder/Visualisations/")
    combined_path = base_dir / "combined_markers.xlsx"


Running in local (VSCode) environment


### Importing the files

In [27]:
micro_df= pd.read_excel(markers_path, sheet_name='protein_markers',  engine ='openpyxl')
all_physichem = pd.read_excel(markers_path, sheet_name='all_physicochemical', engine ='openpyxl')
genus_site_mapping = pd.read_excel(markers_path, sheet_name='genus_to_sites', engine ='openpyxl')

In [28]:
physichem_features = ['Sites', 'Label', 'Temperature', 'Type', 'EC_M', 'O2_Eh',
                     'Ox_Fe_Zn', 'Cl_SO4_NO3', 'Na_K','pH_HPO4',
                       'Ca_HCO3_Mg', 'Cu_Al_Mn', 'Ni_Cr_Mo']
physichem_df = all_physichem[physichem_features]

In [29]:
micro_df = micro_markers.sort_values(by='score_combined', ascending=False).head(20)

In [33]:
# First, add the Sites column based on genera
def map_genus_to_sites(genus):
    # Return sites for a single genus (not a list of genera)
    return genus_site_mapping.get(genus, [])

# Apply the function to each genus individually
# First, explode the genera column to have one row per protein-genus pair
exploded_by_genus = micro_df.explode('genera')

# Now add the sites for each genus
exploded_by_genus['Sites'] = exploded_by_genus['genera'].apply(map_genus_to_sites)

# Explode again to have one row per protein-genus-site combination
fully_exploded_df = exploded_by_genus.explode('Sites')

# Remove rows where Sites is None or empty
#fully_exploded_df = fully_exploded_df[fully_exploded_df['Sites'].notna() & (fully_exploded_df['Sites'] != '')]

# Now join with physichem_df
#result_df = fully_exploded_df.merge(physichem_df, on='Sites', how='left')

In [34]:
fully_exploded_df.head()

Unnamed: 0.1,Unnamed: 0,protein_name,genera_count,genera,functional_categories,niche_pathways,enzyme_class,corrosion_mechanisms,score_combined,Sites
20,16,ferredoxin---nad+ reductase; ferredoxin-nicoti...,4,"['Acidisoma', 'Pseudorhodoferax', 'Methylocyst...","['h2_consumption', 'iron/sulfur_redox', 'acid_...","['xylene degradation', 'dioxin degradation', '...",Acting on iron-sulfur proteins as donors.,"['h2_consumption', 'direct_eet', 'sulfur_metab...",53.040891,
24,34,enoyl-[ reductase (nadh); enoyl-[ reductase; e...,3,"['Aestuariimicrobium', 'Thermincola', 'Trepone...","['h2_consumption', 'iron/sulfur_redox', 'acid_...","['lipid biosynthesis proteins', 'fatty acid bi...",Acting on the CH-CH group of donors.,"['h2_consumption', 'direct_eet', 'sulfur_metab...",51.298188,
1,1,enoyl-[ reductase [ (ec 1.3.1.9),11,"['Gallionella', 'Acidisoma', 'Mycoplana', 'Ano...","['h2_consumption', 'iron/sulfur_redox', 'acid_...","['lipid biosynthesis proteins', 'fatty acid bi...",Acting on the CH-CH group of donors.,"['h2_consumption', 'direct_eet', 'sulfur_metab...",50.718367,
35,26,siroheme-synthase [,3,"['Gallionella', 'Pseudomonas', 'Thiobacillus']","['h2_consumption', 'iron/sulfur_redox', 'acid_...","['nitrogen , nitrogen cycle']",Acting on other nitrogenous compounds as donors.,"['h2_consumption', 'direct_eet', 'nitrogen_met...",50.402663,
2,2,"enoyl-[ reductase (nadph, si-specific); acyl-a...",10,"['Gallionella', 'Acidisoma', 'Aestuariimicrobi...","['h2_consumption', 'iron/sulfur_redox', 'acid_...","['biotin', 'fatty acid biosynthesis', 'lipid b...",Acting on the CH-CH group of donors.,"['h2_consumption', 'direct_eet', 'sulfur_metab...",50.379373,


In [31]:
df_combined_features = pd.merge(micro_markers, physichem_markers, on = "Sites")
corr = df_combined_features.corr()[top_features].loc["O2_Eh", "pH_PO4", "Fe_Zn_Ox"]

KeyError: 'Sites'

In [None]:
I have a problem to solve I have a data with only 66 points and the identifier is named phyichem_df["Sites"] the parallel study identified protein_names which are related to "Genus" micro_df, and are like ["protein_name]: [list of genus] or it could be just the pair as well for each protein name a genus, which means same protein name for several genus, that is micro_df. I can actually get the "Sites" as well from this micro_df, but the thing is that each site can have same protein-genus pair... so this could be an approach to join the two df and I am no sure wether if just make my small one get inserted on my big one which will produce large amount of samples or to get the small one physichem_df and insert the protein-genus pairs do you understand the problem? at the moment i got no "Sites" on micro_df it is a matter of inputing them in the previous function that produce them or I could use a dictionary that has Sites:Genus
                                                                                                                                                                                                              
This is my micro_df  columns to serve as join the actual df is bigger                                                                                                                                                                                                                                                      protein_name  \
0                 3-oxoacyl-[ reductase (ec 1.1.1.100)   
1                     enoyl-[ reductase [ (ec 1.3.1.9)   
2    enoyl-[ reductase (nadph, si-specific); acyl-a...   
3    beta-ketoacyl-[ synthase iii (beta-ketoacyl-ac...   
4    glutathione hydrolase proenzyme (ec 2.3.2.2) 3...   
..                                                 ...   
96   medium-chain acyl-coa ligase; fadk (gene name)...   
97    udp-n-acetylglucosamine 4-epimerase (ec 5.1.3.7)   
98      maltose 6'-phosphate phosphatase (ec 3.1.3.90)   
99   membrane dipeptidase (ec 3.4.13.19) (peptidase...   
100  membrane dipeptidase (ec 3.4.13.19) (peptidase...   

                                                genera  
0    ['Gallionella', 'Acidisoma', 'Mycoplana', 'The...  
1    ['Gallionella', 'Acidisoma', 'Mycoplana', 'Ano...  
2    ['Gallionella', 'Acidisoma', 'Aestuariimicrobi...  
3    ['Acidisoma', 'Thermincola', 'Mycoplana', 'Ano...  
4    ['Flavisolibacter', 'Mycoplana', 'Pseudomonas'... 
      
and this is my phychem_df with "Sites" and I will have to input the "Genus" singular but I think
that is no nesary since I have to get them by a dict since the micro_df has the "Genus" as a list

 Sites  Label  Temperature  Type    EC_M     O2_Eh   Ox_Fe_Zn  Cl_SO4_NO3  \
0  site_1      2        23.00     0  690.40 -1.514130 -19.014017  -12.462139   
1  site_2      2        22.81     0  477.35 -3.218897 -27.631021  -11.728597   
2  site_3      3        18.80     2  651.00 -2.995939 -19.578501  -12.594547   
3  site_4      1        13.70     0  270.98 -3.506600 -19.536276  -17.727482   
4  site_5      2    
The other columns are shown here for ilustration purposes they are no for joining only Sites is to be used, since 
algorithms sometimes do no work i am worried that you will make it more complicated that actually is.


## Data Splitting Strategy

To ensure robust feature engineering and prevent data leakage, a portion of the data will be reserved for final model validation. The split must preserve the distribution of key factors affecting corrosion mechanisms:

1. **Corrosion Severity Label (Primary)** - Ensures balanced representation of corrosion levels and Essential for model evaluation across all severity classes

2. **Material Composition**: Different materials exhibit distinct corrosion mechanisms

3. **System Temperature Regime** Hot/Cold/Combined systems affect: Reaction kinetics, oxygen solubility, protective film formation, mineral precipitation tendencies

4. **Geographical Location**: Influences water chemistry through<. Different treatment regulations (chlorine vs. non-chlorine), regional geological variations in mineral content, country-specific water quality standards

5. **System Age (Secondary)**: Collection period: 2014-2018, while potentially relevant for corrosion progression it is considered less critical due to varying maintenance histories and treatment variations make precise temporal effects difficult to isolate

This stratified splitting approach ensures the test set remains representative while maintaining the independence necessary for valid model evaluation.

In [None]:
def get_material_group(df, material_column='Material'):
    """
    Groups materials based on cluster analysis findings and material properties.
    
    Key groupings:
    - Steel_group: Combines Stainless_Steel and Steel
    - GSP_group: Combines Galvanized_Steel and Galvanized_Steel_Plastic
    - Keeps Galvanized_Carbon_Steel separate due to distinct cluster behavior
    - Other materials remain separate for individual analysis
    
    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame containing material information
    material_column : str, default='Material'
        Name of the column containing material names
        
    Returns:
    --------
    pandas.DataFrame
        DataFrame with added 'material_group' column
    """

    # Create a copy of the material column to avoid modifying the original
    materials = df[material_column].copy()
    
    grouped_materials = []
    
    for material in materials:
        # Standardize material name
        material_strip = str(material).strip()
        
        # Group 1: Combine Stainless Steel and Steel
        if material_strip in ['Stainless_Steel', 'Steel']:
            grouped_materials.append('Steel_group')
        
        # Group 2: Combine Galvanized Steel and GSP
        elif material_strip in ['Galvanized_Steel', 'Galvanized_Steel_Plastic']:
            grouped_materials.append('GS_group')
        
        # Keep Galvanized Carbon Steel separate
        elif material_strip in ['Galvanized_Carbon_Steel', 'Galvanized_carbon_steel_plastic']:
            grouped_materials.append('GCS_group')
    
        # Keep Cooper as is
        elif material_strip == 'Cooper':
            grouped_materials.append('Cooper_group')
        
        # Keep Cross-Linked Polyethylene and THSP as others
        elif material_strip in ['Cross-Linked_Polyethylene','Tinplate_High-Strength_Plastic']:
            grouped_materials.append('CLP_THSP_group')
           
    # Add the grouped materials as a new column
    df['material_group'] = grouped_materials
    return df

#the indexes will be kept same as the whole df
def create_stratification_groups_v2(row):
    """
    Create comprehensive stratification groups
    """
    # Water treatment regime based on country
    water_regime = 'chlorine' if row['Country'] in ['Belgium', 'Netherlands'] else 'no_chlorine' 
    
    # Create stratification group string
    strat_group = f"{row['material_group']}_{water_regime}_label{row['Label']}" # _{row['Type']}_ I removed the Type condition because it divides the samples in too many classes. 
    
    return strat_group  

def split_dataset_v2(df, test_size=0.2, random_state=42):
    """
    Split dataset while maintaining distributions of key features and original indeces
    """
    #Creating a split column with default value, so the spliting can be applied to other dataframe
    df['split'] = 'train' # as default value
    
    # Add material group column
    df = get_material_group(df)
    #reindexing 
    df.index = original_indices
    # Create stratification groups
    df['strat_group'] = df.apply(create_stratification_groups_v2, axis=1)
    
    # Identify groups with sufficient samples
    group_counts = df['strat_group'].value_counts()
    large_groups = group_counts[group_counts >= 4].index
    small_groups = group_counts[group_counts < 4].index
    
    # Split data based on group size
    large_data = df[df['strat_group'].isin(large_groups)]
    small_data = df[df['strat_group'].isin(small_groups)]
    
    if len(large_data) > 0:
        # Stratified split for large groups
        train_large_idx, test_large_idx = train_test_split(
            large_data.index,
            test_size=test_size,
            random_state=random_state,
            stratify=large_data['strat_group']
        )
        
        if len(small_data) > 0:
            train_small_idx, test_small_idx = train_test_split(
                small_data.index,
                test_size=test_size,
                random_state=random_state,
                stratify=small_data['strat_group']
            )
            # Combine indices
            train_idx = np.concatenate([train_large_idx, train_small_idx])
            test_idx = np.concatenate([test_large_idx, test_small_idx])
            
        else:  # This else belongs to the small_data if
            train_idx = train_large_idx
            test_idx = test_large_idx
    else:  # This else belongs to the large_data if
        # Simple split if no large groups
        train_idx, test_idx = train_test_split(
            df.index,
            test_size=test_size,
            random_state=random_state
        )
    
    # Mark split in original dataframe
    df.loc[test_idx, 'split'] = 'test'
    
    # Return train and test dataframes with original indices
    return df[df['split'] == 'train'], df[df['split'] == 'test']
    
def analyze_split_results(train_df, test_df, original_df):
    """
    Analyze the distribution of features in the split datasets
    """
    print("=== Split Size Analysis ===")
    print(f"Total samples: {len(original_df)}")
    print(f"Training samples: {len(train_df)} ({len(train_df)/len(original_df)*100:.1f}%)")
    print(f"Test samples: {len(test_df)} ({len(test_df)/len(original_df)*100:.1f}%)")
    
    # Analyze distributions
    features = ['Material', 'Country', 'Label']
    
    for feature in features:
        print(f"\n=== {feature} Distribution ===")
        
        # Calculate distributions
        train_dist = train_df[feature].value_counts(normalize=True)
        test_dist = test_df[feature].value_counts(normalize=True)
        original_dist = original_df[feature].value_counts(normalize=True)
        
        # Combine into a DataFrame
        dist_df = pd.DataFrame({
            'Original %': original_dist * 100,
            'Train %': train_dist * 100,
            'Test %': test_dist * 100,
            'Original Count': original_df[feature].value_counts(),
            'Train Count': train_df[feature].value_counts(),
            'Test Count': test_df[feature].value_counts()
        }).round(2)
        
        print(dist_df)

In [None]:
grouped_materials_df = get_material_group(df_Meta_Split, material_column='Material')
# Apply the split
train_df, test_df = split_dataset_v2(grouped_materials_df)
# Analyzing the results
analyze_split_results(train_df, test_df, grouped_materials_df)

NameError: name 'df_Meta_Split' is not defined

In [None]:
# Now applying the split to original dataframe
split_mapping = grouped_materials_df['split']
original['split'] = original.index.map(split_mapping)

# Get train and test sets for original dataframe
original_train = original[original['split'] == 'train']
original_test = original[original['split'] == 'test']

In [None]:
# Verify indices match
print("Train indices match:", set(train_df.index) == set(original_train.index))
print("Test indices match:", set(test_df.index) == set(original_test.index))
print("No overlap between train and test:", len(set(train_df.index) & set(test_df.index)) == 0)

In [None]:
# now we assigne the original_train set to the dataframe to work in this notebook moving forward and we keep the original_test df to work for the model validation
df = original_train.drop(columns=['split']).copy() # .drop(columns=['Label']).values
df.head()

The initial strategy for dataset partitioning aimed to ensure a balanced representation across Location, Material, Type, and Label variables. Despite implementing sophisticated preprocessing steps - including hierarchical cluster analysis for material grouping and stratification of locations based on Cl- treatment protocols - the limited sample size (n=13) proved insufficient relative to the number of distinct classes, preventing a statistically valid split.

To address this limitation, Type was removed from the stratification criteria. This decision was supported by two key analytical findings:

Principal Component Analysis (PCA) in three dimensions demonstrated strong clustering patterns based on materials and locations alone, suggesting these features effectively capture the underlying data structure.

Feature importance analysis using XGBoost revealed that Composition accounts for approximately 50% of the variance in material type distribution across clusters, indicating that material properties are inherently captured through compositional data.

This modification to the stratification approach maintains the essential patterns in the data while enabling a more robust train-test split for subsequent analyses.

In [None]:
X_sel = df[top_features]
y = df['Category']

X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')  # or softmax for >2 classes
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=8)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:


# Data preparation
X_sel = df[top_features]
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=42)

# Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.FloatTensor(y_train.values)
y_test_tensor = torch.FloatTensor(y_test.values)

# Define the neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.layer1 = nn.Linear(input_size, 32)
        self.layer2 = nn.Linear(32, 16)
        self.layer3 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.relu(self.layer2(x))
        x = self.sigmoid(self.layer3(x))
        return x

# Initialize the model
input_size = X_train.shape[1]
model = SimpleNN(input_size)
print(f"Model created with input size: {input_size}")

# Define loss and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with smaller batches
num_epochs = 10
batch_size = 8  # Small batch size for memory efficiency
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0
    
    # Use smaller batches to reduce memory usage
    for i in range(0, len(X_train), batch_size):
        # Get batch
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size].view(-1, 1)
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        total_loss += loss.item()
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    avg_loss = total_loss / (len(X_train) / batch_size)
    print(f'Epoch [{epoch+1}/{num_epochs}], Avg Loss: {avg_loss:.4f}')
    
    # Evaluate every epoch to track progress
    if (epoch + 1) % 2 == 0:  # Check every 2 epochs
        model.eval()  # Set model to evaluation mode
        with torch.no_grad():
            y_pred = model(X_test_tensor)
            y_pred_class = (y_pred > 0.5).float()
            accuracy = (y_pred_class.view(-1) == y_test_tensor).sum().item() / len(y_test_tensor)
            print(f'Validation Accuracy after epoch {epoch+1}: {accuracy:.4f}')

# Final evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred_class = (y_pred > 0.5).float()
    accuracy = (y_pred_class.view(-1) == y_test_tensor).sum().item() / len(y_test_tensor)
    print(f'Final Accuracy: {accuracy:.4f}')
    
    # Print predictions for first few samples
    for i in range(min(5, len(y_test))):
        print(f"Sample {i+1}: Actual: {y_test.iloc[i]}, Predicted: {y_pred[i].item():.4f}, Class: {y_pred_class[i].item()}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.5 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.12/runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.12/runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "/home/beatriz/MIC/3_combined/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/beatriz/MIC/3_combined/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/home/beatriz

NameError: name 'df' is not defined