Secondary commodities based on byproduct ratios

1. merge byproduct ratios to mines to see how many have matching by product ratios --> refine code so that it ensured the right host is being used --> very few so only using averages
2. calculate the averages based on the byproducts ratio data - similar to the visual in paper
3. use averages to calculate byproduct cumprod based on primary cumprod

Byproduct ratios means based on ResC 

In [None]:
import pandas as pd
import numpy as np

def create_byproduct_host_matrix(file_path, sheet_name='results_accessible_ResC'):
    """
    Create a matrix showing mean ratios between byproducts (columns) and host commodities (rows)
    from the ResC data, similar to the heatmap in the image.
    """
    print("Reading ResC data...")
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    
    # Get all ratio columns (columns containing ' / ')
    ratio_columns = [col for col in df.columns if isinstance(col, str) and ' / ' in col]
    print(f"Found {len(ratio_columns)} ratio columns")
    
    # Parse ratio columns to extract byproduct and host metals
    parsed_ratios = []
    for col in ratio_columns:
        parts = col.split(' / ')
        if len(parts) == 2:
            byproduct = parts[0].strip()
            host = parts[1].strip()
            parsed_ratios.append({
                'column': col,
                'byproduct': byproduct,
                'host': host
            })
    
    print(f"Successfully parsed {len(parsed_ratios)} ratio combinations")
    
    # Create a list to store all ratios with their byproduct-host pairs
    ratio_data = []
    
    for ratio_info in parsed_ratios:
        col = ratio_info['column']
        byproduct = ratio_info['byproduct']
        host = ratio_info['host']
        
        # Get all non-null values for this ratio
        values = df[col].dropna()
        
        # Add each value with its byproduct-host pair
        for value in values:
            if pd.notna(value) and value > 0:  # Only positive ratios
                ratio_data.append({
                    'byproduct': byproduct,
                    'host': host,
                    'ratio': value
                })
    
    # Convert to DataFrame
    ratios_df = pd.DataFrame(ratio_data)
    print(f"Collected {len(ratios_df)} individual ratio measurements")
    
    # Calculate mean ratios for each byproduct-host combination
    mean_ratios = ratios_df.groupby(['host', 'byproduct'])['ratio'].mean().reset_index()
    
    # Create pivot table with hosts as rows and byproducts as columns
    matrix = mean_ratios.pivot(index='host', columns='byproduct', values='ratio')
    
    print(f"Created matrix with {len(matrix.index)} hosts and {len(matrix.columns)} byproducts")
    
    # Sort rows and columns for better presentation
    matrix = matrix.sort_index(axis=0)  # Sort hosts (rows)
    matrix = matrix.sort_index(axis=1)  # Sort byproducts (columns)
    
    # Fill NaN values with empty string for cleaner Excel output
    matrix_clean = matrix.fillna('')
    
    # Save to Excel
    output_path = "Data Input/Byproduct/Byproduct_Host_Matrix.xlsx"
    
    # Create Excel writer with formatting
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        # Write the main matrix
        matrix_clean.to_excel(writer, sheet_name='Byproduct_Host_Matrix')
        
        # Also save the numerical matrix (with NaN as NaN for potential coloring)
        matrix.to_excel(writer, sheet_name='Matrix_with_Numbers')
        
        # Save summary statistics
        summary_stats = pd.DataFrame({
            'Statistic': ['Total Hosts', 'Total Byproducts', 'Total Combinations', 'Non-empty Combinations'],
            'Value': [
                len(matrix.index),
                len(matrix.columns), 
                len(matrix.index) * len(matrix.columns),
                matrix.notna().sum().sum()
            ]
        })
        summary_stats.to_excel(writer, sheet_name='Summary', index=False)
        
        # Save list of all hosts
        hosts_df = pd.DataFrame({'Host_Commodities': sorted(matrix.index.tolist())})
        hosts_df.to_excel(writer, sheet_name='Host_List', index=False)
        
        # Save list of all byproducts  
        byproducts_df = pd.DataFrame({'Byproducts': sorted(matrix.columns.tolist())})
        byproducts_df.to_excel(writer, sheet_name='Byproduct_List', index=False)
    
    print(f"Matrix saved to: {output_path}")
    
    # Print summary
    print(f"\nMatrix Summary:")
    print(f"- Host commodities (rows): {len(matrix.index)}")
    print(f"- Byproducts (columns): {len(matrix.columns)}")
    print(f"- Total possible combinations: {len(matrix.index) * len(matrix.columns)}")
    print(f"- Combinations with data: {matrix.notna().sum().sum()}")
    print(f"- Data coverage: {matrix.notna().sum().sum() / (len(matrix.index) * len(matrix.columns)) * 100:.1f}%")
    
    # Show some examples of the highest and lowest ratios
    flat_matrix = matrix.stack()
    if len(flat_matrix) > 0:
        print(f"\nHighest ratios:")
        top_ratios = flat_matrix.nlargest(5)
        for (host, byproduct), ratio in top_ratios.items():
            print(f"  {byproduct}/{host}: {ratio:.2e}")
        
        print(f"\nLowest ratios:")
        bottom_ratios = flat_matrix.nsmallest(5)
        for (host, byproduct), ratio in bottom_ratios.items():
            print(f"  {byproduct}/{host}: {ratio:.2e}")
    
    return matrix

# File path
file_path = "Data Input/Byproduct/es4c05293_si_001(1).xlsx"

# Create the matrix
matrix = create_byproduct_host_matrix(file_path)

print("\nByproduct-Host matrix creation completed!")

Reading ResC data...
Found 383 ratio columns
Successfully parsed 383 ratio combinations
Collected 7422 individual ratio measurements
Created matrix with 36 hosts and 58 byproducts
Matrix saved to: Data Input/Byproduct/Byproduct_Host_Matrix.xlsx

Matrix Summary:
- Host commodities (rows): 36
- Byproducts (columns): 58
- Total possible combinations: 2088
- Combinations with data: 383
- Data coverage: 18.3%

Highest ratios:
  Fe/Pd: 2.74e+05
  Fe/Au: 2.22e+05
  Cu/Re: 2.16e+04
  Zn/Au: 7.56e+03
  Pb/Au: 4.05e+03

Lowest ratios:
  Au/Fe: 2.47e-07
  Au/Cr: 1.43e-06
  Ir/Cr: 1.59e-06
  Sb/Ni: 1.61e-06
  Rh/Cu: 1.78e-06

Byproduct-Host matrix creation completed!


Cumprod byproduct calculation

1. multiply ratios from matrix dataset creating new columns for all combinations 

In [21]:
import pandas as pd
import numpy as np

# Create mapping from full commodity names to metal symbols
commodity_to_symbol = {
    'Bauxite': 'Al',
    'Cobalt': 'Co', 
    'Copper': 'Cu',
    # 'Graphite': skipped entirely (usually no byproducts)
    'Iron Ore': 'Fe',
    'Lanthanides': 'La',  # Special handling for lanthanides
    'Lead': 'Pb',
    'Lithium': 'Li',
    'Manganese': 'Mn',
    'Molybdenum': 'Mo',
    'Nickel': 'Ni',
    'Niobium': 'Nb',
    'Tin': 'Sn',
    'Zinc': 'Zn',
    'Gold': 'Au',
    'Silver': 'Ag',
    'Platinum': 'Pt',
    'Palladium': 'Pd',
    'Rhenium': 'Re',
    'Cadmium': 'Cd',
    'Indium': 'In',
    'Gallium': 'Ga',
    'Germanium': 'Ge',
    'Selenium': 'Se',
    'Tellurium': 'Te',
    'Bismuth': 'Bi',
    'Antimony': 'Sb',
    'Tungsten': 'W',
    'Chromium': 'Cr',
    'Vanadium': 'V',
    'Titanium': 'Ti',
    'Aluminum': 'Al',
    'Iron': 'Fe',
    'Magnesium': 'Mg',
    'Phosphorus': 'P',
    'Potassium': 'K',
    'Sulfur': 'S',
    'Cerium': 'Ce',
    'Dysprosium': 'Dy',
    'Erbium': 'Er',
    'Europium': 'Eu',
    'Gadolinium': 'Gd',
    'Holmium': 'Ho',
    'Iridium': 'Ir',
    'Lanthanum': 'La',
    'Lutetium': 'Lu',
    'Neodymium': 'Nd',
    'Praseodymium': 'Pr',
    'Rhodium': 'Rh',
    'Ruthenium': 'Ru',
    'Samarium': 'Sm',
    'Scandium': 'Sc',
    'Silicon': 'Si',
    'Tantalum': 'Ta',
    'Terbium': 'Tb',
    'Thulium': 'Tm',
    'Uranium': 'U',
    'Yttrium': 'Y',
    'Ytterbium': 'Yb',
    'Zirconium': 'Zr',
    'Hafnium': 'Hf',
    'Boron': 'B'
}

def calculate_byproduct_production():
    # Read the ratio matrix
    print("Loading byproduct-host matrix...")
    matrix_df = pd.read_excel("Data Input/Byproduct/Byproduct_Host_Matrix.xlsx", 
                             sheet_name='Byproduct_Host_Matrix', index_col=0)
    
    # Read the cumulative production data
    print("Loading cumulative production data...")
    cumprod_df = pd.read_excel("Data Input/Byproduct/Byproducts_CumProd_final.xlsx")
    
    print(f"Matrix dimensions: {matrix_df.shape}")
    print(f"Cumulative production data: {len(cumprod_df)} rows")
    
    # Define lanthanide symbols for averaging
    lanthanides = ['La', 'Ce', 'Pr', 'Nd', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu']
    
    # Create new columns for each possible byproduct
    all_byproducts = set()
    
    # Process each mine
    graphite_mines_skipped = 0
    
    for index, row in cumprod_df.iterrows():
        prop_id = row['PROP_ID']
        primary_commodity = row['PRIMARY_COMMODITY']
        commodities_list = row.get('COMMODITIES_LIST', '')
        cum_prod = row.get('CumProd_Tonnes', 0)
        
        # Skip if no cumulative production
        if pd.isna(cum_prod) or cum_prod == 0:
            continue
        
        # Skip graphite mines entirely (keep them but don't process byproducts)
        if primary_commodity == 'Graphite':
            graphite_mines_skipped += 1
            continue
            
        # Handle lanthanides specially
        if primary_commodity == 'Lanthanides':
            # Calculate average ratios for lanthanides
            print(f"Processing lanthanides for mine {prop_id}")
            
            # Parse byproducts list
            if pd.isna(commodities_list):
                commodities_list = ''
            
            byproducts = [x.strip() for x in str(commodities_list).split(',') if x.strip()]
            
            # Calculate byproduct production for each commodity
            for byproduct_name in byproducts:
                # Get byproduct symbol
                byproduct_symbol = commodity_to_symbol.get(byproduct_name)
                if not byproduct_symbol:
                    print(f"Warning: No symbol mapping for byproduct '{byproduct_name}' in mine {prop_id}")
                    continue
                
                # Create column name
                column_name = f"Byproduct_{byproduct_name}_Tonnes"
                all_byproducts.add(column_name)
                
                # Initialize column if it doesn't exist
                if column_name not in cumprod_df.columns:
                    cumprod_df[column_name] = np.nan
                
                # Calculate average ratio across available lanthanides
                lanthanide_ratios = []
                for lanthanide in lanthanides:
                    if lanthanide in matrix_df.index and byproduct_symbol in matrix_df.columns:
                        ratio = matrix_df.loc[lanthanide, byproduct_symbol]
                        if pd.notna(ratio) and ratio > 0:
                            lanthanide_ratios.append(ratio)
                
                if lanthanide_ratios:
                    # Use average ratio
                    avg_ratio = np.mean(lanthanide_ratios)
                    byproduct_production = cum_prod * avg_ratio
                    cumprod_df.at[index, column_name] = byproduct_production
                    print(f"  {byproduct_name}: Used average of {len(lanthanide_ratios)} lanthanide ratios = {avg_ratio:.2e}")
        
        else:
            # Handle non-lanthanide commodities
            # Get primary commodity symbol
            primary_symbol = commodity_to_symbol.get(primary_commodity)
            if not primary_symbol:
                print(f"Warning: No symbol mapping for primary commodity '{primary_commodity}' in mine {prop_id}")
                continue
            
            # Parse byproducts list
            if pd.isna(commodities_list):
                commodities_list = ''
            
            byproducts = [x.strip() for x in str(commodities_list).split(',') if x.strip()]
            
            # Calculate byproduct production for each commodity
            for byproduct_name in byproducts:
                # Get byproduct symbol
                byproduct_symbol = commodity_to_symbol.get(byproduct_name)
                if not byproduct_symbol:
                    print(f"Warning: No symbol mapping for byproduct '{byproduct_name}' in mine {prop_id}")
                    continue
                
                # Create column name
                column_name = f"Byproduct_{byproduct_name}_Tonnes"
                all_byproducts.add(column_name)
                
                # Initialize column if it doesn't exist
                if column_name not in cumprod_df.columns:
                    cumprod_df[column_name] = np.nan
                
                # Get ratio from matrix
                try:
                    if primary_symbol in matrix_df.index and byproduct_symbol in matrix_df.columns:
                        ratio = matrix_df.loc[primary_symbol, byproduct_symbol]
                        
                        if pd.notna(ratio) and ratio > 0:
                            # Calculate byproduct production
                            byproduct_production = cum_prod * ratio
                            cumprod_df.at[index, column_name] = byproduct_production
                            
                except Exception as e:
                    print(f"Error processing {prop_id} - {byproduct_name}: {e}")
                    continue
    
    print(f"Created {len(all_byproducts)} byproduct columns")
    
    if graphite_mines_skipped > 0:
        print(f"Skipped {graphite_mines_skipped} graphite mines (no byproducts calculated)")
    
    # Save the updated file
    output_file = "Data Input/Byproduct/Byproducts_CumProd_final.xlsx"
    print(f"Saving results to {output_file}...")
    cumprod_df.to_excel(output_file, index=False)
    
    # Print summary statistics
    print("\nSummary:")
    total_mines = len(cumprod_df)
    mines_with_byproducts = 0
    
    for col in all_byproducts:
        if col in cumprod_df.columns:
            non_null_count = cumprod_df[col].notna().sum()
            if non_null_count > 0:
                mines_with_byproducts = max(mines_with_byproducts, non_null_count)
                total_production = cumprod_df[col].sum()
                print(f"  {col}: {non_null_count} mines, Total: {total_production:.2e} tonnes")
    
    print(f"\nTotal mines: {total_mines}")
    print(f"Mines with calculated byproducts: {mines_with_byproducts}")
    print("Process completed successfully!")

if __name__ == "__main__":
    calculate_byproduct_production()

Loading byproduct-host matrix...
Loading cumulative production data...
Matrix dimensions: (36, 58)
Cumulative production data: 598 rows
Created 34 byproduct columns
Saving results to Data Input/Byproduct/Byproducts_CumProd_final.xlsx...

Summary:
  Byproduct_Silver_Tonnes: 206 mines, Total: 7.42e+05 tonnes
  Byproduct_Nickel_Tonnes: 4 mines, Total: 6.98e+06 tonnes
  Byproduct_Tungsten_Tonnes: 1 mines, Total: 7.52e+04 tonnes
  Byproduct_Zinc_Tonnes: 34 mines, Total: 8.67e+07 tonnes
  Byproduct_Palladium_Tonnes: 15 mines, Total: 2.96e+03 tonnes
  Byproduct_Indium_Tonnes: 1 mines, Total: 5.95e+01 tonnes
  Byproduct_Gold_Tonnes: 157 mines, Total: 2.49e+04 tonnes
  Byproduct_Cobalt_Tonnes: 57 mines, Total: 3.50e+06 tonnes
  Byproduct_Selenium_Tonnes: 4 mines, Total: 2.66e+03 tonnes
  Byproduct_Tellurium_Tonnes: 2 mines, Total: 2.21e+01 tonnes
  Byproduct_Molybdenum_Tonnes: 60 mines, Total: 1.37e+07 tonnes
  Byproduct_Lead_Tonnes: 130 mines, Total: 8.63e+07 tonnes
  Byproduct_Manganese_Tonne

Stripping ratio
1) check S&P if available --> only 62 out of 750 mines have stripping ratios 
2) if not from S&P then use global stripping ratios

In [22]:
import pandas as pd
import numpy as np

# File path
target_file = "Data Input/Byproduct/Byproducts_CumProd_final.xlsx"

print("Loading existing Stripping_Ratio sheet...")
target_sheet_df = pd.read_excel(target_file, sheet_name='Stripping_Ratio')

print(f"Target sheet data: {len(target_sheet_df)} rows")

# Global stripping ratios by commodity
global_ratios = {
    'Aluminium': 0.70,
    'Bauxite': 0.70,      # Same as Aluminium  
    'Chromium': 2.08,
    'Cobalt': 2.12,
    'Copper': 1.86,
    'Gallium': 1.90,
    'Gold': 2.86,
    'Iridium': 1.97,
    'Iron': 1.96,
    'Iron Ore': 1.96,     # Same as Iron
    'Lithium': 5.53,
    'Magnesium': 0.00,
    'Manganese': 1.35, # based on Battery Hill SNL derived stripping ratio
    'Molybdenum': 1.87,
    'Nickel': 1.36,
    'Palladium': 2.20,
    'Platinum': 2.28,
    'Rhodium': 2.12,
    'Ruthenium': 1.98,
    'Silicon': 1.22,
    'Silver': 2.13,
    'Tantalum': 0.72,
    'Tin': 0.03,
    'Titanium': 0.11,
    'Tungsten': 1.34,
    'Vanadium': 2.53,
    'Zinc': 2.45,
    'Zirconium': 0.11,
    # Rare Earth Elements
    'Yttrium': 0.80,
    'Lanthanum': 0.93,
    'Cerium': 1.44,
    'Praseodymium': 0.89,
    'Neodymium': 0.86,
    'Samarium': 0.82,
    'Europium': 0.85,
    'Gadolinium': 0.81,
    'Terbium': 0.81,
    'Dysprosium': 0.80,
    'Holmium': 0.80,
    'Erbium': 0.80,
    'Thulium': 0.80,
    'Ytterbium': 0.80,
    'Lutetium': 0.80,
    # Average for general Lanthanides (calculated from individual lanthanide values)
    'Lanthanides': 0.86   # Average of all individual lanthanide ratios
}

# Get all unique commodities from the target data
all_commodities = set()

# Add primary commodities
if 'PRIMARY_COMMODITY' in target_sheet_df.columns:
    all_commodities.update(target_sheet_df['PRIMARY_COMMODITY'].dropna().unique())

# Add commodities from COMMODITIES_LIST
if 'COMMODITIES_LIST' in target_sheet_df.columns:
    for commodities_str in target_sheet_df['COMMODITIES_LIST'].dropna():
        commodities = [x.strip() for x in str(commodities_str).split(',') if x.strip()]
        all_commodities.update(commodities)

print(f"Found {len(all_commodities)} unique commodities")

# Create columns for each commodity
for commodity in sorted(all_commodities):
    column_name = f"{commodity}_SR"
    target_sheet_df[column_name] = np.nan

# Fill the commodity stripping ratio columns
global_ratio_count = 0
existing_values_kept = 0

for index, row in target_sheet_df.iterrows():
    prop_id = row['PROP_ID']
    primary_commodity = row.get('PRIMARY_COMMODITY', '')
    commodities_list = row.get('COMMODITIES_LIST', '')
    
    # Parse commodities list
    if pd.notna(commodities_list):
        commodities = [x.strip() for x in str(commodities_list).split(',') if x.strip()]
    else:
        commodities = []
    
    # Add primary commodity to the list if it exists
    if pd.notna(primary_commodity) and primary_commodity:
        commodities.append(primary_commodity)
    
    # Remove duplicates
    commodities = list(set(commodities))
    
    # Fill stripping ratios for each commodity this mine produces
    for commodity in commodities:
        column_name = f"{commodity}_SR"
        if column_name in target_sheet_df.columns:
            # Check if there's already a value in this cell
            current_value = target_sheet_df.at[index, column_name]
            if pd.notna(current_value):
                # Keep existing value, don't overwrite
                existing_values_kept += 1
            elif commodity in global_ratios:
                # Only fill if cell is empty
                target_sheet_df.at[index, column_name] = global_ratios[commodity]
                global_ratio_count += 1

# Find commodities without stripping ratios
commodities_without_ratios = []
for commodity in sorted(all_commodities):
    if commodity not in global_ratios:
        commodities_without_ratios.append(commodity)

# Save back to the Stripping_Ratio sheet
print("Saving updated Stripping_Ratio sheet...")
with pd.ExcelWriter(target_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    target_sheet_df.to_excel(writer, sheet_name='Stripping_Ratio', index=False)

# Report results
print("\nResults:")
print(f"Created {len(all_commodities)} commodity stripping ratio columns")
print(f"Existing values kept (not overwritten): {existing_values_kept}")
print(f"Global ratios applied to empty cells: {global_ratio_count}")
print(f"Total stripping ratio values: {existing_values_kept + global_ratio_count}")

if commodities_without_ratios:
    print(f"\nCommodities WITHOUT stripping ratios ({len(commodities_without_ratios)}):")
    for commodity in commodities_without_ratios:
        print(f"  - {commodity}")
else:
    print("\nAll commodities have stripping ratios available!")

print("\nMerge completed successfully!")

Loading existing Stripping_Ratio sheet...
Target sheet data: 598 rows
Found 56 unique commodities
Saving updated Stripping_Ratio sheet...

Results:
Created 56 commodity stripping ratio columns
Existing values kept (not overwritten): 0
Global ratios applied to empty cells: 1312
Total stripping ratio values: 1312

Commodities WITHOUT stripping ratios (31):
  - Antimony
  - Arsenic
  - Barite
  - Borates
  - Cadmium
  - Caesium
  - Chromite
  - Diamonds
  - Ferronickel
  - Germanium
  - Graphite
  - Ilmenite
  - Indium
  - Kaolin
  - Lead
  - Limestone
  - Magnetite
  - Niobium
  - Osmium
  - Phosphate
  - Potash
  - Potassium Sulfate
  - Rhenium
  - Rubidium
  - Selenium
  - Silica
  - Spodumene
  - Strontium
  - Tellurium
  - U3O8
  - Zinc-Lead

Merge completed successfully!


1) mass based allocation of ore tonnage
2) add recovery rates from Greffe
3) calculate RMR's 
4) median imputation for missing RMR's based on calculated

In [24]:
import pandas as pd
import numpy as np

# File path
file_path = "Data Input/Byproduct/Byproducts_Cumprod_final.xlsx"

# Read the two sheets
print("Reading data...")
cumprod_df = pd.read_excel(file_path, sheet_name='CumProd')
ore_tonnage_df = pd.read_excel(file_path, sheet_name='Ore_Tonnage')

print(f"CumProd sheet: {len(cumprod_df)} rows")
print(f"Ore_Tonnage sheet: {len(ore_tonnage_df)} rows")

# Get primary and byproduct columns
all_columns = cumprod_df.columns.tolist()
primary_column = 'CumProd_Tonnes'

if primary_column not in all_columns:
    print(f"Error: Primary column '{primary_column}' not found in CumProd sheet")
    exit(1)

primary_idx = all_columns.index(primary_column)
# Get all columns after CumProd_Tonnes as potential byproduct columns
byproduct_columns = all_columns[primary_idx+1:]

# Calculate commodity shares for each mine
mine_commodity_shares = {}
mines_processed = 0

for _, row in cumprod_df.iterrows():
    prop_id = row['PROP_ID']
    primary_commodity = row.get('PRIMARY_COMMODITY', 'Primary')
    
    # Get primary production
    primary_value = row.get(primary_column, 0)
    if pd.isna(primary_value):
        primary_value = 0
    
    # Calculate total production (primary + byproducts)
    production_values = {}
    
    # Add primary production
    if primary_value > 0:
        production_values[primary_commodity] = primary_value
    
    # Add byproduct production
    for col in byproduct_columns:
        value = row.get(col, 0)
        if pd.notna(value) and value > 0:
            # Extract commodity name from byproduct column "Byproduct_[commodity]_Tonnes"
            if col.startswith('Byproduct_') and '_Tonnes' in col:
                # Format: "Byproduct_Gold_Tonnes" -> Extract "Gold"
                commodity_name = col.replace('Byproduct_', '').replace('_Tonnes', '')
            else:
                # Use column name as is
                commodity_name = col
            
            production_values[commodity_name] = value
    
    # Calculate total production
    total_production = sum(production_values.values())
    
    if total_production > 0:
        # Calculate commodity shares
        shares = {commodity: value/total_production for commodity, value in production_values.items()}
        mine_commodity_shares[prop_id] = shares
        mines_processed += 1

print(f"Calculated commodity shares for {mines_processed} mines")

# Create commodity ore columns in ore_tonnage_df
# First, collect all unique commodities
all_commodities = set()
for shares in mine_commodity_shares.values():
    all_commodities.update(shares.keys())

print(f"Found {len(all_commodities)} unique commodities")

# Add new columns for each commodity
for commodity in sorted(all_commodities):
    column_name = f'{commodity}_Ore'
    ore_tonnage_df[column_name] = np.nan

# Fill commodity ore tonnage values
mines_with_ore = 0
mines_without_ore = 0

for index, row in ore_tonnage_df.iterrows():
    prop_id = row['PROP_ID']
    ore_tonnage = row.get('Cum_Ore_Tonnage')
    
    if pd.isna(ore_tonnage):
        mines_without_ore += 1
        continue
    
    if prop_id in mine_commodity_shares:
        mines_with_ore += 1
        shares = mine_commodity_shares[prop_id]
        for commodity, share in shares.items():
            column_name = f'{commodity}_Ore'
            ore_tonnage_df.at[index, column_name] = ore_tonnage * share

print(f"Mines with ore tonnage: {mines_with_ore}")
print(f"Mines without ore tonnage: {mines_without_ore}")

# Save the updated file
print("Saving updated file...")
with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    ore_tonnage_df.to_excel(writer, sheet_name='Ore_Tonnage', index=False)

print("Processing completed successfully!")

Reading data...
CumProd sheet: 598 rows
Ore_Tonnage sheet: 598 rows
Calculated commodity shares for 598 mines
Found 24 unique commodities
Mines with ore tonnage: 367
Mines without ore tonnage: 231
Saving updated file...
Processing completed successfully!


In [28]:
import pandas as pd
import numpy as np

# File path
file_path = "Data Input/Byproduct/Byproducts_Cumprod_final.xlsx"

print("Reading data from all sheets...")
# Read all sheets
cumprod_df = pd.read_excel(file_path, sheet_name='CumProd')
stripping_df = pd.read_excel(file_path, sheet_name='Stripping_Ratio')
ore_df = pd.read_excel(file_path, sheet_name='Ore_Tonnage')
recovery_df = pd.read_excel(file_path, sheet_name='Recovery_Rates')

# Try to read RMR sheet, create if it doesn't exist
try:
    rmr_df = pd.read_excel(file_path, sheet_name='RMR')
    print("Found existing RMR sheet.")
except:
    print("RMR sheet not found. Creating new sheet.")
    # Create RMR sheet with PROP_ID column
    rmr_df = pd.DataFrame(columns=['PROP_ID'])
    # Copy PROP_ID values from one of the other sheets
    rmr_df['PROP_ID'] = ore_df['PROP_ID'].copy()

print(f"CumProd: {len(cumprod_df)} rows")
print(f"Stripping Ratio: {len(stripping_df)} rows")
print(f"Ore_Tonnage: {len(ore_df)} rows")
print(f"Recovery Rates: {len(recovery_df)} rows")
print(f"RMR: {len(rmr_df)} rows")

# Create a dictionary of recovery rates
recovery_rates = {}
for _, row in recovery_df.iterrows():
    commodity = row['Name']
    recovery_rate = row['RR_ALL']
    if pd.notna(recovery_rate):
        recovery_rates[commodity] = recovery_rate

print(f"Found recovery rates for {len(recovery_rates)} commodities")

# Find all available commodities across sheets
all_commodities = set()

# Check Ore_Tonnage sheet for commodity_Ore columns
ore_columns = [col for col in ore_df.columns if col.endswith('_Ore')]
for col in ore_columns:
    commodity = col.replace('_Ore', '')
    all_commodities.add(commodity)

# Check Stripping Ratio sheet for commodity_SR columns
sr_columns = [col for col in stripping_df.columns if col.endswith('_SR')]
for col in sr_columns:
    commodity = col.replace('_SR', '')
    all_commodities.add(commodity)

# Check CumProd for Byproduct_commodity_Tonnes columns
byproduct_columns = [col for col in cumprod_df.columns if col.startswith('Byproduct_') and col.endswith('_Tonnes')]
for col in byproduct_columns:
    commodity = col.replace('Byproduct_', '').replace('_Tonnes', '')
    all_commodities.add(commodity)

# Also add the primary production commodity
if 'PRIMARY_COMMODITY' in cumprod_df.columns and 'CumProd_Tonnes' in cumprod_df.columns:
    primary_commodities = cumprod_df['PRIMARY_COMMODITY'].unique()
    for commodity in primary_commodities:
        if pd.notna(commodity):
            all_commodities.add(commodity)

print(f"Found {len(all_commodities)} unique commodities across all sheets")

# Create RMR columns for each commodity
for commodity in all_commodities:
    rmr_column_name = f"{commodity}_RMR"
    if rmr_column_name not in rmr_df.columns:
        rmr_df[rmr_column_name] = np.nan

# Calculate RMR for each mine and commodity
mines_processed = 0
commodities_calculated = 0

# Track missing data for each mine
missing_data = {}

# Process each mine directly from CumProd sheet to identify primary and byproduct commodities
for _, cumprod_row in cumprod_df.iterrows():
    prop_id = cumprod_row['PROP_ID']
    primary_commodity = cumprod_row.get('PRIMARY_COMMODITY')
    primary_production = cumprod_row.get('CumProd_Tonnes', 0)
    
    # Skip if no primary commodity or it's not valid
    if not pd.notna(primary_commodity) or not pd.notna(primary_production):
        continue
    
    # Initialize tracking for this mine
    if prop_id not in missing_data:
        missing_data[prop_id] = {'missing_commodities': [], 'missing_reasons': {}}
    
    # Find this mine in other sheets
    ore_row = ore_df[ore_df['PROP_ID'] == prop_id]
    sr_row = stripping_df[stripping_df['PROP_ID'] == prop_id]
    rmr_idx = rmr_df[rmr_df['PROP_ID'] == prop_id].index
    
    if rmr_idx.empty or ore_row.empty or sr_row.empty:
        # Track major missing data
        if rmr_idx.empty:
            missing_data[prop_id]['missing_reasons']['all'] = ['mine not in RMR sheet']
        if ore_row.empty:
            missing_data[prop_id]['missing_reasons']['all'] = ['no ore tonnage data']
        if sr_row.empty:
            missing_data[prop_id]['missing_reasons']['all'] = ['no stripping ratio data']
        continue
    
    # Get index in RMR sheet
    rmr_idx = rmr_idx[0]
    mine_has_data = False
    
    # Collect commodities to process - start with primary commodity
    commodities_to_process = []
    if pd.notna(primary_production) and primary_production > 0:
        commodities_to_process.append((primary_commodity, primary_production))
        
    # Add byproduct commodities with non-zero values
    for col in byproduct_columns:
        byproduct_value = cumprod_row.get(col, 0)
        if pd.notna(byproduct_value) and byproduct_value > 0:
            byproduct_name = col.replace('Byproduct_', '').replace('_Tonnes', '')
            commodities_to_process.append((byproduct_name, byproduct_value))
    
    # Process each commodity for this mine
    for commodity, metal_production in commodities_to_process:
        ore_column = f"{commodity}_Ore"
        sr_column = f"{commodity}_SR"
        rmr_column = f"{commodity}_RMR"
        
        # Track missing data elements
        missing_elements = []
        
        # Check each required data element
        if ore_column not in ore_row.columns or pd.isna(ore_row[ore_column].iloc[0]):
            missing_elements.append('ore tonnage')
        
        if sr_column not in sr_row.columns or pd.isna(sr_row[sr_column].iloc[0]):
            missing_elements.append('stripping ratio')
        
        if commodity not in recovery_rates:
            missing_elements.append('recovery rate')
        
        # Create the column in RMR sheet if it doesn't exist
        if rmr_column not in rmr_df.columns:
            rmr_df[rmr_column] = np.nan
        
        # If we have all required data, calculate RMR
        if not missing_elements:
            # Get the data
            ore_tonnage = ore_row[ore_column].iloc[0]
            stripping_ratio = sr_row[sr_column].iloc[0]
            recovery_rate = recovery_rates[commodity]
            
            # Final value checks
            if recovery_rate <= 0:
                missing_elements.append('invalid recovery rate (≤0)')
            else:
                # Calculate waste tonnage
                waste_tonnage = ore_tonnage * stripping_ratio
                
                # Calculate RMR
                rmr = (ore_tonnage + waste_tonnage) / (metal_production * recovery_rate)
                
                # Add to dataframe
                rmr_df.at[rmr_idx, rmr_column] = rmr
                commodities_calculated += 1
                mine_has_data = True
        
        # If this commodity couldn't be calculated, track what's missing
        if missing_elements:
            missing_data[prop_id]['missing_commodities'].append(commodity)
            missing_data[prop_id]['missing_reasons'][commodity] = missing_elements
    
    if mine_has_data:
        mines_processed += 1

print(f"Calculated RMR for {mines_processed} mines and {commodities_calculated} commodity instances")

# Report missing commodities
print("\nMissing Data Summary:")
print("-" * 60)

mines_with_missing = 0
for prop_id, data in missing_data.items():
    missing_count = len(data['missing_commodities'])
    if missing_count > 0:
        mines_with_missing += 1
        print(f"Mine {prop_id}: Missing {missing_count} commodities")
        
        # Group by reason to avoid lengthy output
        reason_groups = {}
        for commodity, reasons in data['missing_reasons'].items():
            reason_key = ', '.join(sorted(reasons))
            if reason_key not in reason_groups:
                reason_groups[reason_key] = []
            if commodity != 'all':
                reason_groups[reason_key].append(commodity)
        
        # Print grouped by reason
        for reason, commodities in reason_groups.items():
            if len(commodities) <= 5:
                print(f"  - Missing {reason} for: {', '.join(commodities)}")
            else:
                print(f"  - Missing {reason} for: {len(commodities)} commodities including {', '.join(commodities[:5])}...")

print("-" * 60)
print(f"Total mines with missing data: {mines_with_missing} out of {len(rmr_df)}")

# Save RMR sheet to file
print("\nSaving RMR sheet to file...")
with pd.ExcelWriter(file_path, mode='a', if_sheet_exists='replace', engine='openpyxl') as writer:
    rmr_df.to_excel(writer, sheet_name='RMR', index=False)

print("RMR calculation complete!")

Reading data from all sheets...
Found existing RMR sheet.
CumProd: 598 rows
Stripping Ratio: 598 rows
Ore_Tonnage: 598 rows
Recovery Rates: 64 rows
RMR: 598 rows
Found recovery rates for 64 commodities
Found 56 unique commodities across all sheets
Calculated RMR for 367 mines and 894 commodity instances

Missing Data Summary:
------------------------------------------------------------
Mine 62580: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 55209: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 59898: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 62576: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 68363: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 58546: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 66249: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 68362: Missing 1 commodities
  - Missing ore tonnage for: Bauxite
Mine 58547: Missing 1 commodities
  - Missi

Median based RMR imputation
1) 231 out of 598 mines dont have RMR values (38%)

In [29]:
import pandas as pd
import numpy as np

# File path
file_path = "Data Input/Byproduct/Byproducts_CumProd_final.xlsx"
output_file_path = "Data Input/Byproduct/Byproducts_CumProd_final.xlsx"

# Define the mapping of countries to regions
COUNTRY_TO_REGION = {
    # Africa
    'Ghana': 'Africa',
    'Guinea': 'Africa',
    'Sierra Leone': 'Africa',
    'South Africa': 'Africa',
    'Mozambique': 'Africa',
    'Zimbabwe': 'Africa',
    'Botswana': 'Africa',
    'Tanzania': 'Africa',
    'Morocco': 'Africa',
    'Dem. Rep. Congo': 'Africa',
    'Zambia': 'Africa',
    'Namibia': 'Africa',
    'Mauritania': 'Africa',
    'Madagascar': 'Africa',
    'Liberia': 'Africa',
    'Algeria': 'Africa',
    'Gabon': 'Africa',
    'Rwanda': 'Africa',
    'Eritrea': 'Africa',
    'Burkina Faso': 'Africa',
   
    # Asia
    'Saudi Arabia': 'Middle East',
    'India': 'Asia',
    'China': 'Asia',
    'Kazakhstan': 'Asia',
    'Russia': 'Eurasia',
    'Indonesia': 'Asia',
    'Mongolia': 'Asia',
    'Uzbekistan': 'Asia',
    'Bangladesh': 'Asia',
    'Philippines': 'Asia',
    'Türkiye': 'Eurasia',
    'Pakistan': 'Asia',
    'Thailand': 'Asia',
    'Myanmar': 'Asia',
    'Laos': 'Asia',
    'Iran': 'Middle East',
    'Armenia': 'Eurasia',
    'Sri Lanka': 'Asia',
    'Malaysia': 'Asia',
    'Tajikistan': 'Asia',
    'South Korea': 'Asia',
    'Vietnam': 'Asia',
   
    # Europe
    'Greece': 'Europe',
    'Hungary': 'Europe',
    'Montenegro': 'Europe',
    'Ukraine': 'Europe',
    'Norway': 'Europe',
    'Czechia': 'Europe',
    'Poland': 'Europe',
    'Slovenia': 'Europe',
    'Spain': 'Europe',
    'Serbia': 'Europe',
    'Germany': 'Europe',
    'United Kingdom': 'Europe',
    'Bulgaria': 'Europe',
    'Sweden': 'Europe',
    'Portugal': 'Europe',
    'Romania': 'Europe',
    'North Macedonia': 'Europe',
    'Albania': 'Europe',
    'Georgia': 'Eurasia',
    'Cyprus': 'Europe',
    'Austria': 'Europe',
    'Bosnia & Herzegovina': 'Europe',
    'Finland': 'Europe',
    'Ireland': 'Europe',
   
    # North America
    'USA': 'North America',
    'Canada': 'North America',
    'Mexico': 'North America',
    'Dominican Republic': 'Caribbean',
    'Jamaica': 'Caribbean',
    'Panama': 'Central America',
    'Guatemala': 'Central America',
    'Cuba': 'Caribbean',
    'Honduras': 'Central America',
   
    # South America
    'Guyana': 'South America',
    'Brazil': 'South America',
    'Venezuela': 'South America',
    'Colombia': 'South America',
    'Argentina': 'South America',
    'Chile': 'South America',
    'Peru': 'South America',
    'Ecuador': 'South America',
    'Bolivia': 'South America',
   
    # Oceania
    'Australia': 'Oceania',
    'New Zealand': 'Oceania',
    'Papua New Guinea': 'Oceania',
    'New Caledonia': 'Oceania'
}

# Read the sheets we need
print("Reading data from Excel file...")
cumprod_df = pd.read_excel(file_path, sheet_name='CumProd')
rmr_df = pd.read_excel(file_path, sheet_name='RMR')

print(f"CumProd sheet: {len(cumprod_df)} rows")
print(f"RMR sheet: {len(rmr_df)} rows")

# Check available columns
print(f"CumProd columns: {list(cumprod_df.columns)}")
print(f"RMR columns: {list(rmr_df.columns)}")

# Find all commodities in the COMMODITIES_LIST
all_commodities = set()
for commodities_list in cumprod_df['COMMODITIES_LIST'].dropna():
    commodities = [c.strip() for c in str(commodities_list).split(',')]
    all_commodities.update(commodities)

# Add primary commodities as well
if 'PRIMARY_COMMODITY' in cumprod_df.columns:
    all_commodities.update(cumprod_df['PRIMARY_COMMODITY'].dropna())

print(f"Found {len(all_commodities)} unique commodities")

# Make sure all commodity columns exist in RMR sheet
for commodity in all_commodities:
    rmr_column = f"{commodity}_RMR"
    if rmr_column not in rmr_df.columns:
        rmr_df[rmr_column] = np.nan

# Create lookup for Primary Commodity by PROP_ID
primary_commodity_lookup = {}
for _, row in cumprod_df.iterrows():
    if 'PROP_ID' in row and 'PRIMARY_COMMODITY' in row and pd.notna(row['PRIMARY_COMMODITY']):
        primary_commodity_lookup[row['PROP_ID']] = row['PRIMARY_COMMODITY']

# Create lookup for commodities by PROP_ID
mine_commodities = {}
for _, row in cumprod_df.iterrows():
    prop_id = row['PROP_ID']
    commodities = []
    
    # Add primary commodity
    if 'PRIMARY_COMMODITY' in row and pd.notna(row['PRIMARY_COMMODITY']):
        commodities.append(row['PRIMARY_COMMODITY'])
    
    # Add commodities from COMMODITIES_LIST
    if 'COMMODITIES_LIST' in row and pd.notna(row['COMMODITIES_LIST']):
        commodities.extend([c.strip() for c in str(row['COMMODITIES_LIST']).split(',')])
    
    if commodities:
        mine_commodities[prop_id] = list(set(commodities))  # Remove duplicates

# Merge CumProd and RMR dataframes to get all necessary columns
# Ensure PROP_ID columns are integers
rmr_df['PROP_ID'] = rmr_df['PROP_ID'].astype(int)
cumprod_df['PROP_ID'] = cumprod_df['PROP_ID'].astype(int)

merged_df = rmr_df.merge(
    cumprod_df[['PROP_ID', 'PRIMARY_COMMODITY', 'COUNTRY_NAME', 'MINE_TYPE1']], 
    on='PROP_ID', 
    how='left'
)

# Check if columns got renamed during merge
if 'COUNTRY_NAME' not in merged_df.columns:
    if 'COUNTRY_NAME_y' in merged_df.columns:
        merged_df['COUNTRY_NAME'] = merged_df['COUNTRY_NAME_y']
    elif 'COUNTRY_NAME_x' in merged_df.columns:
        merged_df['COUNTRY_NAME'] = merged_df['COUNTRY_NAME_x']

if 'PRIMARY_COMMODITY' not in merged_df.columns:
    if 'PRIMARY_COMMODITY_y' in merged_df.columns:
        merged_df['PRIMARY_COMMODITY'] = merged_df['PRIMARY_COMMODITY_y']
    elif 'PRIMARY_COMMODITY_x' in merged_df.columns:
        merged_df['PRIMARY_COMMODITY'] = merged_df['PRIMARY_COMMODITY_x']

if 'MINE_TYPE1' not in merged_df.columns:
    if 'MINE_TYPE1_y' in merged_df.columns:
        merged_df['MINE_TYPE1'] = merged_df['MINE_TYPE1_y']
    elif 'MINE_TYPE1_x' in merged_df.columns:
        merged_df['MINE_TYPE1'] = merged_df['MINE_TYPE1_x']

# Add region column (handle NaN values)
merged_df['REGION'] = merged_df['COUNTRY_NAME'].fillna('').map(COUNTRY_TO_REGION)

def get_hierarchical_median(commodity, primary_commodity, country_name, region, mine_type, merged_df):
    """
    Get median RMR value using hierarchical imputation with minimum sample requirements:
    Level 1: primary commodity, country name, mine type (min 3 samples)
    Level 2: primary commodity, region, mine type (min 3 samples)
    Level 3: primary commodity, mine type (no minimum)
    """
    rmr_column = f"{commodity}_RMR"
    
    if rmr_column not in merged_df.columns:
        return np.nan, 'no_column'
    
    # Special handling for Lithium: treat Open Pit and Brine as the same mine type
    def create_mask(df, primary_comm, country, reg, mine_tp):
        base_mask = (df['PRIMARY_COMMODITY'] == primary_comm) & (pd.notna(df[rmr_column]))
        
        if primary_comm == 'Lithium':
            # For lithium, treat Open Pit and Brine as equivalent
            if mine_tp in ['Open Pit', 'Brine']:
                mine_type_mask = df['MINE_TYPE1'].isin(['Open Pit', 'Brine'])
            else:
                mine_type_mask = (df['MINE_TYPE1'] == mine_tp)
        else:
            mine_type_mask = (df['MINE_TYPE1'] == mine_tp)
        
        if country is not None:
            country_mask = (df['COUNTRY_NAME'] == country)
        else:
            country_mask = pd.Series([True] * len(df))
        
        if reg is not None:
            region_mask = (df['REGION'] == reg)
        else:
            region_mask = pd.Series([True] * len(df))
        
        return base_mask & mine_type_mask & country_mask & region_mask
    
    # Level 1: primary commodity, country name, mine type (minimum 3 samples)
    level1_mask = create_mask(merged_df, primary_commodity, country_name, None, mine_type)
    
    if level1_mask.sum() >= 3:
        median_val = merged_df.loc[level1_mask, rmr_column].median()
        if pd.notna(median_val):
            return median_val, 'level1'
    
    # Level 2: primary commodity, region, mine type (minimum 3 samples)
    if pd.notna(region):
        level2_mask = create_mask(merged_df, primary_commodity, None, region, mine_type)
        
        if level2_mask.sum() >= 3:
            median_val = merged_df.loc[level2_mask, rmr_column].median()
            if pd.notna(median_val):
                return median_val, 'level2'
    
    # Level 3: primary commodity, mine type (no minimum requirement)
    level3_mask = create_mask(merged_df, primary_commodity, None, None, mine_type)
    
    if level3_mask.sum() >= 3:
        median_val = merged_df.loc[level3_mask, rmr_column].median()
        if pd.notna(median_val):
            return median_val, 'level3'
    
    return np.nan, 'no_match'

# Apply hierarchical median imputation
values_added = 0
imputation_stats = {'level1': 0, 'level2': 0, 'level3': 0, 'no_match': 0}
sample_size_stats = {'level1_insufficient': 0, 'level2_insufficient': 0}

for idx, row in merged_df.iterrows():
    prop_id = row['PROP_ID']
    primary_commodity = row['PRIMARY_COMMODITY']
    country_name = row['COUNTRY_NAME']
    region = row['REGION']
    mine_type = row['MINE_TYPE1']
    
    if prop_id in mine_commodities:
        for commodity in mine_commodities[prop_id]:
            rmr_column = f"{commodity}_RMR"
            
            # Only impute if column exists and current value is empty
            if rmr_column in merged_df.columns and pd.isna(row[rmr_column]):
                # Check sample sizes for statistics
                level1_mask = (
                    (merged_df['PRIMARY_COMMODITY'] == primary_commodity) &
                    (merged_df['COUNTRY_NAME'] == country_name) &
                    (merged_df['MINE_TYPE1'] == mine_type) &
                    (pd.notna(merged_df[rmr_column]))
                )
                
                level2_mask = (
                    (merged_df['PRIMARY_COMMODITY'] == primary_commodity) &
                    (merged_df['REGION'] == region) &
                    (merged_df['MINE_TYPE1'] == mine_type) &
                    (pd.notna(merged_df[rmr_column]))
                ) if pd.notna(region) else pd.Series([False] * len(merged_df))
                
                # Track insufficient sample sizes
                if 0 < level1_mask.sum() < 3:
                    sample_size_stats['level1_insufficient'] += 1
                if 0 < level2_mask.sum() < 3:
                    sample_size_stats['level2_insufficient'] += 1
                
                imputed_value, level_used = get_hierarchical_median(
                    commodity, primary_commodity, country_name, region, mine_type, merged_df
                )
                
                if pd.notna(imputed_value):
                    merged_df.at[idx, rmr_column] = imputed_value
                    values_added += 1
                    imputation_stats[level_used] += 1
                else:
                    imputation_stats['no_match'] += 1

print(f"\nHierarchical median imputation completed!")
print(f"Total values imputed: {values_added}")
print(f"Level 1 (primary commodity + country + mine type, min 3 samples): {imputation_stats['level1']}")
print(f"Level 2 (primary commodity + region + mine type, min 3 samples): {imputation_stats['level2']}")
print(f"Level 3 (primary commodity + mine type, min 2 samples): {imputation_stats['level3']}")
print(f"No matches found: {imputation_stats['no_match']}")
print(f"\nSample size statistics:")
print(f"Level 1 had insufficient samples (<3): {sample_size_stats['level1_insufficient']}")
print(f"Level 2 had insufficient samples (<3): {sample_size_stats['level2_insufficient']}")

# Update the original RMR dataframe with imputed values
for commodity in all_commodities:
    rmr_column = f"{commodity}_RMR"
    if rmr_column in rmr_df.columns and rmr_column in merged_df.columns:
        rmr_df[rmr_column] = merged_df[rmr_column]

# Count mines with zero RMRs after imputation
mines_with_zero_rmr = 0
mines_with_some_rmr = 0
mines_with_all_rmr = 0
zero_rmr_mines = []

for idx, row in rmr_df.iterrows():
    prop_id = row['PROP_ID']
    
    if prop_id in mine_commodities:
        commodity_columns = [f"{c}_RMR" for c in mine_commodities[prop_id]]
        existing_columns = [col for col in commodity_columns if col in rmr_df.columns]
        
        if not existing_columns:
            continue
            
        values = [row[col] for col in existing_columns if pd.notna(row[col])]
        
        if not values:
            mines_with_zero_rmr += 1
            zero_rmr_mines.append(prop_id)
        elif len(values) == len(existing_columns):
            mines_with_all_rmr += 1
        else:
            mines_with_some_rmr += 1

print("\nMines RMR Status After Imputation:")
print(f"Mines with zero RMR values: {mines_with_zero_rmr}")
print(f"Mines with some RMR values: {mines_with_some_rmr}")
print(f"Mines with all RMR values: {mines_with_all_rmr}")
print(f"Total mines analyzed: {mines_with_zero_rmr + mines_with_some_rmr + mines_with_all_rmr}")

# Print remaining mines with zero RMR values
if zero_rmr_mines:
    print(f"\nRemaining mines with zero RMR values ({len(zero_rmr_mines)} mines):")
    print("=" * 60)
    print(f"{'PROP_ID':<10} | {'PRIMARY_COMMODITY':<20}")
    print("-" * 60)
    
    for prop_id in zero_rmr_mines:
        primary_commodity = primary_commodity_lookup.get(prop_id, "Unknown")
        print(f"{prop_id:<10} | {primary_commodity:<20}")
    
    print("=" * 60)

# Save to new file
print(f"\nSaving updated data to {output_file_path}...")

# Update the RMR sheet in the new file
with pd.ExcelWriter(output_file_path, mode='a', if_sheet_exists='replace', engine='openpyxl') as writer:
    rmr_df.to_excel(writer, sheet_name='RMR', index=False)

print("Hierarchical median imputation completed successfully!")

Reading data from Excel file...
CumProd sheet: 598 rows
RMR sheet: 598 rows
CumProd columns: ['PROP_ID', 'PROP_NAME', 'PRIMARY_COMMODITY', 'COMMODITIES_LIST', 'COUNTRY_NAME', 'MINE_TYPE1', 'CumProd_Tonnes', 'Byproduct_Bauxite_Tonnes', 'Byproduct_Nickel_Tonnes', 'Byproduct_Copper_Tonnes', 'Byproduct_Gold_Tonnes', 'Byproduct_Silver_Tonnes', 'Byproduct_Lead_Tonnes', 'Byproduct_Zinc_Tonnes', 'Byproduct_Molybdenum_Tonnes', 'Byproduct_Iron Ore_Tonnes', 'Byproduct_Cadmium_Tonnes', 'Byproduct_Cobalt_Tonnes', 'Byproduct_Manganese_Tonnes', 'Byproduct_Gallium_Tonnes', 'Byproduct_Rhenium_Tonnes', 'Byproduct_Selenium_Tonnes', 'Byproduct_Rhodium_Tonnes', 'Byproduct_Tungsten_Tonnes', 'Byproduct_Lanthanides_Tonnes', 'Byproduct_Platinum_Tonnes', 'Byproduct_Palladium_Tonnes', 'Byproduct_Titanium_Tonnes', 'Byproduct_Vanadium_Tonnes', 'Byproduct_Tin_Tonnes', 'Byproduct_Lithium_Tonnes', 'Byproduct_Tantalum_Tonnes', 'Byproduct_Niobium_Tonnes', 'Byproduct_Magnesium_Tonnes', 'Byproduct_Tellurium_Tonnes', 'Byp

RMR x CumProd of byproducts and primary commodity

1) multiply the RMR's found in sheet RMR by the Cumprod found in sheet CumProd
2) save to file path Data Input/Commodity Production/Rock/Total 


In [30]:
import pandas as pd
import numpy as np

# File path
file_path = "Data Input/Byproduct/Byproducts_CumProd_final.xlsx"

print("Reading data from Excel file...")
# Read all required sheets
rmr_df = pd.read_excel(file_path, sheet_name='RMR')
cumprod_df = pd.read_excel(file_path, sheet_name='CumProd')

# Check if Total_Rock sheet exists, if not create it
try:
    total_rock_df = pd.read_excel(file_path, sheet_name='Total_Rock')
    print("Found existing Total_Rock sheet.")
except:
    print("Total_Rock sheet not found. Creating new sheet.")
    # Create Total_Rock sheet with PROP_ID column
    total_rock_df = pd.DataFrame()
    total_rock_df['PROP_ID'] = rmr_df['PROP_ID'].copy()

print(f"RMR sheet: {len(rmr_df)} rows")
print(f"CumProd sheet: {len(cumprod_df)} rows")
print(f"Total_Rock sheet: {len(total_rock_df)} rows")

# Find all commodities with RMR values
rmr_columns = [col for col in rmr_df.columns if col.endswith('_RMR')]
unique_commodities = [col.replace('_RMR', '') for col in rmr_columns]

print(f"Found {len(unique_commodities)} commodities with RMR values")

# Process each mine
commodities_processed = 0
total_calculations = 0

# Create lookup dictionaries for easier access
rmr_lookup = {}
for _, row in rmr_df.iterrows():
    prop_id = row['PROP_ID']
    rmr_lookup[prop_id] = {}
    for commodity in unique_commodities:
        rmr_col = f"{commodity}_RMR"
        if rmr_col in rmr_df.columns and pd.notna(row.get(rmr_col)):
            rmr_lookup[prop_id][commodity] = row[rmr_col]

# Find primary commodity and production for each mine
primary_lookup = {}
cumprod_lookup = {}
for _, row in cumprod_df.iterrows():
    prop_id = row['PROP_ID']
    
    # Store primary commodity and its production
    if 'PRIMARY_COMMODITY' in row and 'CumProd_Tonnes' in row:
        primary_commodity = row['PRIMARY_COMMODITY']
        primary_production = row['CumProd_Tonnes']
        
        if pd.notna(primary_commodity) and pd.notna(primary_production):
            primary_lookup[prop_id] = primary_commodity
            
            if prop_id not in cumprod_lookup:
                cumprod_lookup[prop_id] = {}
            
            cumprod_lookup[prop_id][primary_commodity] = primary_production
    
    # Store byproduct production
    byproduct_columns = [col for col in row.index if col.startswith('Byproduct_') and col.endswith('_Tonnes')]
    for col in byproduct_columns:
        byproduct_value = row[col]
        if pd.notna(byproduct_value):
            byproduct_name = col.replace('Byproduct_', '').replace('_Tonnes', '')
            
            if prop_id not in cumprod_lookup:
                cumprod_lookup[prop_id] = {}
            
            cumprod_lookup[prop_id][byproduct_name] = byproduct_value

# Calculate rock tonnage for each commodity at each mine
for prop_id in total_rock_df['PROP_ID']:
    if prop_id in rmr_lookup and prop_id in cumprod_lookup:
        # Get RMR values for this mine
        mine_rmr = rmr_lookup[prop_id]
        
        # Get cumulative production values for this mine
        mine_cumprod = cumprod_lookup[prop_id]
        
        # Calculate rock tonnage for each commodity
        for commodity, production in mine_cumprod.items():
            if commodity in mine_rmr:
                rmr_value = mine_rmr[commodity]
                
                # Calculate rock tonnage
                rock_tonnage = production * rmr_value
                
                # Create column if it doesn't exist
                rock_column = f"{commodity}_Rock"
                if rock_column not in total_rock_df.columns:
                    total_rock_df[rock_column] = np.nan
                
                # Get index for this mine
                mine_idx = total_rock_df[total_rock_df['PROP_ID'] == prop_id].index[0]
                
                # Set the rock tonnage value
                total_rock_df.at[mine_idx, rock_column] = rock_tonnage
                
                total_calculations += 1
                
        commodities_processed += 1

print(f"Processed {commodities_processed} mines")
print(f"Calculated {total_calculations} commodity rock tonnages")

# Save the updated Total_Rock sheet
print("Saving Total_Rock sheet...")
with pd.ExcelWriter(file_path, mode='a', if_sheet_exists='replace', engine='openpyxl') as writer:
    total_rock_df.to_excel(writer, sheet_name='Total_Rock', index=False)

print("Total_Rock sheet updated successfully!")

Reading data from Excel file...
Found existing Total_Rock sheet.
RMR sheet: 598 rows
CumProd sheet: 598 rows
Total_Rock sheet: 598 rows
Found 56 commodities with RMR values
Processed 598 mines
Calculated 1250 commodity rock tonnages
Saving Total_Rock sheet...
Total_Rock sheet updated successfully!
