Merge the area of the mine polygons with the mines (do this step before the byproduct analysis)
1) Merge areas based on the cluster id and drop mines that have no area value --> about 63 mines
2) Sum up all polygon areas associated with a single mine
3) About 30% of mines share a cluster id with at least one other mine

In [5]:
import pandas as pd
import numpy as np

def merge_area_data():
    """Attach polygon areas to mines and drop mines that have no area.

    Reads the polygon attribute table and the commodity production workbook,
    repeats every mine row once per polygon whose ``id_cluster`` equals the
    mine's ``id_cluster`` (the polygon's ``area_mine`` is stored in the
    ``area_total`` column), removes rows whose ``area_total`` is missing and
    writes the result back over the production workbook.

    NOTE: the production workbook is overwritten in place, so running this
    step again on its own output expands the already-expanded rows a second
    time. Regenerate the workbook before re-running.

    Raises:
        ValueError: If ``id_cluster``/``area_mine`` is missing from the
            polygon table, or ``PROP_ID``/``id_cluster`` from the mines table.
    """
    input_file = "Data Input/Maus_Spa_Data/polygon_attribute_table_all.xlsx"
    # This workbook is both the mines input and the destination of the result.
    output_file = "Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx"

    print(f"Reading data from {input_file}...")
    area_df = pd.read_excel(input_file)

    print(f"Reading data from {output_file}...")
    mines_df = pd.read_excel(output_file)

    if 'id_cluster' not in area_df.columns or 'area_mine' not in area_df.columns:
        raise ValueError("Missing required columns in input dataset")

    if 'PROP_ID' not in mines_df.columns or 'id_cluster' not in mines_df.columns:
        raise ValueError("Missing required columns in output dataset")

    print("Processing mines and their associated areas...")
    # Collect the polygon areas of each cluster once instead of rescanning the
    # polygon table for every mine. groupby skips polygons without a cluster
    # id, which can never match a mine anyway.
    areas_by_cluster = area_df.groupby('id_cluster')['area_mine'].agg(list)

    # Give each mine the list of areas of its cluster (NaN when the cluster
    # has no polygons), then unfold the lists into one row per polygon. Mines
    # keep their original order and polygons keep their table order.
    result_df = mines_df.copy()
    result_df['area_total'] = result_df['id_cluster'].map(areas_by_cluster)
    result_df = result_df.explode('area_total').reset_index(drop=True)
    # explode leaves an object column behind; restore a numeric dtype if possible.
    result_df['area_total'] = result_df['area_total'].infer_objects()

    print("Removing rows with no area values...")
    result_df_filtered = result_df.dropna(subset=['area_total'])

    removed_count = len(result_df) - len(result_df_filtered)
    print(f"Removed {removed_count} rows with no area values")

    print(f"Saving results to {output_file}...")
    result_df_filtered.to_excel(output_file, index=False)

    original_mines = len(mines_df)
    final_rows = len(result_df_filtered)
    print(f"Original number of mines: {original_mines}")
    print(f"Final number of rows with area values: {final_rows}")
    print("Process completed successfully")

if __name__ == "__main__":
    # Report any failure as a single message rather than a traceback.
    try:
        merge_area_data()
    except Exception as err:
        print(f"Error: {err}")

Reading data from Data Input/Maus_Spa_Data/polygon_attribute_table_all.xlsx...
Reading data from Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx...
Processing mines and their associated areas...
Removing rows with no area values...
Removed 60 rows with no area values
Saving results to Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx...
Original number of mines: 598
Final number of rows with area values: 11902
Process completed successfully


In [7]:
import pandas as pd

def sum_areas_by_mine():
    """Collapse the per-polygon rows into one row per mine with a summed area.

    Reads the production workbook, adds up ``area_total`` for every
    ``PROP_ID``, keeps the first row of each ``PROP_ID`` for all remaining
    columns, joins the summed area back on and overwrites the workbook.

    Raises:
        ValueError: If ``PROP_ID`` or ``area_total`` is absent from the workbook.
    """
    workbook = "Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx"

    print(f"Reading data from {workbook}...")
    table = pd.read_excel(workbook)

    # Fail early, naming every column that is absent
    absent = [name for name in ('PROP_ID', 'area_total') if name not in table.columns]
    if absent:
        raise ValueError(f"Missing columns in input file: {', '.join(absent)}")

    print("Calculating total area for each mine...")
    # Total area per mine
    per_mine_area = table.groupby('PROP_ID')['area_total'].sum()
    per_mine_area = per_mine_area.reset_index()

    # First occurrence of each mine, without the per-polygon area column
    first_rows = table.drop(columns='area_total').drop_duplicates(subset=['PROP_ID'])

    # Attach the totals to the one-row-per-mine table
    combined = first_rows.merge(per_mine_area, on='PROP_ID', how='left')

    print(f"Saving results to {workbook}...")
    combined.to_excel(workbook, index=False)

    print(f"Processed {len(table)} rows into {len(combined)} unique mines")
    print("Process completed successfully")

if __name__ == "__main__":
    # Report any failure as a single message rather than a traceback.
    try:
        sum_areas_by_mine()
    except Exception as err:
        print(f"Error: {err}")

Reading data from Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx...
Calculating total area for each mine...
Saving results to Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx...
Processed 11902 rows into 538 unique mines
Process completed successfully


In [9]:
import pandas as pd
import numpy as np

def analyze_mine_clusters(excel_path):
    """
    Analyze the clustering patterns of mines in the dataset.

    Prints how many mines share a cluster with another mine, how many mines
    are assigned to more than one cluster, how many are in both groups, and
    a summary that splits all mines into those four categories.

    Rows with a missing ``PROP_ID`` or ``id_cluster`` are left out of the
    analysis (and reported), because a missing ID is neither a mine nor a
    cluster and must not be grouped under a common placeholder value.

    Parameters:
    -----------
    excel_path : str
        Path to the Excel file containing the final output data

    Raises:
    -------
    ValueError
        If the ``PROP_ID`` or ``id_cluster`` column is not in the file.
    """
    print(f"Reading data from {excel_path}...")
    df = pd.read_excel(excel_path)
    print(f"Found {len(df)} rows in the dataset")

    for column in ('PROP_ID', 'id_cluster'):
        if column not in df.columns:
            raise ValueError(
                f"Column '{column}' not found in the file. Available columns: {df.columns.tolist()}"
            )

    # Casting NaN to str would yield the literal "nan" and make every mine
    # without a cluster look like a member of one shared cluster.
    has_ids = df['PROP_ID'].notna() & df['id_cluster'].notna()
    skipped = int((~has_ids).sum())
    if skipped:
        print(f"Skipping {skipped} rows with a missing PROP_ID or id_cluster")
    df = df.loc[has_ids].copy()

    # Compare IDs as strings so mixed numeric/text IDs group consistently
    df['PROP_ID'] = df['PROP_ID'].astype(str)
    df['id_cluster'] = df['id_cluster'].astype(str)

    total_mines = df['PROP_ID'].nunique()
    print(f"\nTotal number of unique mines (PROP_IDs): {total_mines}")

    total_clusters = df['id_cluster'].nunique()
    print(f"Total number of unique clusters: {total_clusters}")

    if total_mines == 0:
        # Every figure below is a share of total_mines; nothing to report.
        print("No mines with both PROP_ID and id_cluster found; nothing to analyze")
        return

    def share(count):
        """Return *count* as a percentage of all mines."""
        return (count / total_mines) * 100

    # 1. Mines sharing clusters with other mines
    mines_per_cluster = df.groupby('id_cluster')['PROP_ID'].nunique()
    shared_clusters = mines_per_cluster[mines_per_cluster > 1]
    mines_in_shared_clusters = df.loc[
        df['id_cluster'].isin(shared_clusters.index), 'PROP_ID'
    ].unique()
    num_mines_sharing_clusters = len(mines_in_shared_clusters)

    print(f"\n1. Mines sharing clusters with other mines:")
    print(f"   Number of clusters shared by multiple mines: {len(shared_clusters)}")
    print(f"   Number of mines that share clusters with other mines: {num_mines_sharing_clusters} "
          f"({share(num_mines_sharing_clusters):.2f}% of all mines)")

    # 2. Mines with multiple clusters assigned
    clusters_per_mine = df.groupby('PROP_ID')['id_cluster'].nunique()
    multi_cluster_mines = clusters_per_mine[clusters_per_mine > 1]
    num_multi_cluster_mines = len(multi_cluster_mines)

    print(f"\n2. Mines with multiple cluster assignments:")
    print(f"   Number of mines assigned to multiple clusters: {num_multi_cluster_mines} "
          f"({share(num_multi_cluster_mines):.2f}% of all mines)")

    if num_multi_cluster_mines > 0:
        avg_clusters_per_mine = multi_cluster_mines.mean()
        max_clusters_per_mine = multi_cluster_mines.max()
        print(f"   Average number of clusters per mine (for mines with multiple clusters): {avg_clusters_per_mine:.2f}")
        print(f"   Maximum number of clusters assigned to a single mine: {max_clusters_per_mine}")

    # 3. Mines that both have multiple clusters AND share clusters with others
    complex_mines = set(multi_cluster_mines.index) & set(mines_in_shared_clusters)
    num_complex_mines = len(complex_mines)

    print(f"\n3. Complex clustering patterns:")
    print(f"   Mines that both have multiple clusters AND share clusters with other mines: {num_complex_mines} "
          f"({share(num_complex_mines):.2f}% of all mines)")

    # Split the mines into four disjoint categories (inclusion-exclusion)
    num_simple = total_mines - num_mines_sharing_clusters - num_multi_cluster_mines + num_complex_mines
    num_shared_only = num_mines_sharing_clusters - num_complex_mines
    num_multi_only = num_multi_cluster_mines - num_complex_mines

    print(f"\n--- Summary ---")
    print(f"Total mines: {total_mines}")
    print(f"Total clusters: {total_clusters}")
    print(f"Mines with simple clustering (one cluster, not shared): "
          f"{num_simple} "
          f"({share(num_simple):.2f}%)")
    print(f"Mines sharing clusters with others (but having only one cluster): "
          f"{num_shared_only} "
          f"({share(num_shared_only):.2f}%)")
    print(f"Mines with multiple clusters (but not sharing with others): "
          f"{num_multi_only} "
          f"({share(num_multi_only):.2f}%)")
    print(f"Complex mines (multiple clusters AND sharing with others): "
          f"{num_complex_mines} ({share(num_complex_mines):.2f}%)")

# Run the cluster analysis on the final production workbook
if __name__ == "__main__":
    excel_path = "Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx"

    # Report any failure as a single message rather than a traceback.
    try:
        analyze_mine_clusters(excel_path)
    except Exception as err:
        print(f"Error: {err}")

Reading data from Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx...
Found 538 rows in the dataset

Total number of unique mines (PROP_IDs): 538
Total number of unique clusters: 459

1. Mines sharing clusters with other mines:
   Number of clusters shared by multiple mines: 44
   Number of mines that share clusters with other mines: 123 (22.86% of all mines)

2. Mines with multiple cluster assignments:
   Number of mines assigned to multiple clusters: 0 (0.00% of all mines)

3. Complex clustering patterns:
   Mines that both have multiple clusters AND share clusters with other mines: 0 (0.00% of all mines)

--- Summary ---
Total mines: 538
Total clusters: 459
Mines with simple clustering (one cluster, not shared): 415 (77.14%)
Mines sharing clusters with others (but having only one cluster): 123 (22.86%)
Mines with multiple clusters (but not sharing with others): 0 (0.00%)
Complex mines (multiple clusters AND sharing with others): 0 (0.00%)


In [13]:
import pandas as pd
from pathlib import Path

# Workbook locations: byproduct source, production table, filtered output
byproducts_path = Path("Data Input/Byproduct/Byproducts_CumProd_final.xlsx")
prod_path = Path("Data Output/Commodity Production/Commodity_Production-1980_2023_final.xlsx")
out_path = Path("Data Input/Byproduct/Total_Rock.xlsx")


def _with_standard_prop_id(frame):
    """Return *frame* with ``Prop_id`` renamed to ``PROP_ID`` when only the former exists."""
    columns = frame.columns
    if "PROP_ID" in columns or "Prop_id" not in columns:
        return frame
    return frame.rename(columns={"Prop_id": "PROP_ID"})


# Load both tables and settle on a single spelling of the ID column
total_rock = _with_standard_prop_id(pd.read_excel(byproducts_path, sheet_name="Total_Rock"))
prod = _with_standard_prop_id(pd.read_excel(prod_path))

# IDs present in the production table, compared as strings
prop_ids = prod["PROP_ID"].dropna().astype(str).unique()

# Keep only the Total_Rock rows whose ID also occurs in the production table
keep = total_rock["PROP_ID"].astype(str).isin(prop_ids)
filtered = total_rock[keep]

# Write the subset under the same sheet name as the source sheet
with pd.ExcelWriter(out_path, engine="openpyxl") as writer:
    filtered.to_excel(writer, index=False, sheet_name="Total_Rock")

print(f"Saved {len(filtered):,} rows to {out_path}")


Saved 538 rows to Data Input\Byproduct\Total_Rock.xlsx


In [1]:
#!/usr/bin/env python3
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import shutil

# --- Paths ---
xlsx_path = Path("Data Output", "Commodity Production", "Commodity_Production-1980_2023_final.xlsx")

# --- Load data ---
df = pd.read_excel(xlsx_path)

# --- Required columns ---
required = {"PROP_ID", "id_cluster", "area_total", "Total Rock"}
missing = required.difference(df.columns)
if missing:
    raise ValueError(f"Missing required columns: {sorted(missing)}")

# --- Coerce numerics (entries that cannot be parsed become NaN) ---
for numeric_col in ("area_total", "Total Rock"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")


def _first_non_null(values):
    """Return the first non-null entry of *values*, or NaN when there is none."""
    present = values.dropna()
    return present.iloc[0] if present.size else np.nan


# --- Cluster totals ---
# Total Rock is summed over the cluster. area_total is expected to be the
# same for every mine of a cluster, so its first non-null value stands in
# for the cluster's area.
by_cluster = df.groupby("id_cluster")
cluster_total_rock = by_cluster["Total Rock"].transform("sum")
cluster_area_total = by_cluster["area_total"].transform(_first_non_null)

# --- Compute allocation ---
# Each mine receives the cluster area in proportion to its share of the
# cluster's Total Rock. Clusters whose Total Rock sum is not positive get
# NaN instead of a division by zero.
mine_share = df["Total Rock"] / cluster_total_rock
ratio = np.where(cluster_total_rock > 0, mine_share, np.nan)
df["area_allocated"] = ratio * cluster_area_total

# --- (Optional) sanity check ---
# Within each cluster the allocated areas should add up to the cluster area
# (to within 1e-6); clusters where they do not are printed.
check = df.groupby("id_cluster").agg(
    area_total_first=("area_total", _first_non_null),
    area_allocated_sum=("area_allocated", "sum"),
)
mismatch = (check["area_total_first"] - check["area_allocated_sum"]).abs()
bad = mismatch[mismatch > 1e-6]
if len(bad) > 0:
    print("Warning: for these clusters, allocated sum != area_total (>|1e-6|):")
    print(bad)

# --- Backup and save ---
# A timestamped copy is attempted first; failing to create it does not stop
# the save.
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
backup_path = xlsx_path.with_name(f"{xlsx_path.stem}-backup-{timestamp}{xlsx_path.suffix}")
try:
    shutil.copy2(xlsx_path, backup_path)
except Exception as err:
    print(f"Backup skipped (non-fatal): {err}")
else:
    print(f"Backup created: {backup_path}")

# Overwrite the original file with the new column
df.to_excel(xlsx_path, index=False)
print(f"Updated file with 'area_allocated': {xlsx_path}")


Backup created: Data Output\Commodity Production\Commodity_Production-1980_2023_final-backup-20250911-204627.xlsx
Updated file with 'area_allocated': Data Output\Commodity Production\Commodity_Production-1980_2023_final.xlsx
