In [14]:
import csv
import random
import os
import pandas as pd

raw_data_folder = "E:\CP_raw_data"
processed_data_folder = "E:\CP_processed_data"

#Testing
LABELS = ["AlFe-Silicate", "Fe-Sulphide"]

In [15]:
LABELS = [ 
    "Fe-Oxide", "Background", "Ilmenite", "Epidote", "Amphibole", 
    "AlFe-Silicate", "Fe-Silicate", "Titanite", "AlMn-Silicate", 
    "Rutile", "Zircon", "Monazite", "Quartz", "Cassiterite", 
    "AlK-Silicate", "Barite", "Al-Silicate", "Chromite", 
    "Leucoxene", "Fe-Sulphide", "AlCa-Silicate", "Apatite", 
    "Xenotime", "Carbonate", "Thorite", "AlFe-Silicate_Zn", 
    "Al-Oxide", "Fe-Sulphide_Cu", "W-Oxide", "Zn-Oxide", 
    "Fe-sulphide_Pb", "Celestine", "Mn-Oxide", "Columbite_Mn", 
    "Cu-Oxide", "Nb-bearing phase", "REE-Silicate", "Molybdenite"]

In [17]:
big_file_threshold = 200 * 1024 * 1024  # 100,000 KB in bytes
csv.field_size_limit(1000000000)  # or another appropriate limit

header_columns = ["x", "y", "yinv"] + [f"CH_{i}" for i in range(1, 4097)] + ["Sum", "Mineral Phase"]

def is_valid_row(row, label):
    return len(row) == 4101

for label in LABELS:
    raw_data_path = os.path.join(raw_data_folder, f"{label}_samples.csv")
    processed_data_path = os.path.join(processed_data_folder, f"{label}.csv")
    
    if not os.path.exists(raw_data_path):
        print(f"[ERROR] No file found for label: {label}")
        continue

    # Determine the chunk size and total sample size based on file size
    file_size = os.path.getsize(raw_data_path)
    chunk_size = file_size // 10

    if file_size > big_file_threshold:
        total_samples = 22000
        samples_per_chunk = total_samples // 10
    else:
        total_samples = 12000
        samples_per_chunk = total_samples // 10

    print(f"\n[INFO] Starting processing for label: {label}")
    print(f"[INFO] File size: {file_size / (1024*1024):.2f} MB")
    print(f"[INFO] Chunk size: {chunk_size / (1024*1024):.2f} MB")
    
    total_valid_rows = 0
    total_sampled_rows = 0
    chunk_count = 0

    with open(raw_data_path, 'r') as infile, open(processed_data_path, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        header = next(reader)
        writer.writerow(header_columns)  # Write the header

        valid_rows = []
        read_size = 0
        end_of_file = False

        for row in reader:
            read_size += sum(len(field) for field in row) + len(row)
            
            if is_valid_row(row, label):
                valid_rows.append(row)
                
            if read_size >= chunk_size or end_of_file:
                chunk_count += 1
                current_sample_size = min(samples_per_chunk, len(valid_rows))
                sampled_rows = random.sample(valid_rows, current_sample_size)

                writer.writerows(sampled_rows)  # Write sampled rows chunk by chunk
                
                total_valid_rows += len(valid_rows)
                total_sampled_rows += len(sampled_rows)

                print(f"[INFO] Chunk {chunk_count}: Sampled {len(sampled_rows)} out of {len(valid_rows)} valid rows.")
                
                valid_rows.clear()
                read_size = 0

                # Stop sampling if we've reached our total sampling goal
                if total_sampled_rows >= total_samples:
                    break

            if read_size >= chunk_size:
                end_of_file = True
        
        # Handle the case where the total number of rows is below 10,000
        if total_valid_rows < 10000:
            print(f"[INFO] File has less than 10,000 valid rows. Saving all {total_valid_rows} valid rows.")
            writer.writerows(valid_rows)

    print(f"[SUMMARY] For label {label}: out of {total_valid_rows} valid rows, {total_sampled_rows} rows were sampled.")
    print(f"[INFO] Sampled data saved for label: {label}\n")

print("[DONE] All files processed.")


[INFO] Starting processing for label: Fe-Oxide
[INFO] File size: 20929.98 MB
[INFO] Chunk size: 2093.00 MB
[INFO] Chunk 1: Sampled 2200 out of 222341 valid rows.
[INFO] Chunk 2: Sampled 2200 out of 234467 valid rows.
[INFO] Chunk 3: Sampled 2200 out of 211133 valid rows.
[INFO] Chunk 4: Sampled 2200 out of 206865 valid rows.
[INFO] Chunk 5: Sampled 2200 out of 204500 valid rows.
[INFO] Chunk 6: Sampled 2200 out of 243593 valid rows.
[INFO] Chunk 7: Sampled 2200 out of 236508 valid rows.
[INFO] Chunk 8: Sampled 2200 out of 229893 valid rows.
[INFO] Chunk 9: Sampled 2200 out of 229777 valid rows.
[SUMMARY] For label Fe-Oxide: out of 2019077 valid rows, 19800 rows were sampled.
[INFO] Sampled data saved for label: Fe-Oxide


[INFO] Starting processing for label: Background
[INFO] File size: 11009.23 MB
[INFO] Chunk size: 1100.92 MB
[INFO] Chunk 1: Sampled 2200 out of 126219 valid rows.
[INFO] Chunk 2: Sampled 2200 out of 129840 valid rows.
[INFO] Chunk 3: Sampled 2200 out of 120996 valid

In [18]:
#Path directory
combined_data_folder = "E:\CP_raw_data"
label_data_folder = "E:\CP_processed_data"

# Create the desired header structure
desired_header = ["x", "y", "yinv"] + [f"CH_{i}" for i in range(1, 4097)] + ["Sum", "Mineral Phase"]

In [19]:
# Combine samples
for idx in range(1, 79):  # For combined_data_1.csv to combined_data_78.csv
    combined_data_path = os.path.join(combined_data_folder, f"combined_data_{idx}.csv")

    with open(combined_data_path, 'r') as infile:
        reader = csv.reader(infile)
        header = next(reader)

        # Check if columns are misordered and rearrange them
        if header != desired_header:
            print(f"[INFO] Reordering columns for combined_data_{idx}.csv")
            data = list(reader)
            col_indices = [header.index(col_name) for col_name in desired_header]
            rearranged_data = [[row[i] for i in col_indices] for row in data]
        else:
            rearranged_data = list(reader)
            print(f"[INFO] Columns in combined_data_{idx}.csv are in correct order")

        appended_counts = {}  # To track how many samples were appended for each label

        # Append rows to respective {label}.csv
        for row in rearranged_data:
            label = row[-1]  # Assuming Mineral Phase is the last column
            label_file_path = os.path.join(label_data_folder, f"{label}.csv")
            
            with open(label_file_path, 'a', newline='') as label_file:
                writer = csv.writer(label_file)
                writer.writerow(row)

            # Update counts
            appended_counts[label] = appended_counts.get(label, 0) + 1

        # Report how many samples were appended for each label
        for label, count in appended_counts.items():
            print(f"[INFO] Appended {count} samples from combined_data_{idx}.csv to {label}.csv")

    print(f"[INFO] Finished processing combined_data_{idx}.csv")
    print("-" * 50)

# Display the total sample number in each file
print("\n[SUMMARY]")
for filename in os.listdir(label_data_folder):
    file_path = os.path.join(label_data_folder, filename)
    with open(file_path, 'r') as f:
        total_samples = sum(1 for line in f) - 1  # -1 to exclude the header
        print(f"For {filename}: Total samples = {total_samples}")

print("\n[DONE] All files processed.")

[INFO] Columns in combined_data_1.csv are in correct order
[INFO] Appended 89 samples from combined_data_1.csv to AlFe-Silicate.csv
[INFO] Appended 145 samples from combined_data_1.csv to Fe-Silicate.csv
[INFO] Appended 122 samples from combined_data_1.csv to AlK-Silicate.csv
[INFO] Appended 108 samples from combined_data_1.csv to Zn-Oxide.csv
[INFO] Appended 139 samples from combined_data_1.csv to Quartz.csv
[INFO] Appended 23 samples from combined_data_1.csv to AlFe-Silicate_Zn.csv
[INFO] Appended 132 samples from combined_data_1.csv to Rutile.csv
[INFO] Appended 62 samples from combined_data_1.csv to Epidote.csv
[INFO] Appended 66 samples from combined_data_1.csv to Amphibole.csv
[INFO] Appended 176 samples from combined_data_1.csv to Titanite.csv
[INFO] Appended 102 samples from combined_data_1.csv to AlMn-Silicate.csv
[INFO] Appended 97 samples from combined_data_1.csv to Monazite.csv
[INFO] Appended 51 samples from combined_data_1.csv to Zircon.csv
[INFO] Appended 5 samples from 