In [None]:
import pandas as pd
import zipfile
import os
import gc
import random

# Define constants
LABELS = [
    "Fe-Oxide", "Background", "Ilmenite", "Epidote", "Amphibole", 
    "AlFe-Silicate", "Fe-Silicate", "Titanite", "AlMn-Silicate", 
    "Rutile", "Zircon", "Monazite", "Quartz", "Cassiterite", 
    "AlK-Silicate", "Barite", "Al-Silicate", "Chromite", 
    "Leucoxene", "Fe-Sulphide", "AlCa-Silicate", "Apatite", 
    "Xenotime", "Carbonate", "Thorite", "AlFe-Silicate_Zn", 
    "Al-Oxide", "Fe-Sulphide_Cu", "W-Oxide", "Zn-Oxide", 
    "Fe-sulphide_Pb", "Celestine", "Mn-Oxide", "Columbite_Mn", 
    "Cu-Oxide", "Nb-bearing phase", "REE-Silicate", "Molybdenite"
]

TOTAL_SAMPLES = {
    "Fe-Oxide": 20883184,
    "Background": 11347230,
    "Ilmenite": 8400629,
    "Epidote": 5195663,
    "Amphibole": 4404669,
    "AlFe-Silicate": 4106622,
    "Fe-Silicate": 1784892,
    "Titanite": 1512827,
    "AlMn-Silicate": 1326232,
    "Rutile": 876502,
    "Zircon": 618053,
    "Monazite": 408111,
    "Quartz": 404580,
    "Cassiterite": 371466,
    "AlK-Silicate": 325675,
    "Barite": 281363,
    "Al-Silicate": 272677,
    "Chromite": 266954,
    "Leucoxene": 150346,
    "Fe-Sulphide": 146294,
    "AlCa-Silicate": 125414,
    "Apatite": 92279,
    "Xenotime": 66880,
    "Carbonate": 62444,
    "Thorite": 48635,
    "AlFe-Silicate_Zn": 34112,
    "Al-Oxide": 33373,
    "Fe-Sulphide_Cu": 30730,
    "W-Oxide": 28626,
    "Zn-Oxide": 27213,
    "Fe-sulphide_Pb": 26963,
    "Celestine": 26574,
    "Mn-Oxide": 25292,
    "Columbite_Mn": 23806,
    "Cu-Oxide": 19951,
    "Nb-bearing phase": 14705,
    "REE-Silicate": 13791,
    "Molybdenite": 12023
}

desired_samples_per_label = 20000
chunk_size = 800

def dynamic_max_samples(label):
    return round(chunk_size * (desired_samples_per_label / TOTAL_SAMPLES[label]))

def save_to_file(accumulated_df, file_num):
    output_file = os.path.join("D:", "data", f"combined_data_{file_num}.csv")
    accumulated_df.to_csv(output_file, index=False)
    return file_num + 1

def process_chunk(chunk, label_counts):
    samples_to_add = []
    for label in LABELS:
        if label_counts[label] >= desired_samples_per_label:
            continue
        label_rows = chunk[chunk['Mineral Phase'] == label]
        max_samples_for_label = min(dynamic_max_samples(label), desired_samples_per_label - label_counts[label])
        samples = label_rows.sample(n=min(max_samples_for_label, len(label_rows)), replace=False)
        samples_to_add.extend(samples.to_dict('records'))
        label_counts[label] += len(samples)
    return samples_to_add

accumulated_data = []
label_counts = {label: 0 for label in LABELS}
file_num = 1

zip_file_path = r"C:\Users\CH Counts Data.zip"
with zipfile.ZipFile(zip_file_path, 'r') as z:
    csv_files = [f for f in z.namelist() if f.endswith(".csv")]
    
    # Find the starting file within the list.
    starting_file = next((file for file in csv_files if "Group8_0001.csv" in file), None)
    if starting_file:
        csv_files.remove(starting_file)
        random.shuffle(csv_files)
        csv_files.insert(0, starting_file)
    else:
        print("Warning: Group8_0001.csv not found in the archive. Processing in random order.")
        random.shuffle(csv_files)
        
    for idx, file_name in enumerate(csv_files):
        print(f"Processing file {idx+1}/{len(csv_files)}: {file_name}")

        previous_counts = label_counts.copy()

        with z.open(file_name) as file:
            chunk_iter = pd.read_csv(file, chunksize=chunk_size)
            for chunk in chunk_iter:
                accumulated_data.extend(process_chunk(chunk, label_counts))
                
                if len(accumulated_data) >= 2000:
                    accumulated_df = pd.DataFrame(accumulated_data)
                    file_num = save_to_file(accumulated_df, file_num)
                    accumulated_data = []
                    del accumulated_df
                    gc.collect()

                if all(count >= desired_samples_per_label for count in label_counts.values()):
                    break

            # Additional clean-up after each file
            del chunk_iter
            gc.collect()

            for label, count in label_counts.items():
                if previous_counts[label] != count:
                    print(f"Gathered {count} samples for label: {label}")

            if all(count >= desired_samples_per_label for count in label_counts.values()):
                break

if accumulated_data:
    accumulated_df = pd.DataFrame(accumulated_data)
    save_to_file(accumulated_df, file_num)
    del accumulated_df
    gc.collect()

print("Processing completed.")

Processing file 1/8327: CH Counts Data/Group8_0001.csv
Gathered 16 samples for label: AlFe-Silicate
Gathered 30 samples for label: Fe-Silicate
Gathered 1 samples for label: Rutile
Gathered 10 samples for label: Quartz
Gathered 28 samples for label: AlK-Silicate
Gathered 8 samples for label: AlFe-Silicate_Zn
Gathered 107 samples for label: Zn-Oxide
Processing file 2/8327: CH Counts Data/Group8_6138.csv
Gathered 16 samples for label: Epidote
Gathered 16 samples for label: Amphibole
Gathered 32 samples for label: AlFe-Silicate
Gathered 62 samples for label: Fe-Silicate
Gathered 47 samples for label: Titanite
Gathered 40 samples for label: AlMn-Silicate
Gathered 12 samples for label: Rutile
Gathered 29 samples for label: Zircon
Gathered 87 samples for label: Monazite
Gathered 14 samples for label: Quartz
Gathered 41 samples for label: AlK-Silicate
Gathered 4 samples for label: Al-Silicate
Gathered 2 samples for label: Chromite
Gathered 6 samples for label: Leucoxene
Gathered 3 samples for 

In [8]:
#testing to save file
import pandas as pd
import os

# Sample dataframe
df = pd.DataFrame({
    'Name': ['John', 'Anna', 'Mike'],
    'Age': [28, 22, 32],
    'City': ['New York', 'Paris', 'Berlin']
})

# Ensure directory exists
directory = os.path.join("D:", "data")
if not os.path.exists(directory):
    os.makedirs(directory)

# Save to D:\data\test.csv
output_path = os.path.join(directory, "test.csv")
df.to_csv(output_path, index=False)

print(f"File saved to {output_path}")

File saved to D:data\test.csv


In [7]:
import pandas as pd
import zipfile
import os

# Define constants
LABEL = "Fe-Oxide"
desired_samples_per_label = 1000
chunk_size = 500  # Reading 500 rows at a time
max_samples_per_chunk = 5  # Gathering 5 samples at most from each chunk

def save_to_file(accumulated_df, file_num):
    output_file = os.path.join("C:\\", f"combined_data_{file_num}.csv")
    accumulated_df.to_csv(output_file, index=False)
    print(f"Saved to {output_file}")
    return file_num + 1

accumulated_data = []
label_counts = {LABEL: 0}
file_num = 1

zip_file_path = r"C:\Users\CH Counts Data.zip"
with zipfile.ZipFile(zip_file_path, 'r') as z:
    total_files = sum(1 for file_name in z.namelist() if file_name.endswith(".csv"))
    current_file = 0

    for file_name in z.namelist():
        if not file_name.endswith(".csv"):
            continue

        current_file += 1
        print(f"\nProcessing file {current_file}/{total_files}: {file_name}")

        with z.open(file_name) as file:
            chunk_iter = pd.read_csv(file, chunksize=chunk_size)

            for chunk in chunk_iter:
                label_rows = chunk[chunk['Mineral Phase'] == LABEL]
                samples_to_add = label_rows.head(max_samples_per_chunk)
                accumulated_data.append(samples_to_add)
                label_counts[LABEL] += len(samples_to_add)

                print(f"Gathered {label_counts[LABEL]} samples for label: {LABEL}")

                if len(accumulated_data) * max_samples_per_chunk >= 1000:
                    accumulated_df = pd.concat(accumulated_data, ignore_index=True)
                    file_num = save_to_file(accumulated_df, file_num)
                    accumulated_data = []

                if label_counts[LABEL] >= desired_samples_per_label:
                    print("\nAll required samples for Fe-Oxide collected.")
                    break

            # If we've collected enough samples, break out of the file loop
            if label_counts[LABEL] >= desired_samples_per_label:
                break

if accumulated_data:
    accumulated_df = pd.concat(accumulated_data, ignore_index=True)
    save_to_file(accumulated_df, file_num)

print("\nProcessing completed.")


Processing file 1/8327: CH Counts Data/Group8_0001.csv
Gathered 5 samples for label: Fe-Oxide
Gathered 10 samples for label: Fe-Oxide
Gathered 15 samples for label: Fe-Oxide
Gathered 20 samples for label: Fe-Oxide
Gathered 25 samples for label: Fe-Oxide
Gathered 30 samples for label: Fe-Oxide
Gathered 35 samples for label: Fe-Oxide
Gathered 40 samples for label: Fe-Oxide
Gathered 45 samples for label: Fe-Oxide
Gathered 50 samples for label: Fe-Oxide
Gathered 55 samples for label: Fe-Oxide
Gathered 60 samples for label: Fe-Oxide
Gathered 65 samples for label: Fe-Oxide
Gathered 70 samples for label: Fe-Oxide
Gathered 75 samples for label: Fe-Oxide
Gathered 80 samples for label: Fe-Oxide

Processing file 2/8327: CH Counts Data/Group8_0002.csv
Gathered 85 samples for label: Fe-Oxide
Gathered 90 samples for label: Fe-Oxide
Gathered 95 samples for label: Fe-Oxide
Gathered 100 samples for label: Fe-Oxide
Gathered 105 samples for label: Fe-Oxide
Gathered 110 samples for label: Fe-Oxide
Gather

Exception ignored in: <function ZipFileSystem.__del__ at 0x000001AA6F1B8B80>
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\fsspec\implementations\zip.py", line 76, in __del__
    del self.zip
        ^^^^^^^^
AttributeError: 'ZipFileSystem' object has no attribute 'zip'
Exception ignored in: <function ZipFileSystem.__del__ at 0x000001AA6F1B8B80>
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\fsspec\implementations\zip.py", line 76, in __del__
    del self.zip
        ^^^^^^^^
AttributeError: 'ZipFileSystem' object has no attribute 'zip'


Gathered 150 samples for label: Fe-Oxide
Gathered 155 samples for label: Fe-Oxide

Processing file 3/8327: CH Counts Data/Group8_0003.csv
Gathered 160 samples for label: Fe-Oxide
Gathered 165 samples for label: Fe-Oxide
Gathered 170 samples for label: Fe-Oxide
Gathered 175 samples for label: Fe-Oxide
Gathered 180 samples for label: Fe-Oxide
Gathered 185 samples for label: Fe-Oxide
Gathered 190 samples for label: Fe-Oxide
Gathered 195 samples for label: Fe-Oxide
Gathered 200 samples for label: Fe-Oxide
Gathered 205 samples for label: Fe-Oxide
Gathered 210 samples for label: Fe-Oxide
Gathered 215 samples for label: Fe-Oxide
Gathered 220 samples for label: Fe-Oxide
Gathered 225 samples for label: Fe-Oxide
Gathered 230 samples for label: Fe-Oxide
Gathered 235 samples for label: Fe-Oxide

Processing file 4/8327: CH Counts Data/Group8_0004.csv
Gathered 240 samples for label: Fe-Oxide
Gathered 245 samples for label: Fe-Oxide
Gathered 250 samples for label: Fe-Oxide
Gathered 255 samples for la

In [9]:
import pandas as pd
import zipfile
from tqdm import tqdm
import gc

# Define constants
desired_column = "CH_4096"
zip_file_path = r"C:\Users\CH Counts Data.zip"

files_missing_column = []
file_shapes = {}

with zipfile.ZipFile(zip_file_path, 'r') as z:
    total_files = sum(1 for file_name in z.namelist() if file_name.endswith(".csv"))
    
    for file_name in tqdm(z.namelist(), desc="Processing files", unit="file"):
        if not file_name.endswith(".csv"):
            continue

        with z.open(file_name) as file:
            # Only read the first few rows for performance
            sample_data = pd.read_csv(file, nrows=10)

            # Check if the desired column exists
            if desired_column not in sample_data.columns:
                files_missing_column.append(file_name)

            # Check full shape
            file.seek(0)  # Resetting the pointer to the beginning of the file
            total_rows = sum(1 for row in file)
            file_shapes[file_name] = (total_rows, len(sample_data.columns))
            
            # Explicitly delete the DataFrame object and then call garbage collector
            del sample_data
            gc.collect()

# Display results
print("\nFiles missing the column CH_4096:")
for file in files_missing_column:
    print(file)

print("\nShapes of files:")
for file, shape in file_shapes.items():
    print(f"{file}: {shape}")

Processing files:   0%|          | 0/8328 [00:00<?, ?file/s]Exception ignored in: <function ZipFileSystem.__del__ at 0x000001AA6F1B8B80>
Traceback (most recent call last):
  File "C:\ProgramData\anaconda3\Lib\site-packages\fsspec\implementations\zip.py", line 76, in __del__
    del self.zip
        ^^^^^^^^
AttributeError: 'ZipFileSystem' object has no attribute 'zip'
Processing files: 100%|██████████| 8328/8328 [1:10:59<00:00,  1.96file/s]


Files missing the column CH_4096:
CH Counts Data/Group8_0004.csv
CH Counts Data/Group8_0021.csv
CH Counts Data/Group8_0029.csv
CH Counts Data/Group8_0047.csv
CH Counts Data/Group8_0057.csv
CH Counts Data/Group8_0058.csv
CH Counts Data/Group8_0115.csv
CH Counts Data/Group8_0146.csv
CH Counts Data/Group8_0151.csv
CH Counts Data/Group8_0154.csv
CH Counts Data/Group8_0156.csv
CH Counts Data/Group8_0176.csv
CH Counts Data/Group8_0183.csv
CH Counts Data/Group8_0201.csv
CH Counts Data/Group8_0233.csv
CH Counts Data/Group8_0253.csv
CH Counts Data/Group8_0285.csv
CH Counts Data/Group8_0289.csv
CH Counts Data/Group8_0291.csv
CH Counts Data/Group8_0326.csv
CH Counts Data/Group8_0328.csv
CH Counts Data/Group8_0329.csv
CH Counts Data/Group8_0338.csv
CH Counts Data/Group8_0391.csv
CH Counts Data/Group8_0420.csv
CH Counts Data/Group8_0453.csv
CH Counts Data/Group8_0460.csv
CH Counts Data/Group8_0513.csv
CH Counts Data/Group8_0572.csv
CH Counts Data/Group8_0574.csv
CH Counts Data/Group8_0576.csv
CH C




In [10]:
for file in files_missing_column:
    print(f"{file}: {file_shapes[file]}")

CH Counts Data/Group8_0004.csv: (7833, 4100)
CH Counts Data/Group8_0021.csv: (7744, 4100)
CH Counts Data/Group8_0029.csv: (7569, 4100)
CH Counts Data/Group8_0047.csv: (7655, 4100)
CH Counts Data/Group8_0057.csv: (7139, 4100)
CH Counts Data/Group8_0058.csv: (7744, 4100)
CH Counts Data/Group8_0115.csv: (7655, 4100)
CH Counts Data/Group8_0146.csv: (7745, 4100)
CH Counts Data/Group8_0151.csv: (7570, 4100)
CH Counts Data/Group8_0154.csv: (7655, 4100)
CH Counts Data/Group8_0156.csv: (7570, 4100)
CH Counts Data/Group8_0176.csv: (7566, 4100)
CH Counts Data/Group8_0183.csv: (7569, 4100)
CH Counts Data/Group8_0201.csv: (7483, 4100)
CH Counts Data/Group8_0233.csv: (7396, 4100)
CH Counts Data/Group8_0253.csv: (7222, 4100)
CH Counts Data/Group8_0285.csv: (6807, 4100)
CH Counts Data/Group8_0289.csv: (7397, 4100)
CH Counts Data/Group8_0291.csv: (7393, 4100)
CH Counts Data/Group8_0326.csv: (7226, 4100)
CH Counts Data/Group8_0328.csv: (7393, 4100)
CH Counts Data/Group8_0329.csv: (7566, 4100)
CH Counts 

In [4]:
import os
import zipfile
import pandas as pd
import random
import gc

desired_samples_per_label = 20000
chunk_size = 800

LABELS = [ 
    "Fe-Oxide", "Background", "Ilmenite", "Epidote", "Amphibole", 
    "AlFe-Silicate", "Fe-Silicate", "Titanite", "AlMn-Silicate", 
    "Rutile", "Zircon", "Monazite", "Quartz", "Cassiterite", 
    "AlK-Silicate", "Barite", "Al-Silicate", "Chromite", 
    "Leucoxene", "Fe-Sulphide", "AlCa-Silicate", "Apatite", 
    "Xenotime", "Carbonate", "Thorite", "AlFe-Silicate_Zn", 
    "Al-Oxide", "Fe-Sulphide_Cu", "W-Oxide", "Zn-Oxide", 
    "Fe-sulphide_Pb", "Celestine", "Mn-Oxide", "Columbite_Mn", 
    "Cu-Oxide", "Nb-bearing phase", "REE-Silicate", "Molybdenite"]  # as you defined above  # Assuming you've already initialized the set of labels you're looking for

def get_output_path(label):
    return os.path.join("D:", "data", f"{label}_samples.csv")

def append_to_file(label, samples):
    output_file = get_output_path(label)
    df = pd.DataFrame(samples)
    if os.path.exists(output_file):
        df.to_csv(output_file, mode='a', header=False, index=False)
    else:
        df.to_csv(output_file, index=False)

def process_chunk(chunk, label_samples):
    for label in LABELS:
        if len(label_samples[label]) >= desired_samples_per_label:
            continue

        label_rows = chunk[chunk['Mineral Phase'] == label]
        needed_samples = desired_samples_per_label - len(label_samples[label])
        samples = label_rows.sample(n=min(needed_samples, len(label_rows)), replace=False)
        label_samples[label].extend(samples.to_dict('records'))

zip_file_path = r"C:\Users\CH Counts Data.zip"
with zipfile.ZipFile(zip_file_path, 'r') as z:
    csv_files = [f for f in z.namelist() if f.endswith(".csv")]

    # Shuffle files randomly
    random.shuffle(csv_files)

    # Store samples for each label
    label_samples = {label: [] for label in LABELS}

    for idx, file_name in enumerate(csv_files):
        print(f"\nProcessing file {idx+1}/{len(csv_files)}: {file_name}")

        with z.open(file_name) as file:
            chunk_iter = pd.read_csv(file, chunksize=chunk_size)
            for chunk_idx, chunk in enumerate(chunk_iter):
                process_chunk(chunk, label_samples)
                print(f" - Processed chunk {chunk_idx + 1}")

            # Save accumulated data for each label and reset memory
            for label, samples in label_samples.items():
                if samples:
                    append_to_file(label, samples)
                    label_samples[label] = []  # Resetting the samples list for that label
                    print(f" - Appended data to {get_output_path(label)}")

            completed_labels = [label for label, samples in label_samples.items() if len(samples) >= desired_samples_per_label]
            for label in completed_labels:
                print(f" - Completed gathering {desired_samples_per_label} samples for label: {label}")
                LABELS.remove(label)
                del label_samples[label]

            # Additional clean-up after each file
            del chunk_iter
            gc.collect()

            if len(completed_labels) >= 25:
                print("\nCompleted gathering data for 25 labels.")
                break

print("\nProcessing completed.")


Processing file 1/8327: CH Counts Data/Group8_7038.csv
 - Processed chunk 1
 - Processed chunk 2
 - Processed chunk 3
 - Processed chunk 4
 - Processed chunk 5
 - Processed chunk 6
 - Processed chunk 7
 - Processed chunk 8
 - Processed chunk 9
 - Processed chunk 10
 - Appended data to D:data\Fe-Oxide_samples.csv
 - Appended data to D:data\Background_samples.csv
 - Appended data to D:data\Ilmenite_samples.csv
 - Appended data to D:data\Epidote_samples.csv
 - Appended data to D:data\Amphibole_samples.csv
 - Appended data to D:data\AlFe-Silicate_samples.csv
 - Appended data to D:data\Fe-Silicate_samples.csv
 - Appended data to D:data\Titanite_samples.csv
 - Appended data to D:data\AlMn-Silicate_samples.csv
 - Appended data to D:data\Rutile_samples.csv
 - Appended data to D:data\Monazite_samples.csv
 - Appended data to D:data\Quartz_samples.csv
 - Appended data to D:data\AlK-Silicate_samples.csv
 - Appended data to D:data\Al-Silicate_samples.csv
 - Appended data to D:data\Leucoxene_sample

KeyboardInterrupt: 

In [4]:
#sorting out the files for individual label 
import os
import pandas as pd
import random
import csv

# File paths and parameters
raw_data_folder = 'D:/CP_raw_data/'
processed_data_folder = 'D:/CP_processed_data/'
total_sample_size = 20000  # total samples to select

# List of candidates
LABELS = [
    "Fe-Oxide", "Background", "Ilmenite", "Epidote", "Amphibole", 
    "AlFe-Silicate", "Fe-Silicate", "Titanite", "AlMn-Silicate", 
    "Rutile", "Zircon", "Monazite", "Quartz", "Cassiterite", 
    "AlK-Silicate", "Barite", "Al-Silicate", "Chromite", 
    "Leucoxene", "Fe-Sulphide", "AlCa-Silicate", "Apatite", 
    "Xenotime", "Carbonate", "Thorite", "AlFe-Silicate_Zn", 
    "Al-Oxide", "Fe-Sulphide_Cu", "W-Oxide", "Zn-Oxide", 
    "Fe-sulphide_Pb", "Celestine", "Mn-Oxide", "Columbite_Mn", 
    "Cu-Oxide", "Nb-bearing phase", "REE-Silicate", "Molybdenite"]

# List of columns to drop
cols_to_drop = ['x', 'y', 'yinv'] + ['CH_{}'.format(i) for i in range(2501, 4097)]

# Process each file
for label in LABELS:
    raw_data_path = os.path.join(raw_data_folder, f"{label}_samples.csv")

    # Check if file exists
    if not os.path.exists(raw_data_path):
        print(f"No file found for label: {label}")
        continue

    print(f"Processing file for label: {label}")

    # Reservoir sampling for large files
    samples = []
    with open(raw_data_path, 'r') as file:
        reader = csv.reader(file)
        for i, row in enumerate(reader):
            if i == 0:  # Skip the header row
                continue
            if len(samples) < total_sample_size:
                samples.append(row)
            elif random.random() < total_sample_size / (i + 1):
                replace = random.randint(0, total_sample_size - 1)
                samples[replace] = row

    # Convert to DataFrame
    samples_df = pd.DataFrame(samples, columns=['x', 'y', 'yinv'] + ['CH_{}'.format(i) for i in range(1, 4097)] + ['Sum', 'Mineral Phase'])

    # Drop unwanted columns
    print(f"Dropping unwanted columns...")
    samples_df.drop(columns=cols_to_drop, errors='ignore', inplace=True)
    print("Columns dropped successfully.")

    # Save the processed data to a new file
    processed_data_path = os.path.join(processed_data_folder, f"{label}.xlsx")
    print(f"Saving processed data to {processed_data_path}...")
    samples_df.to_excel(processed_data_path, index=False)
    print(f"Data saved successfully for label: {label}\n")

Processing file for label: Fe-Oxide
Dropping unwanted columns...
Columns dropped successfully.
Saving processed data to D:/CP_processed_data/Fe-Oxide.xlsx...



KeyboardInterrupt



In [1]:
pip install dask

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import dask.dataframe as dd

# File paths and parameters
raw_data_folder = 'D:/CP_raw_data/'
processed_data_folder = 'D:/CP_processed_data/'
size_threshold_kb = 100000  # 100,000 KB

# List of candidates
LABELS = ["AlFe-Silicate", "Monazite", "Nb-bearing phase"]

# Specify the columns to keep
keep_columns = ['CH_{}'.format(i) for i in range(1, 2501)] + ['Mineral Phase']

# Process each file
for label in LABELS:
    raw_data_path = os.path.join(raw_data_folder, f"{label}_samples.csv")

    # Check if file exists
    if not os.path.exists(raw_data_path):
        print(f"No file found for label: {label}")
        continue

    print(f"Processing file for label: {label}")

    # Check file size
    file_size_kb = os.path.getsize(raw_data_path) / 1024  # file size in KB

    # Decide the total sample size based on file size
    total_sample_size = 20000 if file_size_kb > size_threshold_kb else 10000
    print(f"Total sample size: {total_sample_size}")

    # Using Dask to load the data with selected columns
    ddf = dd.read_csv(raw_data_path, usecols=keep_columns, dtype=str, low_memory=False)
    
    # Perform sampling
    ddf = ddf.sample(frac=total_sample_size/len(ddf), random_state=1)
    
    # Save the processed data to a new file
    processed_data_path = os.path.join(processed_data_folder, f"{label}.csv")
    ddf.to_csv(processed_data_path, index=False, single_file=True)
    print(f"Data saved successfully for label: {label}\n")


Processing file for label: AlFe-Silicate
Total sample size: 20000
Data saved successfully for label: AlFe-Silicate

Processing file for label: Monazite
Total sample size: 20000
Data saved successfully for label: Monazite

Processing file for label: Nb-bearing phase
Total sample size: 10000


ValueError: Replace has to be set to `True` when upsampling the population `frac` > 1.