This code was originally written by Ming; I modified it to run on Google Colab so that I can preprocess client datasets.

In [None]:
# Mount Google Drive at /content/drive so the notebook can read the input
# archive and write its outputs under /content/drive/MyDrive/...
# Run this cell once per Colab session, before any cell that touches Drive paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location (on the mounted Drive) of the zip archive holding the input CSV
# files; the sampling cell below opens it with zipfile.ZipFile.
zip_file_path = '/content/drive/MyDrive/ColabNotebooks/CH Counts Data.zip'

In [None]:
import pandas as pd
import zipfile
import os
import gc
import random

# Sampling configuration.

# Per-label totals, listed from the largest count to the smallest. They are the
# denominator of the per-chunk sampling cap.
# NOTE(review): presumably these are row counts over the whole archive — confirm.
TOTAL_SAMPLES = {
    "Fe-Oxide": 20_883_184,
    "Background": 11_347_230,
    "Ilmenite": 8_400_629,
    "Epidote": 5_195_663,
    "Amphibole": 4_404_669,
    "AlFe-Silicate": 4_106_622,
    "Fe-Silicate": 1_784_892,
    "Titanite": 1_512_827,
    "AlMn-Silicate": 1_326_232,
    "Rutile": 876_502,
    "Zircon": 618_053,
    "Monazite": 408_111,
    "Quartz": 404_580,
    "Cassiterite": 371_466,
    "AlK-Silicate": 325_675,
    "Barite": 281_363,
    "Al-Silicate": 272_677,
    "Chromite": 266_954,
    "Leucoxene": 150_346,
    "Fe-Sulphide": 146_294,
    "AlCa-Silicate": 125_414,
    "Apatite": 92_279,
    "Xenotime": 66_880,
    "Carbonate": 62_444,
    "Thorite": 48_635,
    "AlFe-Silicate_Zn": 34_112,
    "Al-Oxide": 33_373,
    "Fe-Sulphide_Cu": 30_730,
    "W-Oxide": 28_626,
    "Zn-Oxide": 27_213,
    "Fe-sulphide_Pb": 26_963,
    "Celestine": 26_574,
    "Mn-Oxide": 25_292,
    "Columbite_Mn": 23_806,
    "Cu-Oxide": 19_951,
    "Nb-bearing phase": 14_705,
    "REE-Silicate": 13_791,
    "Molybdenite": 12_023,
}

# Every label to sample, in the same order as the table above (dicts keep
# insertion order, so the keys give exactly that list).
LABELS = list(TOTAL_SAMPLES)

# Target number of rows to collect for each label.
desired_samples_per_label = 5000

# Number of CSV rows read per chunk.
chunk_size = 100000

def dynamic_max_samples(label):
    """Return the maximum number of rows to draw for ``label`` from one chunk.

    The cap is ``chunk_size * desired_samples_per_label / TOTAL_SAMPLES[label]``
    rounded to the nearest integer, so frequent labels get a small per-chunk
    allowance and rare labels a large one.

    The result is never below 1. Without that floor, a small target combined
    with a very frequent label rounds to 0, and that label could never be
    sampled at all.

    Raises:
        KeyError: if ``label`` has no entry in ``TOTAL_SAMPLES``.
    """
    proportional_cap = round(chunk_size * (desired_samples_per_label / TOTAL_SAMPLES[label]))
    return max(1, proportional_cap)

def save_to_file(accumulated_df, file_num, output_dir="/content/drive/MyDrive/ColabNotebooks/data"):
    """Write ``accumulated_df`` to ``combined_data_<file_num>.csv`` and return the next file number.

    Args:
        accumulated_df: DataFrame to write; the index is not written.
        file_num: Number used in the output file name.
        output_dir: Folder that receives the file. Defaults to the Drive
            folder this notebook has always written to. It is created if it
            does not exist yet, so the first write cannot fail on a missing
            folder.

    Returns:
        ``file_num + 1``, for the caller to use on the next write.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"combined_data_{file_num}.csv")
    accumulated_df.to_csv(output_file, index=False)
    return file_num + 1

def process_chunk(chunk, label_counts):
    """Draw a capped random sample of rows per label from one chunk.

    Labels are visited in ``LABELS`` order. A label that has already reached
    ``desired_samples_per_label`` is skipped. For every other label, the rows
    whose 'Mineral Phase' equals that label are sampled without replacement.
    The number drawn is the smallest of: the per-chunk cap from
    ``dynamic_max_samples``, the amount still missing for the label, and the
    number of matching rows in the chunk.

    Args:
        chunk: DataFrame with a 'Mineral Phase' column.
        label_counts: dict mapping label -> rows collected so far. It is
            updated in place with the number of rows drawn here.

    Returns:
        A list of row dicts (``to_dict('records')``) for all drawn rows.
    """
    picked_records = []
    for label in LABELS:
        still_needed = desired_samples_per_label - label_counts[label]
        if still_needed <= 0:
            # Quota for this label is already filled.
            continue

        candidates = chunk[chunk['Mineral Phase'] == label]
        per_chunk_cap = min(dynamic_max_samples(label), still_needed)
        n_to_draw = min(per_chunk_cap, len(candidates))

        drawn = candidates.sample(n=n_to_draw, replace=False)
        picked_records += drawn.to_dict('records')
        label_counts[label] += len(drawn)
    return picked_records

# ---------------------------------------------------------------------------
# Main sampling run.
#
# Streams every CSV in the archive chunk by chunk, draws a capped sample per
# label from each chunk (process_chunk), and writes the collected rows to
# numbered part files (save_to_file). The run stops early once every label has
# reached desired_samples_per_label.
# ---------------------------------------------------------------------------

# Number of buffered rows that triggers writing a part file.
FLUSH_THRESHOLD = 2000

# Seed for the file visiting order, so a re-run walks the archive in the same
# order. The row sampling inside process_chunk is not seeded by this value.
SHUFFLE_SEED = 42


def _all_quotas_met(counts):
    """Return True once every label has reached desired_samples_per_label."""
    return all(count >= desired_samples_per_label for count in counts.values())


accumulated_data = []
label_counts = {label: 0 for label in LABELS}
file_num = 1

try:
    with zipfile.ZipFile(zip_file_path, 'r') as z:
        csv_files = [f for f in z.namelist() if f.endswith(".csv")]

        # Group8_0001.csv is always processed first when present; every other
        # file follows in shuffled order.
        # NOTE(review): the reason this file must go first is not visible here — confirm.
        starting_file = next((file for file in csv_files if "Group8_0001.csv" in file), None)
        if starting_file:
            csv_files.remove(starting_file)
        else:
            print("Warning: Group8_0001.csv not found in the archive. Processing in random order.")

        # A private, seeded generator fixes the order without touching the
        # global `random` state.
        random.Random(SHUFFLE_SEED).shuffle(csv_files)
        if starting_file:
            csv_files.insert(0, starting_file)

        for idx, file_name in enumerate(csv_files):
            print(f"Processing file {idx+1}/{len(csv_files)}: {file_name}")

            # Snapshot, so only labels that grew in this file are reported.
            previous_counts = label_counts.copy()

            with z.open(file_name) as file:
                for chunk in pd.read_csv(file, chunksize=chunk_size):
                    accumulated_data.extend(process_chunk(chunk, label_counts))

                    # Write a part file once enough rows are buffered, which
                    # keeps memory use bounded.
                    if len(accumulated_data) >= FLUSH_THRESHOLD:
                        accumulated_df = pd.DataFrame(accumulated_data)
                        file_num = save_to_file(accumulated_df, file_num)
                        accumulated_data = []
                        del accumulated_df
                        gc.collect()

                    if _all_quotas_met(label_counts):
                        break

            # Additional clean-up after each file
            gc.collect()

            for label, count in label_counts.items():
                if previous_counts[label] != count:
                    print(f"Gathered {count} samples for label: {label}")

            if _all_quotas_met(label_counts):
                break
finally:
    # Write whatever is still buffered, both on normal completion and when the
    # run is aborted by an error (for example a Drive I/O failure). Without
    # this, an aborted run loses the rows collected since the last part file.
    if accumulated_data:
        accumulated_df = pd.DataFrame(accumulated_data)
        save_to_file(accumulated_df, file_num)
        del accumulated_df
        gc.collect()

print("Processing completed.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Gathered 861 samples for label: Al-Oxide
Processing file 200/8327: CH Counts Data/Group8_0317.csv
Gathered 4799 samples for label: Fe-Oxide
Gathered 4714 samples for label: Barite
Gathered 3468 samples for label: Leucoxene
Gathered 974 samples for label: Carbonate
Gathered 640 samples for label: Mn-Oxide
Processing file 201/8327: CH Counts Data/Group8_6588.csv
Gathered 4823 samples for label: Fe-Oxide
Gathered 3474 samples for label: Leucoxene
Gathered 2763 samples for label: AlCa-Silicate
Gathered 1992 samples for label: Apatite
Gathered 641 samples for label: Mn-Oxide
Processing file 202/8327: CH Counts Data/Group8_2183.csv
Gathered 4847 samples for label: Fe-Oxide
Processing file 203/8327: CH Counts Data/Group8_7888.csv
Gathered 4871 samples for label: Fe-Oxide
Gathered 3476 samples for label: Leucoxene
Gathered 2851 samples for label: AlCa-Silicate
Gathered 1994 samples for label: Apatite
Gathered 862 samples for labe

OSError: ignored

In [None]:
import glob
import os
import re

import pandas as pd


# Concatenate output files
#
# Merges every part file `combined_data_<n>.csv` in the data folder into one
# CSV. Part files are found by pattern rather than by a fixed count, so the
# cell works regardless of how many part files a run produced.

# Specify Input Files
input_folder = '/content/drive/MyDrive/ColabNotebooks/data/'


def _part_number(path):
    """Return the integer <n> of a `combined_data_<n>.csv` path, or None if the name does not match."""
    match = re.search(r"combined_data_(\d+)\.csv$", os.path.basename(path))
    return int(match.group(1)) if match else None


# Order numerically (1, 2, ..., 10, 11) rather than lexically (1, 10, 11, 2),
# so rows keep the order in which the part files were written.
input_files = sorted(
    (path for path in glob.glob(f"{input_folder}combined_data_*.csv") if _part_number(path) is not None),
    key=_part_number,
)
if not input_files:
    raise FileNotFoundError(f"No combined_data_<n>.csv files found in {input_folder}")

# Concatenate CSVs
concatenated_df = pd.concat((pd.read_csv(file) for file in input_files), ignore_index=True)

# Save Concatenated Data
output_file = f"{input_folder}concatenated_data.csv"
concatenated_df.to_csv(output_file, index=False)


In [None]:
import pandas as pd

# Sanity check: load a concatenated sample file and count rows per label.

# Specify the path to the concatenated CSV file
# NOTE(review): this reads `concatenated_data_1000.csv`, while the concatenation
# cell in this notebook writes `concatenated_data.csv` — confirm which file is
# meant to be inspected here.
concatenated_file_path = '/content/drive/MyDrive/ColabNotebooks/data/concatenated_data_1000.csv'

# Read the concatenated CSV file into a DataFrame
concatenated_df = pd.read_csv(concatenated_file_path)

# Calculate and print the summary of label counts
# (value_counts() returns a Series sorted from the most to the least frequent label).
# NOTE(review): `label_counts` is also the name of the dict used by the sampling
# cell; running this cell rebinds that name to a Series.
label_counts = concatenated_df['Mineral Phase'].value_counts()
print(label_counts)


Ilmenite            1000
Xenotime            1000
Al-Silicate         1000
Al-Oxide            1000
Fe-Sulphide         1000
Barite              1000
Carbonate           1000
Fe-sulphide_Pb      1000
Cassiterite         1000
Apatite             1000
Thorite             1000
Fe-Sulphide_Cu      1000
Columbite_Mn        1000
Cu-Oxide            1000
Celestine           1000
W-Oxide             1000
AlFe-Silicate       1000
Chromite            1000
Zircon              1000
Mn-Oxide            1000
Fe-Silicate         1000
Quartz              1000
AlK-Silicate        1000
AlFe-Silicate_Zn    1000
Zn-Oxide            1000
Rutile              1000
Molybdenite         1000
Titanite            1000
AlMn-Silicate       1000
Monazite            1000
Leucoxene           1000
AlCa-Silicate       1000
Amphibole            945
Epidote              903
Nb-bearing phase     889
REE-Silicate         839
Name: Mineral Phase, dtype: int64


In [None]:
# Calculate the total number of unique labels
# (distinct values in the 'Mineral Phase' column of `concatenated_df`, which
# must already have been loaded by an earlier cell).
total_labels = concatenated_df['Mineral Phase'].nunique()

# Print the total number of unique labels
print("Total Unique Labels:", total_labels)

Total Unique Labels: 38
