## Randomly Selected 10-combined Set for Testing

In [None]:
import pandas as pd

In [None]:
# Rebalance the combined dataset: cap the two dominant phases, keep every
# other phase in full, then shuffle the rows and write the result to disk.
MAX_ROWS_PER_MAJORITY_CLASS = 7000             # cap applied to each over-represented phase
MAJORITY_CLASSES = ['Fe-Oxide', 'Background']  # phases that get downsampled
SEED = 42                                      # fixed seed for sampling and shuffling

raw_df = pd.read_csv('10_combined.csv')

# Downsample each majority phase. min() keeps all rows of a phase that holds
# fewer rows than the cap; without it DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling is without replacement).
downsampled = []
for phase in MAJORITY_CLASSES:
    phase_rows = raw_df[raw_df['Mineral Phase'] == phase]
    n_keep = min(MAX_ROWS_PER_MAJORITY_CLASS, len(phase_rows))
    downsampled.append(phase_rows.sample(n=n_keep, random_state=SEED))
fe_oxide, background = downsampled

# Every row that does not belong to a majority phase is kept as-is
minority_rows = raw_df[~raw_df['Mineral Phase'].isin(MAJORITY_CLASSES)]

# Concatenate in the order (minority rows, Fe-Oxide, Background) so the
# seeded shuffle below is deterministic for a given input file.
df = pd.concat([minority_rows, fe_oxide, background], axis=0)
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

df.to_csv('10_combined_shuffled.csv', index=False)

In [3]:
# Rows per mineral phase in the rebalanced frame (value_counts lists the most frequent first)
df['Mineral Phase'].value_counts()

Fe-Oxide            7000
Background          7000
Ilmenite            5808
Epidote             5193
AlFe-Silicate       4306
Amphibole           3984
AlMn-Silicate       3864
Titanite            1768
Fe-Silicate         1229
Fe-Sulphide         1079
Rutile               611
Unknown              429
Monazite             408
AlK-Silicate         338
Tourmaline           204
Quartz               182
Zircon               179
Al-Silicate          124
AlCa-Silicate         69
RM ONLY               56
Xenotime              35
Zn-Oxide              33
Chromite              22
Al-Oxide              18
Cassiterite           15
Barite                15
REE-Silicate          14
Celestine             13
Mn-Oxide              12
Leucoxene             12
AlFe-Silicate_Zn      10
Apatite                6
Fe-Sulphide_Zn         5
Thorite                2
Fe-sulphide_Pb         1
MgAl-Oxide             1
Name: Mineral Phase, dtype: int64

In [5]:
# (rows, columns) of the rebalanced, shuffled frame
df.shape

(44045, 4101)

## Balanced Randomly Selected Testing Set

In [14]:
import pandas as pd
import os

# Build a class-balanced set by drawing the same number of rows from each
# per-label CSV found in the data directory.
DATA_DIR = "E:\\CP_processed_data"  # folder expected to hold one "<label>.csv" per mineral phase
ROWS_PER_CLASS = 3000               # rows drawn from every label file
SAMPLE_SEED = 1                     # fixed seed so the draw is reproducible

labels = [
    "Fe-Oxide", "Background", "Ilmenite", "Epidote", "Amphibole", "AlFe-Silicate", "Fe-Silicate",
    "Titanite", "AlMn-Silicate", "Rutile", "Zircon", "Monazite", "Quartz", "Cassiterite",
    "AlK-Silicate", "Barite", "Al-Silicate", "Chromite", "Leucoxene", "Fe-Sulphide",
    "AlCa-Silicate", "Apatite", "Xenotime", "Carbonate"
]

# Construct expected file names based on labels
expected_files = [label + ".csv" for label in labels]

# List all regular files in the data directory
files = [f for f in os.listdir(DATA_DIR) if os.path.isfile(os.path.join(DATA_DIR, f))]

# Keep only the files we expect, in `labels` order. os.listdir returns names
# in arbitrary order, so iterating over the listing would make the processing
# order depend on the filesystem.
available = set(files)
relevant_files = [f for f in expected_files if f in available]

# Surface labels that have no file instead of dropping them silently
missing_files = [f for f in expected_files if f not in available]
if missing_files:
    print(f"Warning: no file found for: {', '.join(missing_files)}")

# Collect one sampled frame per file and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated data every iteration.
sampled_frames = []
for file in relevant_files:
    path = os.path.join(DATA_DIR, file)
    print(f"Processing file: {file}")

    # Read the entire file
    data = pd.read_csv(path)

    # Sampling is without replacement, so the file needs at least ROWS_PER_CLASS rows
    if len(data) >= ROWS_PER_CLASS:
        sampled_frames.append(data.sample(n=ROWS_PER_CLASS, replace=False, random_state=SAMPLE_SEED))
        print(f"Sampled {ROWS_PER_CLASS} rows from {file}.")
    else:
        print(f"Warning: {file} has less than {ROWS_PER_CLASS} rows. Skipping.")

# pd.concat raises on an empty list, so fall back to an empty frame when
# nothing qualified. ignore_index=True yields a fresh 0..n-1 index.
all_data = pd.concat(sampled_frames, ignore_index=True) if sampled_frames else pd.DataFrame()

print("\nProcessing complete. Final dataset shape:", all_data.shape)

Processing file: Fe-Oxide.csv
Sampled 3000 rows from Fe-Oxide.csv.
Processing file: Background.csv
Sampled 3000 rows from Background.csv.
Processing file: Ilmenite.csv
Sampled 3000 rows from Ilmenite.csv.
Processing file: Epidote.csv
Sampled 3000 rows from Epidote.csv.
Processing file: Amphibole.csv
Sampled 3000 rows from Amphibole.csv.
Processing file: AlFe-Silicate.csv
Sampled 3000 rows from AlFe-Silicate.csv.
Processing file: Fe-Silicate.csv
Sampled 3000 rows from Fe-Silicate.csv.
Processing file: Titanite.csv
Sampled 3000 rows from Titanite.csv.
Processing file: AlMn-Silicate.csv
Sampled 3000 rows from AlMn-Silicate.csv.
Processing file: Rutile.csv
Sampled 3000 rows from Rutile.csv.
Processing file: Zircon.csv
Sampled 3000 rows from Zircon.csv.
Processing file: Monazite.csv
Sampled 3000 rows from Monazite.csv.
Processing file: Quartz.csv
Sampled 3000 rows from Quartz.csv.
Processing file: Cassiterite.csv
Sampled 3000 rows from Cassiterite.csv.
Processing file: AlK-Silicate.csv
Samp

In [17]:
# Reduce the spectrum: keep channels CH_121..CH_2330 and sum them in
# consecutive groups of 8, then shuffle the rows.
FIRST_CHANNEL = 121
LAST_CHANNEL = 2330
GROUP_SIZE = 8
SHUFFLE_SEED = 42  # fixed seed so the shuffled order is reproducible

# Retain the desired columns. The result goes into a new name so `all_data`
# is left untouched and this cell can be re-run without a KeyError.
desired_columns = [f"CH_{i}" for i in range(FIRST_CHANNEL, LAST_CHANNEL + 1)] + ['Mineral Phase']
selected = all_data[desired_columns]

# Aggregate every GROUP_SIZE columns. Only complete groups are formed: the
# last group starts at CH_2321 and ends at CH_2328, so the two trailing
# channels CH_2329 and CH_2330 are not part of any aggregate.
# The range stop is (last valid start + 1) = LAST_CHANNEL - GROUP_SIZE + 2 = 2324.
aggregated_data = []
aggregated_columns = {}
for i in range(FIRST_CHANNEL, LAST_CHANNEL - GROUP_SIZE + 2, GROUP_SIZE):
    col_names = [f"CH_{j}" for j in range(i, i + GROUP_SIZE)]
    agg_name = f"Agg_{i}_{i + GROUP_SIZE - 1}"
    aggregated_columns[agg_name] = selected[col_names].sum(axis=1)
    aggregated_data.append(agg_name)

# Keep only aggregated columns and 'Mineral Phase'. Building the frame in one
# step avoids inserting hundreds of columns one at a time into a sliced frame.
aggregated_columns['Mineral Phase'] = selected['Mineral Phase']
all_data_aggregated = pd.DataFrame(aggregated_columns)

# Shuffle the dataset
all_data_shuffled = all_data_aggregated.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [18]:
# Summarise the shuffled set: overall dimensions first, then rows per mineral phase.
print(
    f"Shape of the dataset: {all_data_shuffled.shape}",
    "\nLabel Counts:",
    all_data_shuffled['Mineral Phase'].value_counts(),
    sep="\n",
)

Shape of the dataset: (72000, 277)

Label Counts:
Carbonate        3000
Xenotime         3000
Zircon           3000
AlFe-Silicate    3000
Ilmenite         3000
Titanite         3000
Apatite          3000
Amphibole        3000
AlCa-Silicate    3000
Leucoxene        3000
Monazite         3000
Cassiterite      3000
Al-Silicate      3000
Epidote          3000
Rutile           3000
Fe-Silicate      3000
AlK-Silicate     3000
Quartz           3000
Background       3000
AlMn-Silicate    3000
Barite           3000
Chromite         3000
Fe-Sulphide      3000
Fe-Oxide         3000
Name: Mineral Phase, dtype: int64


In [19]:
# Write the shuffled, aggregated set to disk; index=False leaves the row index out of the CSV
all_data_shuffled.to_csv('balanced_testing_set.csv', index=False)