In [None]:
# This script trains the respective models using the data prepared from DataPreparation.ipynb.
# There are 2 parts to this
#
# A) We train & evaluate using Classification Models with Target['T']
#    to predict the age of volcanic rocks.
#    Models to be tested are: Logistic Regression (Baseline); Random Forest and 1 Neural Network
#    Metrics used are (accuracy_score, F1_score). Precision and Recall - are produced in the Classification Report
#    Our performance metric for (A) is the F1-score because it gives a balance between precision and recall.
#
# B) We will extend the training and evaluation further by using Linear Regression with Target['Mg#']
#    to determine which dataset's elements (EMPA or Laser) have a stronger influence on Mg#.
#    Why Mg#? - Mg# is mathematically derived from other elements and it has some influence on the age of the rocks.
#    Metrics used are (r2_score and mean_squared_error).  Metric focus will be r2_score.

In [None]:
# Mount Google Drive under /content/drive; the data and results paths used
# further down in this notebook all live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

import numpy as np
import pandas as pd
import warnings
import os

from pathlib import Path

from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score
)

# Reproducibility setup for the neural-network training further down:
# Python's `random`, NumPy and TensorFlow are all seeded with 42 to reduce
# run-to-run variability of the Neural Network results.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here presumably does not change hashing in this running kernel
# - confirm whether this line is still needed.
os.environ['PYTHONHASHSEED'] = '42'
os.environ['TF_DETERMINISTIC_OPS'] = '1'   # best-effort deterministic ops; set before TensorFlow is imported below
import random
random.seed(42)
np.random.seed(42)

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

#
# Seed TensorFlow's global random generator with 42 as well,
# so that a full run of this notebook starts from the same random state every time.
#
tf.random.set_seed(42)

Mounted at /content/drive


In [None]:
# Folder layout on the mounted Drive: prepared inputs plus one results folder per model
ROOT = Path("/content/drive/MyDrive/Intro2Prog")
DATA_PREP = ROOT / "Data_Prepared"
RESULTS = ROOT / "Results"
LR_DIR, RF_DIR, NN_DIR, REG_DIR = (
    RESULTS / folder
    for folder in ("Logistic_Regression", "Random_Forest", "Neural_Network", "Linear_Regression")
)

# Create every output folder up front; folders that already exist are left as they are
for out_dir in (RESULTS, LR_DIR, RF_DIR, NN_DIR, REG_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Accumulators filled by the training cells below
results = []      # performance metrics for (A - Classification)
reg_results = []  # performance metrics for (B - Regression)


In [None]:
def load_split(tag):
    """Load the four prepared arrays of one dataset from DATA_PREP.

    Parameters
    ----------
    tag : str
        Middle part of the file names, e.g. "empa" or "laser_reg".

    Returns
    -------
    tuple of np.ndarray
        (X_train, X_test, y_train, y_test), read from the files
        "X_<tag>_train.npy", "X_<tag>_test.npy", "y_<tag>_train.npy"
        and "y_<tag>_test.npy".

    Raises
    ------
    ValueError
        If the features and the target of a split do not have the same
        number of rows.
    """
    X_train, X_test, y_train, y_test = (
        np.load(DATA_PREP / f"{kind}_{tag}_{split}.npy")
        for kind in ("X", "y")
        for split in ("train", "test")
    )
    # Fail early with a clear message instead of a late error inside model.fit()
    for split, features, target in (("train", X_train, y_train), ("test", X_test, y_test)):
        if len(features) != len(target):
            raise ValueError(
                f"{tag} {split}: X has {len(features)} rows but y has {len(target)}"
            )
    return X_train, X_test, y_train, y_test


# Loading Prepared Data for (A - Classification)
X_empa_train, X_empa_test, y_empa_train, y_empa_test = load_split("empa")      # EMPA dataset
X_laser_train, X_laser_test, y_laser_train, y_laser_test = load_split("laser")  # Laser dataset
X_comb_train, X_comb_test, y_comb_train, y_comb_test = load_split("comb")      # Combined dataset

# Loading Prepared Data for (B - Regression)
X_empa_reg_train, X_empa_reg_test, y_empa_reg_train, y_empa_reg_test = load_split("empa_reg")      # EMPA - full
X_laser_reg_train, X_laser_reg_test, y_laser_reg_train, y_laser_reg_test = load_split("laser_reg")  # Laser
X_comb_reg_train, X_comb_reg_test, y_comb_reg_train, y_comb_reg_test = load_split("comb_reg")      # Combined


In [None]:
###################################################################################################################
#
# Part A - Predicting the Age of Volcanic Rocks setting column['T'] as Target and using ML and Deep Learning Models
# - Training with Logistic Regression for EMPA (653 rows), Laser (189 rows) and Combined (EMPA Reduced (189) + Laser (189))
#
###################################################################################################################

In [None]:
####################################
# Training with Logistic Regression
###################################

# 1. EMPA Dataset
#
# Baseline classifier; max_iter=1000 raises the cap on solver iterations
lr_empa = LogisticRegression(max_iter=1000).fit(X_empa_train, y_empa_train)
y_pred = lr_empa.predict(X_empa_test)

# Text report (per-class precision / recall / F1) followed by the two summary metrics
report = classification_report(y_empa_test, y_pred)
acc, f1 = (
    accuracy_score(y_empa_test, y_pred),
    f1_score(y_empa_test, y_pred, average="weighted"),  # weighted average over the classes
)

print("\nLogistic Regression — EMPA", report, sep="\n")

# Store the report with the other Logistic Regression outputs
(LR_DIR / "EMPA_classification_report.txt").write_text(report)

results.append(["EMPA", "Logistic Regression", acc, f1])



Logistic Regression — EMPA
              precision    recall  f1-score   support

           0       0.59      0.60      0.60        53
           1       0.73      0.85      0.79        52
           2       0.53      0.35      0.42        26

    accuracy                           0.65       131
   macro avg       0.62      0.60      0.60       131
weighted avg       0.64      0.65      0.64       131



In [None]:
#
# 2. Laser Dataset
#
# Same baseline classifier, trained on the Laser split
lr_laser = LogisticRegression(max_iter=1000).fit(X_laser_train, y_laser_train)
y_pred = lr_laser.predict(X_laser_test)

# Text report followed by accuracy and weighted F1 for the summary table
report = classification_report(y_laser_test, y_pred)
acc, f1 = (
    accuracy_score(y_laser_test, y_pred),
    f1_score(y_laser_test, y_pred, average="weighted"),
)

print("\nLogistic Regression — Laser", report, sep="\n")

# Store the report with the other Logistic Regression outputs
(LR_DIR / "Laser_classification_report.txt").write_text(report)

results.append(["Laser", "Logistic Regression", acc, f1])


Logistic Regression — Laser
              precision    recall  f1-score   support

           0       0.87      0.81      0.84        16
           1       0.75      0.90      0.82        10
           2       1.00      0.92      0.96        12

    accuracy                           0.87        38
   macro avg       0.87      0.88      0.87        38
weighted avg       0.88      0.87      0.87        38



In [None]:
#
# 3. Combined Dataset
#
# Same baseline classifier, trained on the Combined split
lr_comb = LogisticRegression(max_iter=1000).fit(X_comb_train, y_comb_train)
y_pred = lr_comb.predict(X_comb_test)

# Text report followed by accuracy and weighted F1 for the summary table
report = classification_report(y_comb_test, y_pred)
acc, f1 = (
    accuracy_score(y_comb_test, y_pred),
    f1_score(y_comb_test, y_pred, average="weighted"),
)

print("\nLogistic Regression — Combined", report, sep="\n")

# Store the report with the other Logistic Regression outputs
(LR_DIR / "Combined_classification_report.txt").write_text(report)

results.append(["Combined", "Logistic Regression", acc, f1])


Logistic Regression — Combined
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        16
           1       1.00      1.00      1.00        10
           2       1.00      0.92      0.96        12

    accuracy                           0.97        38
   macro avg       0.98      0.97      0.98        38
weighted avg       0.98      0.97      0.97        38



In [None]:
##################################################
# Training with Random Forest
##################################################

# 1. Empa Dataset
#
# Forest of 200 decision trees; the fixed random_state makes the fit reproducible
rf_empa = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_empa_train, y_empa_train)
y_pred = rf_empa.predict(X_empa_test)

# Text report followed by accuracy and weighted F1 for the summary table
report = classification_report(y_empa_test, y_pred)
acc, f1 = (
    accuracy_score(y_empa_test, y_pred),
    f1_score(y_empa_test, y_pred, average="weighted"),  # weighted average over the classes
)

print("\nRandom Forest — EMPA", report, sep="\n")

# Store the report with the other Random Forest outputs
(RF_DIR / "EMPA_classification_report.txt").write_text(report)

results.append(["EMPA", "Random Forest", acc, f1])


Random Forest — EMPA
              precision    recall  f1-score   support

           0       0.77      0.77      0.77        53
           1       0.74      0.87      0.80        52
           2       0.82      0.54      0.65        26

    accuracy                           0.76       131
   macro avg       0.78      0.73      0.74       131
weighted avg       0.77      0.76      0.76       131



In [None]:
#
# 2. Laser Dataset
#
# Forest of 200 decision trees trained on the Laser split (fixed random_state for reproducibility)
rf_laser = RandomForestClassifier(n_estimators=200, random_state=42)
rf_laser.fit(X_laser_train, y_laser_train)

y_pred = rf_laser.predict(X_laser_test)

acc = accuracy_score(y_laser_test, y_pred)
f1  = f1_score(y_laser_test, y_pred, average="weighted")

report = classification_report(y_laser_test, y_pred)

# The heading is printed once only (it used to appear twice in the cell output)
print("\nRandom Forest — Laser")
print(report)

# File name kept as-is so existing readers of this report still find it
with open(RF_DIR / "LASER_classification_report.txt", "w") as f:
    f.write(report)

# Use the same "Laser" label as the Logistic Regression and Neural Network rows,
# so the three models group together in classification_summary.csv
results.append(["Laser", "Random Forest", acc, f1])


Random Forest — Laser

Random Forest — LASER
              precision    recall  f1-score   support

           0       0.87      0.81      0.84        16
           1       0.73      0.80      0.76        10
           2       1.00      1.00      1.00        12

    accuracy                           0.87        38
   macro avg       0.86      0.87      0.87        38
weighted avg       0.87      0.87      0.87        38



In [None]:
#
# 3. Combined Dataset
#
# Forest of 200 decision trees trained on the Combined split (fixed random_state for reproducibility)
rf_comb = RandomForestClassifier(n_estimators=200, random_state=42)
rf_comb.fit(X_comb_train, y_comb_train)

y_pred = rf_comb.predict(X_comb_test)

acc = accuracy_score(y_comb_test, y_pred)
f1  = f1_score(y_comb_test, y_pred, average="weighted")

report = classification_report(y_comb_test, y_pred)

# The heading is printed once only (it used to appear twice in the cell output)
print("\nRandom Forest — Combined")
print(report)

with open(RF_DIR / "Combined_classification_report.txt", "w") as f:
    f.write(report)

results.append(["Combined", "Random Forest", acc, f1])


Random Forest — Combined

Random Forest — Combined
              precision    recall  f1-score   support

           0       0.93      0.81      0.87        16
           1       0.75      0.90      0.82        10
           2       1.00      1.00      1.00        12

    accuracy                           0.89        38
   macro avg       0.89      0.90      0.89        38
weighted avg       0.90      0.89      0.90        38



In [None]:
#####################################################
# Training with Neural Network
#####################################################

# Setting up the parameters for Neural Network

def build_nn(input_dim, num_classes=3, learning_rate=0.001):
    """Create and compile a small feed-forward classifier.

    Architecture: input -> Dense(64, relu) -> Dense(32, relu) -> Dense(num_classes, softmax).

    Parameters
    ----------
    input_dim : int
        Number of numeric input features per sample.
    num_classes : int, optional
        Number of output classes (size of the softmax layer). Defaults to 3,
        the value that was previously hard-coded.
    learning_rate : float, optional
        Learning rate passed to the Adam optimizer. Defaults to 0.001.

    Returns
    -------
    Sequential
        The compiled, untrained model. It uses the
        "sparse_categorical_crossentropy" loss and reports "accuracy"
        as its metric.
    """
    model = Sequential([
        Input(shape=(input_dim,)),                 # one value per input feature
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(num_classes, activation="softmax"),  # one probability per class
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),  # updates the weights during training
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],                         # fraction of correct predictions
    )
    return model

In [None]:
#
# 1. Empa Dataset
#
# Re-seed Python, NumPy and TensorFlow so this cell gives the same result
# whether it is run on its own or as part of "Run All"
keras.utils.set_random_seed(42)

# Build the network with as many inputs as the training data has feature columns
model = build_nn(X_empa_train.shape[1])

# 50 passes over the training data in mini-batches of 32 samples; verbose=0 silences the log
model.fit(
    X_empa_train, y_empa_train,
    epochs=50, batch_size=32, verbose=0
)

# Keras' own evaluation on the unseen test data:
# loss = value of the loss function, eval_acc = fraction of correct predictions
loss, eval_acc = model.evaluate(X_empa_test, y_empa_test, verbose=0)
print("\nNeural Network — EMPA")
print("Accuracy:", eval_acc)

# Predicted class = index of the highest softmax probability;
# verbose=0 keeps the progress bar out of the cell output
y_pred = model.predict(X_empa_test, verbose=0).argmax(axis=1)

# Metrics stored in the summary table (same functions as for the other models)
acc = accuracy_score(y_empa_test, y_pred)
f1  = f1_score(y_empa_test, y_pred, average="weighted")

report = classification_report(y_empa_test, y_pred)
print(report)

with open(NN_DIR / "EMPA_classification_report.txt", "w") as f:
    f.write(report)

results.append(["EMPA", "Neural Network", acc, f1])


Neural Network — EMPA
Accuracy: 0.8015267252922058
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        53
           1       0.76      0.92      0.83        52
           2       1.00      0.58      0.73        26

    accuracy                           0.80       131
   macro avg       0.85      0.76      0.79       131
weighted avg       0.82      0.80      0.80       131



In [None]:
#
# 2. Laser Dataset
#
# Re-seed Python, NumPy and TensorFlow so this cell gives the same result
# whether it is run on its own or as part of "Run All"
keras.utils.set_random_seed(42)

# Build the network with as many inputs as the training data has feature columns
model = build_nn(X_laser_train.shape[1])

# 50 passes over the training data in mini-batches of 32 samples; verbose=0 silences the log
model.fit(
    X_laser_train, y_laser_train,
    epochs=50, batch_size=32, verbose=0
)

# Keras' own evaluation on the unseen test data:
# loss = value of the loss function, eval_acc = fraction of correct predictions
loss, eval_acc = model.evaluate(X_laser_test, y_laser_test, verbose=0)
print("\nNeural Network — Laser")
print("Accuracy:", eval_acc)

# Predicted class = index of the highest softmax probability;
# verbose=0 keeps the progress bar out of the cell output
y_pred = model.predict(X_laser_test, verbose=0).argmax(axis=1)

# Metrics stored in the summary table (same functions as for the other models)
acc = accuracy_score(y_laser_test, y_pred)
f1  = f1_score(y_laser_test, y_pred, average="weighted")

report = classification_report(y_laser_test, y_pred)
print(report)

with open(NN_DIR / "Laser_classification_report.txt", "w") as f:
    f.write(report)

results.append(["Laser", "Neural Network", acc, f1])


Neural Network — Laser
Accuracy: 0.8947368264198303
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
              precision    recall  f1-score   support

           0       0.93      0.81      0.87        16
           1       0.75      0.90      0.82        10
           2       1.00      1.00      1.00        12

    accuracy                           0.89        38
   macro avg       0.89      0.90      0.89        38
weighted avg       0.90      0.89      0.90        38



In [None]:
#
# 3. Combined
#
# Re-seed Python, NumPy and TensorFlow so this cell gives the same result
# whether it is run on its own or as part of "Run All"
keras.utils.set_random_seed(42)

# Build the network with as many inputs as the training data has feature columns
model = build_nn(X_comb_train.shape[1])

# 50 passes over the training data in mini-batches of 32 samples; verbose=0 silences the log
model.fit(
    X_comb_train, y_comb_train,
    epochs=50, batch_size=32, verbose=0
)

# Keras' own evaluation on the unseen test data:
# loss = value of the loss function, eval_acc = fraction of correct predictions
loss, eval_acc = model.evaluate(X_comb_test, y_comb_test, verbose=0)
print("\nNeural Network — Combined")
print("Accuracy:", eval_acc)

# Predicted class = index of the highest softmax probability;
# verbose=0 keeps the progress bar out of the cell output
y_pred = model.predict(X_comb_test, verbose=0).argmax(axis=1)

# Metrics stored in the summary table (same functions as for the other models)
acc = accuracy_score(y_comb_test, y_pred)
f1  = f1_score(y_comb_test, y_pred, average="weighted")

report = classification_report(y_comb_test, y_pred)
print(report)

with open(NN_DIR / "Combined_classification_report.txt", "w") as f:
    f.write(report)

results.append(["Combined", "Neural Network", acc, f1])




Neural Network — Combined
Accuracy: 0.9210526347160339
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 59ms/step



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
              precision    recall  f1-score   support

           0       0.88      0.94      0.91        16
           1       0.91      1.00      0.95        10
           2       1.00      0.83      0.91        12

    accuracy                           0.92        38
   macro avg       0.93      0.92      0.92        38
weighted avg       0.93      0.92      0.92        38



Saving all the Classification metrics results to a csv file.

In [None]:
################################################################
# Saving the Consolidated Classification performance metrics to a csv file
################################################################

# Build the summary table from the rows collected by the training cells.
# Re-running a training cell appends a second row for the same (Dataset, Model)
# pair to `results`; only the most recent row per pair is kept, so the CSV
# never contains duplicates.
results_df = (
    pd.DataFrame(results, columns=["Dataset", "Model", "Accuracy", "F1_score"])
    .drop_duplicates(subset=["Dataset", "Model"], keep="last")
    .reset_index(drop=True)
)

results_df.to_csv(RESULTS / "classification_summary.csv", index=False)

In [None]:
# #########################################################################################################
# Part B - Determining which dataset has a greater influence on the Mg# value using Linear Regression.
# - Mg# is a mathematically derived value from elements and has an influence
#   on the age of volcanic rocks.
# - 3 Datasets (similar to the Classification ones but not identical because of the different Target)
#   have been prepared from 2_DataPreparation:
#   - EMPA (653 rows)
#   - Laser (189 rows)
#   - Combined EMPA + Laser (189 rows)
##########################################################################################################

In [None]:
################################################
# Training with Linear Regression
###############################################

#
# 1. EMPA Dataset
#
# Linear regression fitted on the EMPA regression split
lr = LinearRegression().fit(X_empa_reg_train, y_empa_reg_train)
y_pred = lr.predict(X_empa_reg_test)

# mse  : average of the squared prediction errors, penalises large errors (lower is better)
# rmse : square root of mse, i.e. the error in the same units as the target (lower is better)
# r2   : how much of the target's variation the model explains
#        (1 = perfect, 0 = no better than guessing, negative = very bad)
mse = mean_squared_error(y_empa_reg_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_empa_reg_test, y_pred)

print("\nLinear Regression — EMPA")
for label, value in (("RMSE:", rmse), ("R²:", r2)):
    print(label, value)

reg_results.append(["EMPA", rmse, r2])



Linear Regression — EMPA
RMSE: 0.006408695957192188
R²: 0.9946366544183418


In [None]:
#
# 2. Laser Dataset
#
# Linear regression fitted on the Laser regression split
lr = LinearRegression().fit(X_laser_reg_train, y_laser_reg_train)
y_pred = lr.predict(X_laser_reg_test)

# RMSE (error in the units of the target) and R² (explained variation) on the test split
mse = mean_squared_error(y_laser_reg_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_laser_reg_test, y_pred)

# NOTE(review): the recorded output of this cell shows a negative R² (about -1.29)
# - worth checking the Laser regression features/target before interpreting it.
print("\nLinear Regression — LASER")
for label, value in (("RMSE:", rmse), ("R²:", r2)):
    print(label, value)

reg_results.append(["LASER", rmse, r2])



Linear Regression — LASER
RMSE: 0.1678276335271874
R²: -1.2856580539871176


In [None]:
#
# 3. Combined Dataset
#
# Linear regression fitted on the Combined regression split
lr = LinearRegression().fit(X_comb_reg_train, y_comb_reg_train)
y_pred = lr.predict(X_comb_reg_test)

# RMSE (error in the units of the target) and R² (explained variation) on the test split
mse = mean_squared_error(y_comb_reg_test, y_pred)
rmse, r2 = np.sqrt(mse), r2_score(y_comb_reg_test, y_pred)

print("\nLinear Regression — COMBINED")
for label, value in (("RMSE:", rmse), ("R²:", r2)):
    print(label, value)

reg_results.append(["COMBINED", rmse, r2])



Linear Regression — COMBINED
RMSE: 0.020225884509205774
R²: 0.966802959918767


In [None]:
#############################################################
#  Saving the Regression performance metrics to a csv file
#############################################################

# Build the summary table from the rows collected by the regression cells.
# Re-running a regression cell appends a second row for the same dataset to
# `reg_results`; only the most recent row per dataset is kept, so the CSV
# never contains duplicates.
reg_df = (
    pd.DataFrame(reg_results, columns=["Dataset", "RMSE", "R2"])
    .drop_duplicates(subset=["Dataset"], keep="last")
    .reset_index(drop=True)
)

reg_df.to_csv(REG_DIR / "linear_regression_summary.csv", index=False)

print("Linear regression summary saved.")

###############################################################
# End of Stage 2 - Training the Models
###############################################################

Linear regression summary saved.
