# Loading the dataset

In [2]:
import pandas
import numpy as np

df = pandas.read_csv('data.csv', index_col='NUM')

In [3]:
del df["NAME"]
del df["COUNTRY"]

In [4]:
from sklearn.model_selection import train_test_split, KFold, ShuffleSplit, cross_val_score, cross_validate

data, test = train_test_split(
    df,
    test_size = 0.2,
    random_state = 5)

print(data.shape)
print(test.shape)

(41, 6)
(11, 6)


In [5]:
X = data.loc[:,['Mine Annual Production (Million Tonne)',
       'Stripping Ratio', 'Mill Annual Production (Thousand Tonne)',
       'Reserve Mean Grade % Cu EQU.', 'LOM']]
y = data["CAPEX US$ millions"]

test_X = data.loc[:,['Mine Annual Production (Million Tonne)',
       'Stripping Ratio', 'Mill Annual Production (Thousand Tonne)',
       'Reserve Mean Grade % Cu EQU.', 'LOM']]
test_y = data["CAPEX US$ millions"]

In [32]:
num_folds = 6

from sklearn.model_selection import ShuffleSplit

# K-Fold cross validation
kf = KFold(n_splits=num_folds, shuffle=True, random_state=6)

# Monte Carlo cross validation
random_splits = ShuffleSplit(n_splits=100, test_size=0.2)

for c in random_splits.split(data):
    print(c)


(array([ 3,  9, 39, 21, 26, 11, 16,  5, 24, 10,  2, 31,  0, 23, 15, 25, 20,
       34, 17,  4, 40, 32, 37, 33,  7,  8, 29, 18, 30, 36, 28,  1]), array([19,  6, 22, 38, 27, 14, 13, 12, 35]))
(array([14, 22, 28,  8,  0, 35, 29, 31,  5, 36, 25,  2, 19, 10, 11,  9, 38,
       15,  7, 34, 27, 33, 40, 24, 17,  1, 21, 18,  6, 16,  3, 12]), array([26, 13, 20, 39, 37, 32, 30, 23,  4]))
(array([13,  5,  4, 20, 19,  1, 24, 14, 34, 33, 32, 38, 28, 30, 26,  3,  9,
       18, 17, 36, 27, 16,  0,  2,  7, 29, 12, 40, 35, 37,  6, 11]), array([10, 31, 23, 25,  8, 21, 22, 39, 15]))
(array([ 4, 35,  1,  8, 26, 11,  0, 19, 29, 20, 28, 31, 21,  9,  2, 22, 16,
       38, 24, 36, 14, 13,  7, 17,  3, 40, 34, 27, 30, 18, 15, 10]), array([23, 33, 37, 32,  5,  6, 39, 12, 25]))
(array([38, 34,  7, 18,  9,  3, 37, 31,  0, 11,  6,  8,  5, 14, 19, 28, 13,
       23, 30,  4, 26, 27, 22, 24, 21,  1,  2, 32, 25, 33, 16, 35]), array([10, 12, 39, 40, 36, 29, 17, 20, 15]))
(array([ 8,  7,  9, 17, 34,  1, 29, 16, 24, 31, 19

# Metrics

In [7]:
# Metrics:
#   RMSE -> Root Mean Squared Error
#   R2 -> Coefficient of Determination
#   MAE -> Mean Absolute Error
#   APE -> Absolute Error

In [8]:
from sklearn import metrics

def RMSE(model, X, y):
    return metrics.root_mean_squared_error(y,model.predict(X))

def R2(model, X, y):
    return metrics.r2_score(y,model.predict(X))

def MAE(model, X, y):
    return metrics.mean_absolute_error(y,model.predict(X))

def APE(model, X, y):
    return metrics.mean_absolute_percentage_error(y,model.predict(X))


def all_scores(model, X, y):
    return {
        "RMSE": RMSE(model, X, y),
        "R2": R2(model, X, y),
        "MAE": MAE(model, X, y),
        "APE": APE(model, X, y)
    }



# Models

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from pyswarms.single.global_best import GlobalBestPSO

In [10]:
# average the scores for each model
# then use scores for the optimisation algorithms

In [11]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# ANN
mlp = MLPRegressor(max_iter=1000, hidden_layer_sizes=(100), random_state=1, early_stopping=False)
scores = cross_validate(mlp, X, y, cv=kf, scoring=all_scores, return_train_score=True)
print("Training Set")
print("RMSE",np.mean(scores["train_RMSE"]))
print("R2",np.mean(scores["train_R2"]))
print("MAE",np.mean(scores["train_MAE"]))
print("APE",np.mean(scores["train_APE"]))
print("Validation Set")
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))

print()
mlp = MLPRegressor(max_iter=1000, hidden_layer_sizes=(100), random_state=1, early_stopping=False)
scores = cross_validate(mlp, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
print("Training Set")
print("RMSE",np.mean(scores["train_RMSE"]))
print("R2",np.mean(scores["train_R2"]))
print("MAE",np.mean(scores["train_MAE"]))
print("APE",np.mean(scores["train_APE"]))
print("Validation Set")
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))


# Hyperparameters to optimise:
#   -max iterations
#   -number of hidden layers
#   -size of layers
#   -learning rate



Training Set
RMSE 919.7697105293172
R2 0.6545852492099224
MAE 654.1766823396114
APE 0.37924304271351206
Validation Set
RMSE 878.2003256651225
R2 0.22373541823229356
MAE 690.6393555505183
APE 0.3953575960967172

Training Set
RMSE 930.2105852191446
R2 0.6490401688949283
MAE 661.1314386551699
APE 0.3806507293222564
Validation Set
RMSE 902.3888277449996
R2 0.359912696357055
MAE 695.6608898847296
APE 0.42712387295881554


In [36]:
# RF
rf = RandomForestRegressor()
scores = cross_validate(rf, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
print(scores.keys())
print("Training Set")
print("RMSE",np.mean(scores["train_RMSE"]))
print("R2",np.mean(scores["train_R2"]))
print("MAE",np.mean(scores["train_MAE"]))
print("APE",np.mean(scores["train_APE"]))
print("Validation Set")
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))

# Hyperparameters to optimise:
#   -number of estimators
#   -max depth
#   -minimum samples per leaf

dict_keys(['fit_time', 'score_time', 'test_RMSE', 'train_RMSE', 'test_R2', 'train_R2', 'test_MAE', 'train_MAE', 'test_APE', 'train_APE'])
Training Set
RMSE 409.9903870924528
R2 0.929113594051547
MAE 294.942178125
APE 0.16682991887835055
Validation Set
RMSE 1157.8105025961754
R2 0.22127966585691283
MAE 861.8720555555556
APE 0.5059031118275349


In [13]:
# SVM
svr = SVR()

scores = cross_validate(svr, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
print("Training Set")
print("RMSE",np.mean(scores["train_RMSE"]))
print("R2",np.mean(scores["train_R2"]))
print("MAE",np.mean(scores["train_MAE"]))
print("APE",np.mean(scores["train_APE"]))
print("Validation Set")
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))


Training Set
RMSE 1680.9288783135519
R2 -0.1287830651351382
MAE 1083.4010553961516
APE 0.5465734183581592
Validation Set
RMSE 1464.7301208912936
R2 -0.1736565505317999
MAE 1001.5715218493958
APE 0.625793844102015


In [14]:
# CART Tree (decision tree)
cart = DecisionTreeRegressor()

scores = cross_validate(cart, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
print("Training Set")
print("RMSE",np.mean(scores["train_RMSE"]))
print("R2",np.mean(scores["train_R2"]))
print("MAE",np.mean(scores["train_MAE"]))
print("APE",np.mean(scores["train_APE"]))
print("Validation Set")
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))

print("Hyperparameters:", cart.get_params())

Training Set
RMSE 0.0
R2 1.0
MAE 0.0
APE 0.0
Validation Set
RMSE 1235.5948308094598
R2 -0.90469499480123
MAE 890.5833333333334
APE 0.47150777764072604
Hyperparameters: {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


# Hyperparameter Optimisation

In [21]:
def testANN(param):
    # max iterations
    # size of layers
    # number of hidden layers
    # learning rate (Scale??)
    mlp = MLPRegressor(max_iter=round(param[0]), hidden_layer_sizes=[round(param[1]) for _ in range(round(param[2]))], learning_rate_init=param[3],random_state=1, early_stopping=False)
    scores = cross_validate(mlp, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
    return np.mean(scores["test_APE"])-np.mean(scores["test_R2"])

def testRF(param):
    # number of estimators
    # max depth
    # minimum samples per leaf
    rf = RandomForestRegressor(n_estimators=round(param[0]), max_depth=round(param[1]), min_samples_leaf=round(param[2]))
    scores = cross_validate(rf, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
    return np.mean(scores["test_APE"])-np.mean(scores["test_R2"])

def testSVR(param):
    # C
    # epsilon?
    svr = SVR(C=param[0], epsilon=param[1])
    scores = cross_validate(svr, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
    return np.mean(scores["test_APE"])-np.mean(scores["test_R2"])

def testDT(param):#CART Tree
    # max depth
    # minimum samples per leaf
    dt = DecisionTreeRegressor(max_depth=round(param[0]), min_samples_leaf=round(param[1]))
    scores = cross_validate(dt, X, y, cv=random_splits, scoring=all_scores, return_train_score=True)
    return np.mean(scores["test_APE"])-np.mean(scores["test_R2"])


def optimiseModel(params, modelfunc=testRF):
    out = np.array([modelfunc(param) for param in params])
    return out

In [24]:

bounds = (np.array([10,  2,  1]),
          np.array([100, 15, 10]))
# bounds:
#   upper + lower bounds of parameter values to test
#   format -> (np array of upper bounds,
#              np array of lower bounds)

options = {'c1': 0.5, 'c2': 0.3, 'w': 0.9}# i think just keep these the same
optimizer = GlobalBestPSO(n_particles=10, dimensions=3, options=options, bounds=bounds)# dimensions = number of parameters to optimise

cost, pos = optimizer.optimize(optimiseModel, 100, verbose=True, modelfunc=testRF) #change this to optimise different models (e.g. testRF, testSVR, testDT)

print(cost)
print("optimal parameters: ",pos)




# --------------------- RF experimentation results ----------------------

# R2 (10 particles, 100 iter)
# 59   6   3
# 57  11   3

# R2 (10 particles, 1000 iter)
# 42  11   3

# R2+APE (10 particles, 500 iter)
# 49  11   2
# 45   9   4

# R2+APE (20 particles, 500 iter)
# 84   9   3
# 61  10   3

# R2+APE (20 particles, 100 iter)
# 84   9   3
# 60  12   2

# -> increased MC crossvalidation n_splits to increase consistency

# -----------------------------------------------------------------------




# could use multiprocessing to speed it up, but then can't use ipynb

2024-04-30 23:44:45,430 - pyswarms.single.global_best - INFO - Optimize for 100 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best: 100%|██████████|100/100, best_cost=-0.15
2024-05-01 00:17:19,300 - pyswarms.single.global_best - INFO - Optimization finished | best cost: -0.15037073823592967, best pos: [59.98480861 12.29954544  1.81777187]


-0.15037073823592967
[59.98480861 12.29954544  1.81777187]


In [38]:
# NEXT: try plotting optimisation
from pyswarms.utils.plotters.plotters import plot_contour
from pyswarms.utils.plotters.formatters import Designer
import matplotlib.animation as animation
from matplotlib import pyplot as plt
from IPython.display import Image

print(len(optimizer.pos_history))
print(optimizer.pos_history[0].shape)

print(optimizer.pos_history[0][:5])

pos_history = []
for pos_list in optimizer.pos_history:
    pos_history.append(np.delete(pos_list,1,1))


print(len(pos_history))
print(pos_history[0].shape)
print(pos_history[0][:5])

#needs to be 2d array
anim = plot_contour(pos_history, designer=Designer(limits=[(10,100),(1,10)]))

anim.save('plot2.gif', writer='imagemagick', fps=10)


100
(20, 3)
[[67.51098057 11.64340192  3.58862353]
 [78.16407114  9.79862685  9.30253853]
 [51.16971792  6.16177102  3.44817495]
 [95.24126042  7.79798048  6.16330602]
 [21.6481137   2.4758367   4.85837461]]
100
(20, 2)
[[67.51098057  3.58862353]
 [78.16407114  9.30253853]
 [51.16971792  3.44817495]
 [95.24126042  6.16330602]
 [21.6481137   4.85837461]]


<IPython.core.display.Javascript object>

2024-05-01 00:40:39,612 - matplotlib.animation - INFO - Animation.save using <class 'matplotlib.animation.ImageMagickWriter'>
2024-05-01 00:40:39,613 - matplotlib.animation - INFO - MovieWriter._run: running command: 'C:\Program Files\ImageMagick-7.1.1-Q16-HDRI\magick.exe' -size 1000x800 -depth 8 -delay 10.0 -loop 0 rgba:- -layers OptimizePlus plot2.gif


# Testing

In [18]:
#use optimal hyperparameters to train model, then test

To add later on if time:  
-preprocessing (outliers, etc.)  
-ensemble methods (ANN + SVM)