# Loading the dataset

In [23]:
import pandas
import numpy as np

df = pandas.read_csv('data.csv', index_col='NUM')

In [24]:
del df["NAME"]
del df["COUNTRY"]

In [72]:
from sklearn.model_selection import train_test_split, KFold, ShuffleSplit, cross_val_score, cross_validate

data, test = train_test_split(
    df,
    test_size = 0.2,
    random_state = 5)

print(data.shape)
print(test.shape)

(41, 6)
(11, 6)


In [73]:
X = data.loc[:,['Mine Annual Production (Million Tonne)',
       'Stripping Ratio', 'Mill Annual Production (Thousand Tonne)',
       'Reserve Mean Grade % Cu EQU.', 'LOM']]
y = data["CAPEX US$ millions"]

test_X = data.loc[:,['Mine Annual Production (Million Tonne)',
       'Stripping Ratio', 'Mill Annual Production (Thousand Tonne)',
       'Reserve Mean Grade % Cu EQU.', 'LOM']]
test_y = data["CAPEX US$ millions"]

In [113]:
num_folds = 6

from sklearn.model_selection import ShuffleSplit

# K-Fold cross validation
kf = KFold(n_splits=num_folds, shuffle=True, random_state=6)

# Monte Carlo cross validation
random_splits = ShuffleSplit(n_splits=40, test_size=0.2)

for c in random_splits.split(data):
    print(c)




(array([ 1, 13,  6, 39,  4, 34, 26, 10, 30, 23, 38, 25, 36, 17,  8, 24, 18,
       19, 27, 32, 21, 28,  3, 12,  2,  9,  0,  7, 11, 35, 33, 37]), array([22, 14, 20, 29, 16,  5, 40, 31, 15]))
(array([11,  1, 24, 18, 31, 36, 39, 30,  2, 38, 12,  9, 32, 37,  0, 33,  8,
        3,  7, 15, 17, 40, 28, 35,  6, 26, 13,  5, 25,  4, 20, 27]), array([29, 16, 14, 10, 23, 34, 21, 19, 22]))
(array([38, 39,  0, 22, 13,  7,  5, 17, 40, 25,  6, 11, 10, 21, 30, 12, 29,
       34,  1,  4, 15, 32, 33, 16,  3, 31, 24,  9, 28, 35, 27, 14]), array([37, 18, 36, 19, 26, 20, 23,  8,  2]))
(array([17, 34, 28, 27,  5, 22, 25,  8, 33, 13,  9, 10, 15, 16, 35,  0,  1,
       31, 21, 14, 12,  2,  7, 38, 18, 36, 32,  6, 39, 26, 11, 37]), array([23, 20, 30,  3, 24, 19, 29,  4, 40]))
(array([ 1, 23, 21, 26, 18, 16, 25, 39, 24,  6,  8, 30, 33,  0, 32, 36, 35,
        2,  5,  3, 22, 14, 10,  7, 15, 13, 20, 28, 40, 31, 34, 38]), array([17, 29, 11,  9, 37,  4, 27, 19, 12]))
(array([12, 19, 21, 16, 37, 15,  2, 30, 10, 32, 22

# Metrics

In [None]:
# Metrics:
#   RMSE -> Root Mean Squared Error
#   R2 -> Coefficient of Determination
#   MAE -> Mean Absolute Error
#   APE -> Absolute Error

In [106]:
from sklearn import metrics

def RMSE(model, X, y):
    return metrics.root_mean_squared_error(y,model.predict(X))

def R2(model, X, y):
    return metrics.r2_score(y,model.predict(X))

def MAE(model, X, y):
    return metrics.mean_absolute_error(y,model.predict(X))

def APE(model, X, y):
    return metrics.mean_absolute_percentage_error(y,model.predict(X))


def all_scores(model, X, y):
    return {
        "RMSE": RMSE(model, X, y),
        "R2": R2(model, X, y),
        "MAE": MAE(model, X, y),
        "APE": APE(model, X, y)
    }



# Training, Validation, Optimisation

In [103]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor


In [116]:
# RF
rf = RandomForestRegressor()
scores = cross_validate(rf, X, y, cv=kf, scoring=all_scores)
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))

print()
rf = RandomForestRegressor()
scores = cross_validate(rf, X, y, cv=random_splits, scoring=all_scores)
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))


# average the scores for the model
# then use scores for the optimisation algorithms

RMSE 962.5522931395566
R2 -0.02112674944857823
MAE 713.7876587301588
APE 0.4046233844160962

RMSE 1141.7081792163547
R2 0.35469199987395583
MAE 836.5781388888888
APE 0.47315223413065316


In [117]:
# ANN
mlp = MLPRegressor(max_iter=500, random_state=1)
scores = cross_validate(rf, X, y, cv=random_splits, scoring=all_scores)
print("RMSE",np.mean(scores["test_RMSE"]))
print("R2",np.mean(scores["test_R2"]))
print("MAE",np.mean(scores["test_MAE"]))
print("APE",np.mean(scores["test_APE"]))

RMSE 988.980514181296
R2 0.23908870446559752
MAE 757.3432222222222
APE 0.4317066674296779


In [None]:
# SVM

In [None]:
# CART Tree (decision tree)

In [None]:
# Also look at what hyperparameters we can optimise for each.

# Testing

In [None]:
#use optimal hyperparameters to train model, then test

To add later on if time:  
-preprocessing (outliers, etc.)  
-ensemble methods (ANN + SVM)