# Machine Learning Model for Efficiency in Gold Mining

## Project Overview

In [107]:
# Import Standard Libraries
import warnings

# Import Third Party Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Prepare Data

In [108]:
def load_data(file_name):
    """Load a CSV dataset, preferring the local ``data/`` folder.

    Parameters
    ----------
    file_name : str
        Name of the CSV file, e.g. ``'gold_recovery_train.csv'``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If the file is found neither under ``data/`` nor under ``/datasets/``.
    """
    try:  # Try to load the data locally from the data folder
        data = pd.read_csv(f'data/{file_name}')
    except FileNotFoundError:  # Fall back to the TripleTen Hub location
        # Only a *missing* local file triggers the fallback. Any other problem
        # (corrupt CSV, permission error, user interrupt) is raised as-is
        # instead of being hidden behind a second, unrelated error.
        data = pd.read_csv(f'/datasets/{file_name}')

    return data

In [109]:
# Read each dataset and label it straight away so the label sits next to its source file.
# NOTE: `.name` is an ad-hoc attribute on these specific objects; frames derived
# from them (filtered, copied, merged, ...) will not carry it.
data_full = load_data('gold_recovery_full.csv')
data_full.name = 'data_full'

data_train = load_data('gold_recovery_train.csv')
data_train.name = 'data_train'

data_test = load_data('gold_recovery_test.csv')
data_test.name = 'data_test'


In [110]:
# Columns the model has to predict
target_columns = [
    'rougher.output.recovery',
    'final.output.recovery',
]

# All loaded frames, collected so later cells can loop over them.
# The list holds references, so changes made through it affect the originals.
data_sets = [
    data_full,
    data_train,
    data_test,
]

In [111]:
# Convert the date columns to datetime.
# The assignment replaces the 'date' column on each frame held in data_sets,
# so data_full, data_train and data_test are all modified in place.
# The explicit format matches timestamps such as '2016-01-15 00:00:00'.
# NOTE: the loop variable `data` stays bound at notebook scope after the loop.
for data in data_sets:
    data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d %H:%M:%S')

## Check that recovery is calculated correctly.  
Using the training set, calculate recovery for the rougher.output.recovery feature. Find the MAE between your calculations and the feature values. Provide findings.

$$\text{Recovery} = \frac{C \times (F - T)}{F \times (C - T)} \times 100\%$$

where:

- $C$ — share of gold in the concentrate right after flotation
- $F$ — share of gold in the feed before flotation
- $T$ — share of gold in the rougher tails right after flotation

--------
In terms of the dataset columns:

```
Recovery = (rougher.output.concentrate_au * (rougher.input.feed_au - rougher.output.tail_au))
         / (rougher.input.feed_au * (rougher.output.concentrate_au - rougher.output.tail_au))
         * 100
```

In [112]:
def calc_rough_recovery(row):
    """Return the rougher-stage gold recovery, in percent, for one record.

    Implements ``C * (F - T) / (F * (C - T)) * 100`` where

    * ``C`` is ``row['rougher.output.concentrate_au']``,
    * ``F`` is ``row['rougher.input.feed_au']``,
    * ``T`` is ``row['rougher.output.tail_au']``.

    ``row`` only needs to support lookup by those three keys.
    """
    concentrate = row['rougher.output.concentrate_au']
    feed = row['rougher.input.feed_au']
    tail = row['rougher.output.tail_au']

    numerator = concentrate * (feed - tail)
    denominator = feed * (concentrate - tail)

    return (numerator / denominator) * 100

In [113]:
def compare_rough_recovery(df):
    """Return the MAE between recorded and recalculated rougher recovery.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``rougher.output.concentrate_au``,
        ``rougher.input.feed_au``, ``rougher.output.tail_au`` and
        ``rougher.output.recovery``. The frame is not modified.

    Returns
    -------
    float
        Mean absolute difference between ``rougher.output.recovery`` and the
        value recomputed by ``calc_rough_recovery``, over the rows that have
        no NaN in any of the four columns.

    Warns
    -----
    UserWarning
        If any rows contain NaN in the four columns; those rows are dropped
        before the comparison.
    """
    required_columns = [
        'rougher.output.concentrate_au',
        'rougher.input.feed_au',
        'rougher.output.tail_au',
        'rougher.output.recovery',
    ]

    # Narrow the df to the necessary columns
    subset = df[required_columns]

    # Get count of rows with NaN values
    num_rows_with_nan = subset.isnull().any(axis=1).sum()
    if num_rows_with_nan > 0:
        warnings.warn(f'There are {num_rows_with_nan} rows with NaN values!')

        # Drop rows with NaN values
        subset = subset.dropna()

    # Calculate the recovery for every row in one vectorised call.
    # calc_rough_recovery only does column lookups and elementwise arithmetic,
    # so passing the whole frame yields a Series aligned with subset's index.
    # Keeping the result in its own Series (rather than writing a new column
    # onto the slice) avoids pandas' SettingWithCopyWarning.
    calculated_recovery = calc_rough_recovery(subset)

    # Calculate the mean absolute error
    mae = np.abs(subset['rougher.output.recovery'] - calculated_recovery).mean()

    return mae


In [114]:
# Report the MAE between the recorded rougher.output.recovery values and the
# values recomputed from the formula, using the training set.
print(f'The MAE for the rougher.output.recovery column in the {data_train.name} dataframe is {compare_rough_recovery(data_train)}')

The MAE for the rougher.output.recovery column in the data_train dataframe is 9.210911277458828e-15




The MAE between the recorded value for rougher.output.recovery and the calculated amount is very small, so the recorded recovery agrees with the formula. However, there are still a lot of missing values in the dataset. I'd like to fill these missing values and then check again.

## Analyze the features not available in the test set.
What are these parameters?  
What is their type?

In [115]:
# Print column count for each dataset
# (the label comes from the ad-hoc `.name` attribute set on each frame after loading)
for df in data_sets:
    print(f'{df.name} column count: {len(df.columns)}')

data_full column count: 87
data_train column count: 87
data_test column count: 53


The test dataset is missing 34 columns.

In [116]:
# The full and train datasets must expose the same columns, in the same order
assert list(data_full.columns) == list(data_train.columns)

# The row count of data_full must equal the combined row count of data_train and data_test
assert len(data_full) == len(data_train) + len(data_test)

In [117]:
# Find the columns that are missing in the test dataset
# (column labels present in data_train but absent from data_test)
missing_cols = data_train.columns.difference(data_test.columns)

# Print the types of the columns that are missing in the test dataset
# (dtypes are read from data_train, since data_test does not have these columns)
for col in missing_cols:
    print(f'{col} type: {data_train[col].dtype}')

final.output.concentrate_ag type: float64
final.output.concentrate_au type: float64
final.output.concentrate_pb type: float64
final.output.concentrate_sol type: float64
final.output.recovery type: float64
final.output.tail_ag type: float64
final.output.tail_au type: float64
final.output.tail_pb type: float64
final.output.tail_sol type: float64
primary_cleaner.output.concentrate_ag type: float64
primary_cleaner.output.concentrate_au type: float64
primary_cleaner.output.concentrate_pb type: float64
primary_cleaner.output.concentrate_sol type: float64
primary_cleaner.output.tail_ag type: float64
primary_cleaner.output.tail_au type: float64
primary_cleaner.output.tail_pb type: float64
primary_cleaner.output.tail_sol type: float64
rougher.calculation.au_pb_ratio type: float64
rougher.calculation.floatbank10_sulfate_to_au_feed type: float64
rougher.calculation.floatbank11_sulfate_to_au_feed type: float64
rougher.calculation.sulfate_to_au_concentrate type: float64
rougher.output.concentrate_a

In addition to the expected missing target values (rougher.output.recovery and final.output.recovery), the test data set is missing 32 other columns. These values are not known at the time of prediction and should therefore be dropped from the training data set.

In [118]:
# print missing value count for each data set
# (isnull().sum() counts NaNs per column; the second sum() totals them across the whole frame)
for df in data_sets:
    print(f'{df.name} missing value count:')
    print(df.isnull().sum().sum())

data_full missing value count:
36587
data_train missing value count:
30320
data_test missing value count:
2360


## Analyze the Data

In [119]:
# Print the min and max date for each dataset
# (uses the 'date' column, which an earlier cell converted to datetime)
for df in data_sets:
    print(f'{df.name} min date: {df["date"].min()}, max date: {df["date"].max()}')

data_full min date: 2016-01-15 00:00:00, max date: 2018-08-18 10:59:59
data_train min date: 2016-01-15 00:00:00, max date: 2018-08-18 10:59:59
data_test min date: 2016-09-01 00:59:59, max date: 2017-12-31 23:59:59
