In [75]:
import numpy as np
import pandas as pd
from skimage.io import imread, imshow
import periodictable

from pathlib import Path
import re
import json

In [76]:
# Only if on Google Colab

from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

ModuleNotFoundError: No module named 'google'

In [92]:
df = pd.read_csv('https://raw.githubusercontent.com/HackTheSolarSystem/MineralMapping/master/challenge_data/mineral_standards.csv')
df.head(5)

Unnamed: 0,Mg,Ni,Al,Fe,Ca,Cr,P,S,Ti,Si,mineral
0,0,0,0,0,171,0,4,0,459,0,CaTiO3std15
1,0,0,0,0,148,3,2,0,462,1,CaTiO3std15
2,0,2,0,0,141,6,3,0,455,2,CaTiO3std15
3,1,2,2,0,122,6,3,0,502,0,CaTiO3std15
4,0,0,0,0,138,5,5,0,457,1,CaTiO3std15


In [93]:
coeff_df = pd.read_csv('https://raw.githubusercontent.com/HackTheSolarSystem/MineralMapping/master/challenge_data/weight_proportion_coefficients.csv')
coeffs = {}
for row in coeff_df.to_dict('records'):
    coeffs[row["element"]] = row["coefficient"]
coeffs

{'Mg': 0.0011860614084520178,
 'Ni': 0.003291171017028648,
 'Fe': 0.003189374388342769,
 'Ca': 0.0021508137421989873,
 'S': 0.004666715452031336,
 'Ti': 0.0007805407476003858,
 'Si': 0.000982449721881478}

In [96]:
rows = []
for filename in df['mineral'].unique():
  try:
    formula = re.match(r'([A-Za-z0-9]*).*(std)(.*)', filename).groups()[0]
    weights = periodictable.formula(formula).mass_fraction
    weights = dict((str(e), w/coeffs.get(str(e), 1)) for e,w in weights.items())
    weights['mineral_formula'] = formula
    weights['mineral'] = filename
    rows.append(weights)
  except:
    print(filename)

# rutile
weights = periodictable.formula('TiO2').mass_fraction
weights = dict((str(e), w/coeffs.get(str(e), 1)) for e,w in weights.items())
weights['mineral_formula'] = 'TiO2'
weights['mineral'] = 'rutile'
rows.append(weights)

# SCOlv
rows.append({
    'mineral': 'SCOlvstd6', 'mineral_formula': '???', 
    'Si': .1908, 'Fe': .0742/coeffs["Fe"], 'Mg': .298/coeffs["Mg"], 'Mn': .0011, 'Ca': .0002/coeffs["Ca"], 'Ni': .0029/coeffs["Ni"]
})  
  
rows

rutile
SCOlvstd6


[{'Ca': 137.0710531617065,
  'Ti': 451.11071293191725,
  'O': 0.3530754020796921,
  'mineral_formula': 'CaTiO3',
  'mineral': 'CaTiO3std15'},
 {'Fe': 313.5411144126012, 'mineral_formula': 'Fe', 'mineral': 'Fe-num2std9'},
 {'Fe': 199.17760817167232,
  'S': 78.15947672728338,
  'mineral_formula': 'FeS',
  'mineral': 'FeSstd2'},
 {'Fe': 226.87565640050315,
  'O': 0.276408592137781,
  'mineral_formula': 'Fe3O4',
  'mineral': 'Fe3O4std15'},
 {'Ni': 303.84322018696713, 'mineral_formula': 'Ni', 'mineral': 'Nistd9'},
 {'Ni': 196.49521873150846,
  'S': 75.7064866623418,
  'mineral_formula': 'NiS',
  'mineral': 'NiSstd2'},
 {'Ti': 767.8560018211326,
  'O': 0.40065710228908996,
  'mineral_formula': 'TiO2',
  'mineral': 'rutile'},
 {'mineral': 'SCOlvstd6',
  'mineral_formula': '???',
  'Si': 0.1908,
  'Fe': 23.264750689415006,
  'Mg': 251.2517462219205,
  'Mn': 0.0011,
  'Ca': 0.09298806125142219,
  'Ni': 0.8811453385422046}]

In [97]:
weights_df = pd.DataFrame.from_records(rows).fillna(0)
weights_df.columns = [str(i) + '_weight' if str(i)[0].isupper() else str(i) for i in weights_df.columns]
weights_df

Unnamed: 0,Ca_weight,Fe_weight,Mg_weight,Mn_weight,Ni_weight,O_weight,S_weight,Si_weight,Ti_weight,mineral,mineral_formula
0,137.071053,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,451.110713,CaTiO3std15,CaTiO3
1,0.0,313.541114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fe-num2std9,Fe
2,0.0,199.177608,0.0,0.0,0.0,0.0,78.159477,0.0,0.0,FeSstd2,FeS
3,0.0,226.875656,0.0,0.0,0.0,0.276409,0.0,0.0,0.0,Fe3O4std15,Fe3O4
4,0.0,0.0,0.0,0.0,303.84322,0.0,0.0,0.0,0.0,Nistd9,Ni
5,0.0,0.0,0.0,0.0,196.495219,0.0,75.706487,0.0,0.0,NiSstd2,NiS
6,0.0,0.0,0.0,0.0,0.0,0.400657,0.0,0.0,767.856002,rutile,TiO2
7,0.092988,23.264751,251.251746,0.0011,0.881145,0.0,0.0,0.1908,0.0,SCOlvstd6,???


In [98]:
df = df.merge(weights_df, on='mineral')
df.head()

Unnamed: 0,Mg,Ni,Al,Fe,Ca,Cr,P,S,Ti,Si,...,Ca_weight,Fe_weight,Mg_weight,Mn_weight,Ni_weight,O_weight,S_weight,Si_weight,Ti_weight,mineral_formula
0,0,0,0,0,171,0,4,0,459,0,...,137.071053,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,451.110713,CaTiO3
1,0,0,0,0,148,3,2,0,462,1,...,137.071053,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,451.110713,CaTiO3
2,0,2,0,0,141,6,3,0,455,2,...,137.071053,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,451.110713,CaTiO3
3,1,2,2,0,122,6,3,0,502,0,...,137.071053,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,451.110713,CaTiO3
4,0,0,0,0,138,5,5,0,457,1,...,137.071053,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,451.110713,CaTiO3


In [99]:
df.columns

Index(['Mg', 'Ni', 'Al', 'Fe', 'Ca', 'Cr', 'P', 'S', 'Ti', 'Si', 'mineral',
       'Ca_weight', 'Fe_weight', 'Mg_weight', 'Mn_weight', 'Ni_weight',
       'O_weight', 'S_weight', 'Si_weight', 'Ti_weight', 'mineral_formula'],
      dtype='object')

In [100]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [118]:
X = df[['Ca', 'Fe', 'Mg', 'Ni', 'S', 'Si', 'Ti']].values
Y = df['mineral']

X_train = weights_df[['Ca_weight', 'Fe_weight', 'Mg_weight', 'Ni_weight', 'S_weight', 'Si_weight', 'Ti_weight']].values
Y_train = weights_df['mineral']

_, X_test, _, Y_test = train_test_split(X, Y, test_size=.2)

X_test, Y_test

(array([[  0,   0,   0, ...,   0,   0, 734],
        [  0,   2,   0, ...,   0,   0, 774],
        [  0, 350,   0, ...,   0,   2,   2],
        ...,
        [  0,   0,   0, ...,   0,   0, 775],
        [  0, 218,   0, ...,   0,   0,   5],
        [  0,   2,   0, ...,  69,   0,   4]]), 24624         rutile
 23792         rutile
 4122     Fe-num2std9
 12122     Fe3O4std15
 232      CaTiO3std15
 20561        NiSstd2
 12953     Fe3O4std15
 28420      SCOlvstd6
 7162     Fe-num2std9
 9109         FeSstd2
 17089        NiSstd2
 12397     Fe3O4std15
 13709     Fe3O4std15
 27889      SCOlvstd6
 27827      SCOlvstd6
 14260     Fe3O4std15
 3714     Fe-num2std9
 1613     CaTiO3std15
 12228     Fe3O4std15
 10035        FeSstd2
 26649      SCOlvstd6
 15138     Fe3O4std15
 23037         rutile
 16485         Nistd9
 9223         FeSstd2
 4455     Fe-num2std9
 1022     CaTiO3std15
 27725      SCOlvstd6
 6264     Fe-num2std9
 18473        NiSstd2
             ...     
 26485      SCOlvstd6
 14515     F

In [119]:
mod = KNeighborsClassifier(n_neighbors=len(X_train), weights="distance")
mod.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=8, p=2,
           weights='distance')

In [120]:
Y_hat_train = mod.predict(X_train)
Y_hat_test = mod.predict(X_test)
train_acc = (Y_hat_train == Y_train).mean()
test_acc = (Y_hat_test == Y_test).mean()

train_acc, test_acc

(1.0, 0.999657651489216)