In [0]:
import numpy as np
import pandas as pd
from skimage.io import imread, imshow
import periodictable

from pathlib import Path
import re
import json

In [4]:
# Mount Google Drive, but only when the notebook is running on Google Colab.
# On any other kernel the `google.colab` package is not importable, so the
# import is guarded: the cell becomes a no-op instead of aborting
# "Restart & Run All" with an ImportError.
try:
  from google.colab import drive
except ImportError:
  # Not on Colab: there is no Drive to mount.
  pass
else:
  drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [58]:
# Load the mineral-standard table straight from the MineralMapping GitHub
# repository (requires network access). Each row holds per-element readings
# (Mg, Ni, Al, ...) plus a `mineral` label.
df = pd.read_csv(
    'https://raw.githubusercontent.com/HackTheSolarSystem/MineralMapping/master/challenge_data/mineral_standards.csv'
)
# Preview the first five rows (the default for head()).
df.head()

Unnamed: 0,Mg,Ni,Al,Fe,Ca,Cr,P,S,Ti,Si,mineral
0,0,0,0,0,171,0,4,0,459,0,CaTiO3std15
1,0,0,0,0,148,3,2,0,462,1,CaTiO3std15
2,0,2,0,0,141,6,3,0,455,2,CaTiO3std15
3,1,2,2,0,122,6,3,0,502,0,CaTiO3std15
4,0,0,0,0,138,5,5,0,457,1,CaTiO3std15


In [59]:
# Quick check of the periodictable API: element symbol -> mass fraction for TiO2.
{
    str(element): fraction
    for element, fraction in periodictable.formula('TiO2').mass_fraction.items()
}

{'O': 0.40065710228908996, 'Ti': 0.5993428977109101}

In [54]:
def _formula_weights(formula, mineral):
  """Build one weight record for a mineral standard.

  Args:
    formula: Chemical formula string passed to `periodictable.formula`.
    mineral: Label stored under the 'mineral' key.

  Returns:
    dict mapping each element symbol (as str) to its mass fraction, followed
    by the 'mineral_formula' and 'mineral' keys.

  Raises:
    Whatever `periodictable.formula` raises for text it cannot parse.
  """
  fractions = periodictable.formula(formula).mass_fraction
  record = {str(element): fraction for element, fraction in fractions.items()}
  record['mineral_formula'] = formula
  record['mineral'] = mineral
  return record


# One record per mineral label found in df. Labels that cannot be resolved
# automatically are printed and then filled in by hand further down.
rows = []
for filename in df['mineral'].unique():
  try:
    # The leading alphanumeric run of the label is taken as the formula
    # (e.g. 'Fe3O4std15' -> 'Fe3O4', 'Fe-num2std9' -> 'Fe').
    match = re.match(r'([A-Za-z0-9]*).*(std)(.*)', filename)
    if match is None:
      # Label has no 'std' marker (e.g. 'rutile'): nothing to parse.
      print(filename)
      continue
    formula = match.groups()[0]
    rows.append(_formula_weights(formula, filename))
  except Exception:
    # The extracted text is not a parseable formula (e.g. the 'SCOlvstd6'
    # label). Exception is caught instead of a bare `except:` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    print(filename)

# rutile: the label carries no formula, so TiO2 is supplied explicitly.
weights = _formula_weights('TiO2', 'rutile')
rows.append(weights)

# SCOlv: no formula is available ('???'); the mass fractions are typed in by hand.
# NOTE(review): the source of these values is not recorded in this notebook - confirm.
rows.append({
    'mineral': 'SCOlvstd6', 'mineral_formula': '???',
    'Si': .1908, 'Fe': .0742, 'Mg': .298, 'Mn': .0011, 'Ca': .0002, 'Ni': .0029
})

rows

rutile
SCOlvstd6


[{'Ca': 0.29481430479788623,
  'O': 0.3530754020796921,
  'Ti': 0.3521102931224217,
  'mineral': 'CaTiO3std15',
  'mineral_formula': 'CaTiO3'},
 {'Fe': 1.0, 'mineral': 'Fe-num2std9', 'mineral_formula': 'Fe'},
 {'Fe': 0.6352519622341031,
  'S': 0.3647480377658969,
  'mineral': 'FeSstd2',
  'mineral_formula': 'FeS'},
 {'Fe': 0.723591407862219,
  'O': 0.276408592137781,
  'mineral': 'Fe3O4std15',
  'mineral_formula': 'Fe3O4'},
 {'Ni': 1.0, 'mineral': 'Nistd9', 'mineral_formula': 'Ni'},
 {'Ni': 0.6466993688738453,
  'S': 0.35330063112615473,
  'mineral': 'NiSstd2',
  'mineral_formula': 'NiS'},
 {'O': 0.40065710228908996,
  'Ti': 0.5993428977109101,
  'mineral': 'rutile',
  'mineral_formula': 'TiO2'},
 {'Ca': 0.0002,
  'Fe': 0.0742,
  'Mg': 0.298,
  'Mn': 0.0011,
  'Ni': 0.0029,
  'Si': 0.1908,
  'mineral': 'SCOlvstd6',
  'mineral_formula': '???'}]

In [55]:
# Turn the per-mineral records into a table. Elements missing from a record
# become 0 instead of NaN. Columns whose name starts with an uppercase letter
# (the element symbols) get a '_weight' suffix; the lowercase bookkeeping
# columns ('mineral', 'mineral_formula') keep their names.
weights_df = (
    pd.DataFrame.from_records(rows)
    .fillna(0)
    .rename(columns=lambda name: str(name) + '_weight' if str(name)[0].isupper() else str(name))
)
weights_df

Unnamed: 0,Ca_weight,Fe_weight,Mg_weight,Mn_weight,Ni_weight,O_weight,S_weight,Si_weight,Ti_weight,mineral,mineral_formula
0,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3std15,CaTiO3
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fe-num2std9,Fe
2,0.0,0.635252,0.0,0.0,0.0,0.0,0.364748,0.0,0.0,FeSstd2,FeS
3,0.0,0.723591,0.0,0.0,0.0,0.276409,0.0,0.0,0.0,Fe3O4std15,Fe3O4
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Nistd9,Ni
5,0.0,0.0,0.0,0.0,0.646699,0.0,0.353301,0.0,0.0,NiSstd2,NiS
6,0.0,0.0,0.0,0.0,0.0,0.400657,0.0,0.0,0.599343,rutile,TiO2
7,0.0002,0.0742,0.298,0.0011,0.0029,0.0,0.0,0.1908,0.0,SCOlvstd6,???


In [60]:
# Attach each mineral's reference weights (and formula) to every row of df.
# - Columns contributed by weights_df on a previous run of this cell are
#   dropped first, so re-running the cell gives the same frame instead of
#   duplicated `_x` / `_y` columns.
# - validate='many_to_one' makes pandas raise a MergeError if a mineral occurs
#   more than once in weights_df, which would otherwise silently duplicate rows.
df = (
    df.drop(columns=weights_df.columns.difference(['mineral']), errors='ignore')
    .merge(weights_df, on='mineral', validate='many_to_one')
)
df.head()

Unnamed: 0,Mg,Ni,Al,Fe,Ca,Cr,P,S,Ti,Si,...,Ca_weight,Fe_weight,Mg_weight,Mn_weight,Ni_weight,O_weight,S_weight,Si_weight,Ti_weight,mineral_formula
0,0,0,0,0,171,0,4,0,459,0,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
1,0,0,0,0,148,3,2,0,462,1,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
2,0,2,0,0,141,6,3,0,455,2,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
3,1,2,2,0,122,6,3,0,502,0,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
4,0,0,0,0,138,5,5,0,457,1,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3


In [61]:
# Show the column labels currently present in `df`.
df.columns

Index(['Mg', 'Ni', 'Al', 'Fe', 'Ca', 'Cr', 'P', 'S', 'Ti', 'Si', 'mineral',
       'Ca_weight', 'Fe_weight', 'Mg_weight', 'Mn_weight', 'Ni_weight',
       'O_weight', 'S_weight', 'Si_weight', 'Ti_weight', 'mineral_formula'],
      dtype='object')

In [0]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [0]:
# Features: the ten per-element reading columns, as a plain NumPy array.
X = df.loc[:, ['Mg', 'Ni', 'Al', 'Fe', 'Ca', 'Cr', 'P', 'S', 'Ti', 'Si']].values
# Target: the mineral label of each row (kept as a pandas Series, index intact).
Y = df.loc[:, 'mineral']

In [0]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and every score computed from it, repeatable across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)

In [86]:

# Fit a linear-kernel support vector classifier on the training split.
# Alternative estimators, currently disabled:
#mod = RandomForestClassifier()
#mod = SGDClassifier()
mod = SVC(kernel='linear')
# fit() is the last expression, so the notebook displays the fitted estimator it returns.
mod.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [87]:
# Fraction of correctly classified rows: (training split, held-out split).
(
    Y_train.eq(mod.predict(X_train)).mean(),
    Y_test.eq(mod.predict(X_test)).mean(),
)

(0.9999572046047845, 1.0)

In [85]:
# True labels of the held-out rows the classifier got wrong
# (an empty Series when every prediction is correct).
Y_test.loc[Y_test.ne(mod.predict(X_test))]

13772     Fe3O4std15
13448     Fe3O4std15
9539         FeSstd2
2577     CaTiO3std15
5341     Fe-num2std9
11338        FeSstd2
1708     CaTiO3std15
15556         Nistd9
16361         Nistd9
17832        NiSstd2
23271         rutile
11409     Fe3O4std15
14076     Fe3O4std15
9216         FeSstd2
19665        NiSstd2
13570     Fe3O4std15
19011        NiSstd2
20527        NiSstd2
16216         Nistd9
23799         rutile
9591         FeSstd2
21325        NiSstd2
2239     CaTiO3std15
8505         FeSstd2
1050     CaTiO3std15
16052         Nistd9
16947        NiSstd2
16928        NiSstd2
8575         FeSstd2
4498     Fe-num2std9
            ...     
6580     Fe-num2std9
153      CaTiO3std15
1904     CaTiO3std15
24078         rutile
529      CaTiO3std15
683      CaTiO3std15
5736     Fe-num2std9
10526        FeSstd2
1638     CaTiO3std15
16470         Nistd9
3685     Fe-num2std9
1936     CaTiO3std15
16798         Nistd9
11658     Fe3O4std15
5210     Fe-num2std9
16834         Nistd9
12580     Fe3

In [77]:
# Inspect one row of df by its index label.
# NOTE(review): 4602 is hard-coded - presumably a row picked from an earlier list of misclassified samples; confirm.
df.loc[4602]

Mg                           0
Ni                           5
Al                           0
Fe                         271
Ca                           0
Cr                           6
P                            2
S                            0
Ti                           3
Si                           1
mineral            Fe-num2std9
Ca_weight                    0
Fe_weight                    1
Mg_weight                    0
Mn_weight                    0
Ni_weight                    0
O_weight                     0
S_weight                     0
Si_weight                    0
Ti_weight                    0
mineral_formula             Fe
Name: 4602, dtype: object

In [78]:
# Column-wise mean over the rows whose formula is 'Fe'.
# numeric_only=True skips the text columns ('mineral', 'mineral_formula');
# without it, recent pandas releases raise a TypeError instead of silently
# dropping those columns.
df[df['mineral_formula'] == 'Fe'].mean(numeric_only=True)

Mg             0.127520
Ni             1.346949
Al             0.471066
Fe           316.354543
Ca             0.398010
Cr             8.184080
P              1.607489
S              0.034826
Ti             4.253993
Si             1.859649
Ca_weight      0.000000
Fe_weight      1.000000
Mg_weight      0.000000
Mn_weight      0.000000
Ni_weight      0.000000
O_weight       0.000000
S_weight       0.000000
Si_weight      0.000000
Ti_weight      0.000000
dtype: float64