In [8]:
import numpy as np
import pandas as pd
from skimage.io import imread, imshow
!pip install periodictable
import periodictable

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

from pathlib import Path
import re
import json



In [0]:
from sklearn.linear_model import LinearRegression

In [2]:
# Mount Google Drive, but only when the notebook really runs on Google Colab.
# Outside Colab the `google.colab` package is not installed, so the import is
# guarded to keep "Restart & Run All" working in a local Jupyter kernel.
try:
  from google.colab import drive
except ImportError:
  drive = None  # not on Colab: nothing to mount

if drive is not None:
  # force_remount=True re-mounts even if a mount is already present.
  drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [5]:
# Measured element intensities for the mineral standards; the CSV is read
# straight from the challenge repository on GitHub.
STANDARDS_CSV_URL = 'https://raw.githubusercontent.com/HackTheSolarSystem/MineralMapping/master/challenge_data/mineral_standards.csv'

standards_df = pd.read_csv(STANDARDS_CSV_URL)
standards_df.head(5)

Unnamed: 0,Mg,Ni,Al,Fe,Ca,Cr,P,S,Ti,Si,mineral
0,0,0,0,0,171,0,4,0,459,0,CaTiO3std15
1,0,0,0,0,148,3,2,0,462,1,CaTiO3std15
2,0,2,0,0,141,6,3,0,455,2,CaTiO3std15
3,1,2,2,0,122,6,3,0,502,0,CaTiO3std15
4,0,0,0,0,138,5,5,0,457,1,CaTiO3std15


In [7]:
# Get expected standards weights: the theoretical element mass fractions of
# every standard, merged onto the measured intensities as `<Element>_weight`
# columns (plus a `mineral_formula` column).

def _formula_mass_fractions(formula):
  """Return {element symbol: mass fraction} for a chemical formula string."""
  fractions = periodictable.formula(formula).mass_fraction
  return dict((str(e), w) for e, w in fractions.items())


rows = []
for filename in standards_df['mineral'].unique():
  try:
    # The leading alphanumeric run before 'std' is taken as the formula.
    match = re.match(r'([A-Za-z0-9]*).*(std)(.*)', filename)
    if match is None:
      raise ValueError('standard name has no "std" marker: %r' % (filename,))
    formula = match.groups()[0]
    weights = _formula_mass_fractions(formula)
  except Exception:
    # Best effort: report standards whose name cannot be turned into a formula
    # so they can be given a hand-written row below. Catching `Exception`
    # (not a bare `except:`) lets KeyboardInterrupt / SystemExit propagate.
    print(filename)
    continue
  weights['mineral_formula'] = formula
  weights['mineral'] = filename
  rows.append(weights)

# rutile: the name carries no formula, so the formula is given explicitly.
weights = _formula_mass_fractions('TiO2')
weights['mineral_formula'] = 'TiO2'
weights['mineral'] = 'rutile'
rows.append(weights)

# SCOlv: not a parseable formula; mass fractions are entered by hand.
rows.append({
    'mineral': 'SCOlvstd6', 'mineral_formula': '???',
    'Si': .1908, 'Fe': .0742, 'Mg': .298, 'Mn': .0011, 'Ca': .0002, 'Ni': .0029
})

# Elements absent from a standard get mass fraction 0.
weights_df = pd.DataFrame.from_records(rows).fillna(0)
# Element symbols start with an upper-case letter; the bookkeeping columns
# ('mineral', 'mineral_formula') do not and keep their names.
weights_df.columns = [str(i) + '_weight' if str(i)[0].isupper() else str(i) for i in weights_df.columns]

# Drop columns left behind by an earlier run of this cell, so that re-running
# it does not produce suffixed duplicates (`Ca_weight_x`, `Ca_weight_y`, ...).
stale_columns = [c for c in weights_df.columns
                 if c != 'mineral' and c in standards_df.columns]
standards_df = standards_df.drop(stale_columns, axis=1).merge(weights_df, on='mineral')
standards_df.head()

rutile
SCOlvstd6


Unnamed: 0,Mg,Ni,Al,Fe,Ca,Cr,P,S,Ti,Si,...,Ca_weight,Fe_weight,Mg_weight,Mn_weight,Ni_weight,O_weight,S_weight,Si_weight,Ti_weight,mineral_formula
0,0,0,0,0,171,0,4,0,459,0,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
1,0,0,0,0,148,3,2,0,462,1,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
2,0,2,0,0,141,6,3,0,455,2,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
3,1,2,2,0,122,6,3,0,502,0,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3
4,0,0,0,0,138,5,5,0,457,1,...,0.294814,0.0,0.0,0.0,0.0,0.353075,0.0,0.0,0.35211,CaTiO3


In [36]:
# Fit, for each element, a linear calibration on the standards:
#   measured intensity = coef * expected mass fraction + intercept
# and record two spread estimates of the measured intensity:
#   'std'   - rows where the element's expected mass fraction is > 0.01
#   'noise' - rows where the element's expected mass fraction is exactly 0
elements = {}

for col in ['Mg', 'Ni', 'Al', 'Fe', 'Ca', 'Cr', 'P', 'S', 'Ti', 'Si']:
  weight_col = '%s_weight' % col
  if weight_col not in standards_df.columns:
    # No expected-weight column for this element, so there is nothing to fit
    # against. Skipping explicitly replaces the former bare `except: pass`,
    # which would also have hidden genuine errors.
    continue

  # scikit-learn expects a 2-D feature matrix. `Series.reshape` is deprecated
  # (and removed in later pandas releases), so reshape the underlying ndarray.
  x = standards_df[weight_col].values.reshape(-1, 1)
  y = standards_df[col]

  m = LinearRegression()
  m.fit(x, y)
  print(col, m.coef_[0], m.intercept_)

  elements[col] = {
      'element': col,
      'coef': m.coef_[0],
      'intercept': m.intercept_,
      'std': standards_df[standards_df[weight_col] > .01][col].std(),
      'noise': standards_df[standards_df[weight_col] == 0][col].std(),
  }

Mg 840.1532511324668 0.1436064531510226
Ni 302.0864913360913 0.5339756727391531
Fe 311.4356452476751 -0.18536195985906545
Ca 461.2234598579972 0.28230528466584914
S 210.310447449412 0.01002271126765919
Ti 1279.045019042139 3.850835728870237
Si 1012.3924348641195 0.8699038385122613


  """
  """
  """
  """
  """
  """
  """


In [37]:
# Display the fitted per-element calibration (coef, intercept, std, noise).
elements

{'Ca': {'coef': 461.2234598579972,
  'element': 'Ca',
  'intercept': 0.28230528466584914,
  'noise': 0.7839330547099527,
  'std': 11.257305019123955},
 'Fe': {'coef': 311.4356452476751,
  'element': 'Fe',
  'intercept': -0.18536195985906545,
  'noise': 1.1155958908410166,
  'std': 106.4856160349156},
 'Mg': {'coef': 840.1532511324668,
  'element': 'Mg',
  'intercept': 0.1436064531510226,
  'noise': 0.4296300849187811,
  'std': 13.785308695058726},
 'Ni': {'coef': 302.0864913360913,
  'element': 'Ni',
  'intercept': 0.5339756727391531,
  'noise': 1.329300610782946,
  'std': 52.85466500629178},
 'S': {'coef': 210.310447449412,
  'element': 'S',
  'intercept': 0.01002271126765919,
  'noise': 0.16686134572121558,
  'std': 8.823133419855987},
 'Si': {'coef': 1012.3924348641195,
  'element': 'Si',
  'intercept': 0.8699038385122613,
  'noise': 1.2343527334863502,
  'std': 12.817817389598126},
 'Ti': {'coef': 1279.045019042139,
  'element': 'Ti',
  'intercept': 3.850835728870237,
  'noise': 2.

In [0]:
# Minerals to simulate. Each entry has a 'name' and a 'type':
#   'formula'  - 'formula' is a chemical formula string; its mass fractions are
#                taken directly from periodictable.
#   'variable' - 'formula' is a list of components:
#       {'type': 'choice', 'elements': [(symbol, (low, high)), (symbol, (low, high))],
#        'quantity': q}
#           get_variable_percents draws a fraction uniformly from the FIRST
#           element's (low, high) interval and gives the second element the
#           remainder (1 - fraction); both are scaled by 'quantity'.
#       {'type': 'fixed', 'elements': '<formula string>'}
#           always present, contributes the element masses of that formula.
# NOTE(review): 'FeNiS' is used for Pentlandite here - confirm that this
# 1:1:1 stoichiometry is the intended approximation.

targets = [
    {'type': 'formula', 'formula': 'NiS', 'name': 'Millerite'},
    {'type': 'formula', 'formula': 'FeS', 'name': 'Troilite'},
    {'type': 'formula', 'formula': 'FeNiS', 'name': 'Pentlandite'},
    {'type': 'variable', 'name': 'Olivine', 'formula': [
        {'type': 'choice', 'elements': [('Fe', (0, 1)), ('Mg', (0, 1))], 'quantity': 2},
        {'type': 'fixed', 'elements': 'SiO4'}
    ]},
    {'type': 'variable', 'name': 'Pyroxene', 'formula': [
        {'type': 'choice', 'elements': [('Fe', (0, 1)), ('Mg', (0, 1))], 'quantity': 1},
        {'type': 'fixed', 'elements': 'SiO3'}
    ]},
    {'type': 'variable', 'name': 'Taenite', 'formula': [
        {'type': 'choice', 'elements': [('Fe', (.5, .6)), ('Ni', (.4, .5))], 'quantity': 1},
    ]},
    {'type': 'variable', 'name': 'Kamacite', 'formula': [
        {'type': 'choice', 'elements': [('Fe', (.9, .9)), ('Ni', (.1, .1))], 'quantity': 1},
    ]}

]

In [48]:
# Sanity check: the Fe mass fraction periodictable reports for FeS should equal
# Fe.mass / (Fe.mass + S.mass), computed by hand as the last tuple item.
periodictable.elements.Fe.mass, periodictable.elements.S.mass, periodictable.formula('FeS').mass_fraction, 55.845/(55.845 + 32.065)

(55.845,
 32.065,
 {Fe: 0.6352519622341031, S: 0.3647480377658969},
 0.6352519622341031)

In [73]:
# Check that a degenerate interval (low == high), as used by the Kamacite
# target, is accepted by np.random.uniform.
np.random.uniform(.9,.9)

0.9

In [58]:
# Scratch: parse FeS and look up its mass fractions. The second line is not
# the last expression of the cell, so its value is not displayed.
q = periodictable.formula('FeS')
q.mass_fraction

def get_formula_weights(formula):
  """Return {element symbol: formula mass * mass fraction} for `formula`.

  `formula` is a chemical formula string understood by periodictable. Each
  value is the share of the formula's total mass carried by that element.
  """
  parsed = periodictable.formula(formula)
  total_mass = parsed.mass
  return {
      str(element): total_mass * fraction
      for element, fraction in parsed.mass_fraction.items()
  }

# Example: per-element mass contributions for FeS.
get_formula_weights('FeS')

{'Fe': 55.845, 'S': 32.065}

In [75]:
def to_dict(d):
  """Copy mapping `d` into a plain dict keyed by `str()` of each original key."""
  return {str(key): value for key, value in d.items()}

def get_formula_weights(formula):
  """Return, per element symbol, formula mass times that element's mass fraction.

  `formula` is a chemical formula string parsed by periodictable.
  """
  compound = periodictable.formula(formula)
  formula_mass = compound.mass
  per_element = {}
  for element, share in compound.mass_fraction.items():
    per_element[str(element)] = formula_mass * share
  return per_element

def get_formula_percents(formula):
  """Return {element symbol: mass fraction} for a fixed chemical formula string.

  Despite the name, the values are periodictable's `mass_fraction` values,
  not percentages.
  """
  fractions = periodictable.formula(formula).mass_fraction
  return to_dict(fractions)

def get_variable_percents(formula):
  """Draw one random composition for a variable-composition mineral.

  `formula` is a list of component dicts:
    * 'choice' components hold two (symbol, (low, high)) pairs and a
      'quantity'. A fraction is drawn uniformly from the FIRST pair's
      interval; the first element contributes
      atomic mass * fraction * quantity and the second
      atomic mass * (1 - fraction) * quantity. The second pair's interval is
      not read.
    * 'fixed' components hold a formula string under 'elements' and contribute
      the per-element masses returned by get_formula_weights.
  Components of any other type are ignored.

  Returns {element symbol: accumulated mass / total accumulated mass}.
  """
  masses = {}

  def accumulate(symbol, amount):
    # First sighting creates the entry; later sightings add to it.
    if symbol not in masses:
      masses[symbol] = amount
    else:
      masses[symbol] += amount

  for part in formula:
    kind = part['type']
    if kind == 'fixed':
      for symbol, amount in get_formula_weights(part['elements']).items():
        accumulate(symbol, amount)
    elif kind == 'choice':
      pairs = part['elements']
      low_high = pairs[0][1]
      share = np.random.uniform(*low_high)
      accumulate(
          pairs[0][0],
          periodictable.formula(pairs[0][0]).mass * share * part['quantity'],
      )
      accumulate(
          pairs[1][0],
          periodictable.formula(pairs[1][0]).mass * (1 - share) * part['quantity'],
      )

  grand_total = sum(masses.values())
  return {symbol: amount / grand_total for symbol, amount in masses.items()}
      

def simulate_mineral(name, weights, calibration=None):
  """Simulate one measurement (integer element intensities) of a mineral.

  Args:
    name: label stored under the 'mineral' key of the result.
    weights: {element symbol: mass fraction} of the mineral.
    calibration: {symbol: {'element', 'coef', 'intercept', 'noise', 'std'}}.
      Defaults to the notebook-level `elements` dict, so existing two-argument
      calls behave as before.

  Returns:
    dict with 'mineral' plus, for every calibrated element, a non-negative int:
    intercept + noise draw, and for elements present in `weights` additionally
    coef * mass fraction + a draw with the element's 'std'.
  """
  if calibration is None:
    calibration = elements

  def draw(scale):
    # A spread that could not be estimated (NaN, e.g. pandas .std() over fewer
    # than two rows) would turn the intensity into NaN and make int() raise;
    # treat it as "no spread" instead.
    if not np.isfinite(scale):
      return 0.0
    return np.random.normal(scale=scale)

  intensities = {'mineral': name}
  for e in calibration.values():
    v = e['intercept']
    v += draw(e['noise'])
    if e['element'] in weights:
      v += e['coef'] * weights[e['element']]
      v += draw(e['std'])

    # Intensities are counts: clip negatives to 0 and truncate to int.
    intensities[e['element']] = int(np.clip(v, 0, None))

  return intensities
  
# Smoke test: print one simulated measurement for each fixed-formula target and
# five (each with a freshly drawn composition) for each variable target.
for target in targets:
  kind = target['type']
  if kind == 'formula':
    print(simulate_mineral(target['name'], get_formula_percents(target['formula'])))
  elif kind == 'variable':
    for _ in range(5):
      composition = get_variable_percents(target['formula'])
      print(simulate_mineral(target['name'], composition))


{'mineral': 'Millerite', 'Mg': 0, 'Ni': 239, 'Fe': 0, 'Ca': 0, 'S': 71, 'Ti': 4, 'Si': 2}
{'mineral': 'Troilite', 'Mg': 0, 'Ni': 1, 'Fe': 234, 'Ca': 0, 'S': 67, 'Ti': 5, 'Si': 0}
{'mineral': 'Pentlandite', 'Mg': 0, 'Ni': 142, 'Fe': 236, 'Ca': 0, 'S': 46, 'Ti': 7, 'Si': 0}
{'mineral': 'Olivine', 'Mg': 236, 'Ni': 0, 'Fe': 0, 'Ca': 0, 'S': 0, 'Ti': 5, 'Si': 218}
{'mineral': 'Olivine', 'Mg': 55, 'Ni': 0, 'Fe': 0, 'Ca': 0, 'S': 0, 'Ti': 1, 'Si': 160}
{'mineral': 'Olivine', 'Mg': 155, 'Ni': 2, 'Fe': 172, 'Ca': 0, 'S': 0, 'Ti': 1, 'Si': 152}
{'mineral': 'Olivine', 'Mg': 151, 'Ni': 0, 'Fe': 53, 'Ca': 0, 'S': 0, 'Ti': 0, 'Si': 157}
{'mineral': 'Olivine', 'Mg': 213, 'Ni': 0, 'Fe': 203, 'Ca': 0, 'S': 0, 'Ti': 0, 'Si': 147}
{'mineral': 'Pyroxene', 'Mg': 83, 'Ni': 0, 'Fe': 194, 'Ca': 0, 'S': 0, 'Ti': 4, 'Si': 248}
{'mineral': 'Pyroxene', 'Mg': 83, 'Ni': 0, 'Fe': 0, 'Ca': 0, 'S': 0, 'Ti': 0, 'Si': 246}
{'mineral': 'Pyroxene', 'Mg': 153, 'Ni': 0, 'Fe': 0, 'Ca': 1, 'S': 0, 'Ti': 2, 'Si': 292}
{'minera

In [0]:
# Build the simulated data set.
#   N: measurements simulated per fixed-formula target.
#   M: repeated measurements per drawn composition of a variable target;
#      int(N/M) compositions are drawn for each variable target.
samples = []

N = 10000
M = 10

for target in targets:
  if target['type'] == 'formula':
    samples.extend(
        simulate_mineral(target['name'], get_formula_percents(target['formula']))
        for _ in range(N)
    )
  elif target['type'] == 'variable':
    for _ in range(int(N/M)):
      percents = get_variable_percents(target['formula'])
      samples.extend(
          simulate_mineral(target['name'], percents) for _ in range(M)
      )

In [83]:
# Collect the simulated records (one dict per simulated measurement) into a
# DataFrame and preview the first rows.
samples_df = pd.DataFrame.from_records(samples)
samples_df.head()

Unnamed: 0,Ca,Fe,Mg,Ni,S,Si,Ti,mineral
0,0,0,0,268,69,0,4,Millerite
1,0,0,0,109,74,0,4,Millerite
2,0,1,0,159,78,0,2,Millerite
3,0,0,0,323,76,0,3,Millerite
4,0,1,0,240,72,0,4,Millerite


In [0]:
# Write the simulated samples to a CSV in the working directory, without the
# row index.
samples_df.to_csv('simulated_mineral_samples.csv', index=False)

In [0]:
# Features: simulated intensities of the calibrated elements; labels: mineral name.
X = samples_df[['Ca', 'Fe', 'Mg', 'Ni', 'S', 'Si', 'Ti']].values
Y = samples_df['mineral']

# 80/20 split. A fixed random_state makes the split - and every score computed
# on it - reproducible across runs for the same `samples_df`.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)

In [86]:
# Fit a linear-kernel SVM on the training split and report the accuracy on
# both splits as (train, test).
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt;
# training on the full split appears to be too slow - consider fitting on a
# subsample or using a solver built for large linear problems.
mod = SVC(kernel='linear')
mod.fit(X_train, Y_train)

train_accuracy = (mod.predict(X_train) == Y_train).mean()
test_accuracy = (mod.predict(X_test) == Y_test).mean()
train_accuracy, test_accuracy

KeyboardInterrupt: ignored

In [40]:
def simulate_mineral(name, weights, calibration=None):
  """Simulate one measurement (integer element intensities) of a mineral.

  Args:
    name: label stored under the 'mineral' key of the result.
    weights: {element symbol: mass fraction} of the mineral.
    calibration: {symbol: {'element', 'coef', 'intercept', 'noise', 'std'}}.
      Defaults to the notebook-level `elements` dict, so existing two-argument
      calls behave as before.

  Returns:
    dict with 'mineral' plus, for every calibrated element, a non-negative int:
    intercept + noise draw, and for elements present in `weights` additionally
    coef * mass fraction + a draw with the element's 'std'.
  """
  if calibration is None:
    calibration = elements

  def draw(scale):
    # A spread that could not be estimated (NaN, e.g. pandas .std() over fewer
    # than two rows) would turn the intensity into NaN and make int() raise;
    # treat it as "no spread" instead.
    if not np.isfinite(scale):
      return 0.0
    return np.random.normal(scale=scale)

  intensities = {'mineral': name}
  for e in calibration.values():
    v = e['intercept']
    v += draw(e['noise'])
    if e['element'] in weights:
      v += e['coef'] * weights[e['element']]
      v += draw(e['std'])

    # Intensities are counts: clip negatives to 0 and truncate to int.
    intensities[e['element']] = int(np.clip(v, 0, None))

  return intensities

# Example: simulate one measurement of NiS from its theoretical mass fractions.
simulate_mineral('NiS', to_dict(periodictable.formula('NiS').mass_fraction))

{'Ca': 0,
 'Fe': 0,
 'Mg': 0,
 'Ni': 280,
 'S': 69,
 'Si': 1,
 'Ti': 2,
 'mineral': 'NiS'}

{Ni: 0.6466993688738453, S: 0.35330063112615473}