In [1]:
from citrination_client import CitrinationClient
from citrination_client import PifSystemReturningQuery
from citrination_client import DatasetQuery
from citrination_client import DataQuery
from citrination_client import Filter

from pypif.pif import dumps
import json 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [26]:
import saxskit

from saxskit.saxskit.saxs_classify import SaxsClassifier
from saxskit.saxskit.saxs_regression import SaxsRegressor
from saxskit.saxskit.saxs_math import profile_spectrum
from saxskit.saxskit.saxs_models import get_data_from_Citrination

from os import listdir
from os.path import isfile, join

import sklearn

I created a dataview, 'unidentified yes/no', on Citrination that should predict the 'unidentified' label given 13 features (the profile keys).

In [3]:
# Read the Citrination API key from the first line of a local text file so
# that the key itself never appears in the notebook.
with open("citrination_api_key_ssrl.txt", "r") as key_file:
    api_key = key_file.readline()

# readline() keeps the trailing newline; strip it before using the key.
a_key = api_key.strip()

# Client for the SLAC Citrination deployment, used by every predict/query call below.
client = CitrinationClient(
    site='https://slac.citrination.com',
    api_key=a_key,
)

# Using dataview 'profile_keys':

#### Make a prediction for a sample from user machine:

In [4]:
# Load one measured spectrum from a local comma-separated file and compute
# its numerical profile with saxskit's profile_spectrum. The displayed result
# is the mapping of named features that is sent to the dataview further down.
q_i = np.genfromtxt(
    'my_data/A_210C_0212114344_0001_dz_bgsub.csv',
    delimiter=",",
)
features = profile_spectrum(q_i)
features

OrderedDict([('Imax_over_Imean', 4.517864065692117),
             ('Imax_sharpness', 1.099622261523832),
             ('I_fluctuation', 0.013731779181704483),
             ('logI_fluctuation', 16.976297914971699),
             ('logI_max_over_std', 4.3019224595005117),
             ('r_fftIcentroid', 0.12748515104545091),
             ('r_fftImax', 0.0017857142857142857),
             ('q_Icentroid', 0.24748247141186733),
             ('q_logIcentroid', 0.22692825356325344),
             ('pearson_q', -0.79559437880058759),
             ('pearson_q2', -0.75112702457148206),
             ('pearson_expq', -0.78642690663881609),
             ('pearson_invexpq', 0.80268675204651729)])

We need to add the prefix "Property " to the name of each feature:

In [45]:
# Build the prediction payload: same values as `features`, but every key is
# prefixed with "Property ".
inputs = {"Property " + name: value for name, value in features.items()}
inputs

{'Property I_fluctuation': 0.013731779181704483,
 'Property Imax_over_Imean': 4.517864065692117,
 'Property Imax_sharpness': 1.099622261523832,
 'Property logI_fluctuation': 16.976297914971699,
 'Property logI_max_over_std': 4.3019224595005117,
 'Property pearson_expq': -0.78642690663881609,
 'Property pearson_invexpq': 0.80268675204651729,
 'Property pearson_q': -0.79559437880058759,
 'Property pearson_q2': -0.75112702457148206,
 'Property q_Icentroid': 0.24748247141186733,
 'Property q_logIcentroid': 0.22692825356325344,
 'Property r_fftIcentroid': 0.12748515104545091,
 'Property r_fftImax': 0.0017857142857142857}

In [48]:
# Send the single-sample payload to dataview "24" and keep the whole response
# in `resp`; the following cells read the other labels from it.
resp = client.predict("24", inputs)

# Only one sample was submitted, so its result is the first candidate.
first_candidate = resp['candidates'][0]
prediction = first_candidate['Property unidentified']
prediction

['0', 0.006533241221533759]

In [49]:
# Read the 'guinier_porod' entry of the first candidate from the response
# stored in `resp`. The displayed value is a two-item list; the first item is
# the predicted label, the second is presumably an uncertainty — TODO confirm.
prediction = resp['candidates'][0]['Property guinier_porod']
prediction

['0', 0.04612312787841481]

In [50]:
# Read the 'spherical_normal' entry of the first candidate from the response
# stored in `resp`. The displayed value is a two-item list; the first item is
# the predicted label, the second is presumably an uncertainty — TODO confirm.
prediction = resp['candidates'][0]['Property spherical_normal']
prediction

['1', 0.00940940308071245]

In [51]:
# Read the 'diffraction_peaks' entry of the first candidate from the response
# stored in `resp`. The displayed value is a two-item list; the first item is
# the predicted label, the second is presumably an uncertainty — TODO confirm.
prediction = resp['candidates'][0]['Property diffraction_peaks']
prediction

['0', 0.006040956076299953]

## Make predictions for all our samples and compare with true labels:

In [7]:
# Citrination dataset ids to download; the records come back as one DataFrame
# holding the profile features together with the labels and fit parameters.
dataset_ids = [1, 15, 16]
data = get_data_from_Citrination(client, dataset_ids)
data.head()

Unnamed: 0,experiment_id,Imax_over_Imean,Imax_sharpness,I_fluctuation,logI_fluctuation,logI_max_over_std,r_fftIcentroid,r_fftImax,q_Icentroid,q_logIcentroid,...,guinier_porod,spherical_normal,diffraction_peaks,I0_floor,G_gp,rg_gp,D_gp,I0_sphere,r0_sphere,sigma_sphere
171,R1,18.8777,1.03372,0.00109968,4.19634,2.87545,0.107935,0.00185529,0.064653,-0.674134,...,0,1,0,0.185712,,,,1191.61,33.9117,0.0335728
42,R1,15.3204,2.53745,0.00278489,24.9187,2.97657,0.105759,0.00185529,0.0757829,-0.140669,...,0,1,1,,,,,0.0,,0.0
69,R1,18.7701,1.0313,0.00109659,5.24908,3.03462,0.107702,0.00185529,0.0649956,-0.258935,...,0,1,0,0.250425,,,,1394.91,33.6123,0.0336855
1643,Reaction_C,4.25667,1.05704,0.0187776,18.3502,5.05902,0.144492,0.0071599,0.300745,0.289633,...,1,0,0,7.73547e-17,1.81139,1.83064,4.0,,,
11,R1,1.28889,1.21086,0.0205457,36.719,9.75636,0.179186,0.00185529,0.322812,0.324378,...,0,0,0,,,,,,,


In [8]:
# Names of the 13 numerical profile features used as model inputs, in the
# order produced by profile_spectrum. Line continuations are unnecessary
# inside brackets, so the list is written without backslashes.
profile_keys = [
    'Imax_over_Imean',
    'Imax_sharpness',
    'I_fluctuation',
    'logI_fluctuation',
    'logI_max_over_std',
    'r_fftIcentroid',
    'r_fftImax',
    'q_Icentroid',
    'q_logIcentroid',
    'pearson_q',
    'pearson_q2',
    'pearson_expq',
    'pearson_invexpq',
]

In [9]:
# Build one prediction payload per row of `data`: a dict mapping
# "Property <feature>" to that row's value for every profile feature.
inputs = []

for pos in range(len(data)):
    # Materialise the row once; `data.iloc[pos][key]` inside the inner loop
    # would rebuild the row Series for every single feature.
    row = data.iloc[pos]
    inputs.append({"Property " + key: row[key] for key in profile_keys})

### Using Citrination Models:

In [20]:
# Query dataview "24" on Citrination once per sample and collect the raw
# prediction for each of the four labels. The raw entries are unpacked into
# plain integer labels in the next cell.
unidentified, guinier_porod, spherical_normal, diffraction_peaks = [], [], [], []

for pos in range(len(data)):
    response = client.predict("24", inputs[pos])  # "24" is the ID of the dataview on Citrination
    candidate = response['candidates'][0]
    unidentified.append(candidate['Property unidentified'])
    guinier_porod.append(candidate['Property guinier_porod'])
    spherical_normal.append(candidate['Property spherical_normal'])
    diffraction_peaks.append(candidate['Property diffraction_peaks'])

In [21]:
def _leading_values(predictions, cast):
    """Return ``cast`` applied to the leading value of every prediction.

    Each raw prediction is a two-item list whose first item is the predicted
    value. Items that are not a list/tuple are assumed to be already unpacked
    and are passed to ``cast`` directly, so re-running this cell on converted
    lists is harmless. An empty input yields an empty list.
    """
    return [
        cast(item[0]) if isinstance(item, (list, tuple)) else cast(item)
        for item in predictions
    ]


# Replace the raw predictions by plain integer labels (names kept because the
# accuracy cells below read them).
unidentified = _leading_values(unidentified, int)
guinier_porod = _leading_values(guinier_porod, int)
spherical_normal = _leading_values(spherical_normal, int)
diffraction_peaks = _leading_values(diffraction_peaks, int)

In [27]:
# Fraction of samples whose Citrination 'unidentified' prediction matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["unidentified"], y_pred=unidentified)

0.99075025693730734

In [23]:
# Fraction of samples whose Citrination 'guinier_porod' prediction matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["guinier_porod"], y_pred=guinier_porod)

0.96402877697841727

In [24]:
# Fraction of samples whose Citrination 'spherical_normal' prediction matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["spherical_normal"], y_pred=spherical_normal)

0.94964028776978415

In [25]:
# Fraction of samples whose Citrination 'diffraction_peaks' prediction matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["diffraction_peaks"], y_pred=diffraction_peaks)

0.9928057553956835

### Using Saxs_kit Models:

In [14]:
from collections import OrderedDict

# Build one ordered feature mapping per row of `data` for the saxskit
# classifier. Unlike the Citrination payloads, the keys are the bare profile
# feature names, kept in `profile_keys` order.
inputs2 = []

for pos in range(len(data)):
    # Materialise the row once; `data.iloc[pos][key]` inside the inner loop
    # would rebuild the row Series for every single feature.
    row = data.iloc[pos]
    inputs2.append(OrderedDict((key, row[key]) for key in profile_keys))

In [36]:
# Classify every sample with the saxskit classifier and collect one flag per
# label. When a sample is flagged as unidentified, the three population flags
# are recorded as 0 without reading them from the classifier output.
m = SaxsClassifier()

saxskit_unidentified = []
saxskit_guinier_porod = []
saxskit_spherical_normal = []
saxskit_diffraction_peaks = []

for pos in range(len(data)):
    flags = m.run_classifier(inputs2[pos])
    unidentified_flag = flags['unidentified'][0]
    saxskit_unidentified.append(unidentified_flag)

    identified = unidentified_flag == 0
    saxskit_guinier_porod.append(flags['guinier_porod'][0] if identified else 0)
    saxskit_spherical_normal.append(flags['spherical_normal'][0] if identified else 0)
    saxskit_diffraction_peaks.append(flags['diffraction_peaks'][0] if identified else 0)

In [38]:
# Fraction of samples whose saxskit 'unidentified' flag matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["unidentified"], y_pred=saxskit_unidentified)

0.99023638232271327

In [39]:
# Fraction of samples whose saxskit 'guinier_porod' flag matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["guinier_porod"], y_pred=saxskit_guinier_porod)

0.85611510791366907

In [40]:
# Fraction of samples whose saxskit 'spherical_normal' flag matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["spherical_normal"], y_pred=saxskit_spherical_normal)

0.99640287769784175

In [41]:
# Fraction of samples whose saxskit 'diffraction_peaks' flag matches the stored label.
sklearn.metrics.accuracy_score(y_true=data["diffraction_peaks"], y_pred=saxskit_diffraction_peaks)

0.98715313463514898

### Accuracy of Citrination Models vs Saxs_kit Models

| model           | Citrination (training) | saxs_kit (training) | saxs_kit (keep two experiments out) |
| ------------- | ------------- | ------------- | ------------- |
| unidentified  | 0.99  | 0.99 | 0.98
| guinier_porod  | **0.96**  | 0.85 | 0.82
| spherical_normal | 0.94 | **0.99** | 0.98
| diffraction_peaks | 0.99 | 0.98 | 0.97

Unfortunately, we are not able to test Citrination models using the "keep two experiments out" approach. While training the saxs_kit models we found that this approach is very helpful for preventing overfitting, especially for the "guinier_porod" labels. Samples from the same experiment are usually very similar, so if we split the samples into training and testing sets randomly, there is a very high chance that nearly identical samples end up in both sets. In that case the model works well on the training and testing data but performs badly on new data. Thus, the "keep two experiments out" approach gives us a more realistic estimate of accuracy.

## Citrination models
## Using dataview r0_sphere

In [87]:
# r0_sphere is only meaningful for spherical populations, so keep only the
# rows whose stored `spherical_normal` label equals 1. Note that this filters
# on the label column of `data`, not on the saxskit classifier output.
is_spherical = data["spherical_normal"] == 1
data_sphere = data[is_spherical]

In [86]:
# Display (number of rows, number of columns) of the spherical subset.
data_sphere.shape

(1382, 32)

In [90]:
# Keep only the spherical samples that actually have an r0_sphere value;
# rows where it is missing cannot be scored against a prediction.
has_r0 = data_sphere["r0_sphere"].notnull()
data_r0 = data_sphere[has_r0]
data_r0.shape

(726, 32)

In [91]:
# Build one prediction payload per row of `data_r0`: a dict mapping
# "Property <feature>" to that row's value for every profile feature.
inputs = []

for pos in range(len(data_r0)):
    # Materialise the row once; `data_r0.iloc[pos][key]` inside the inner loop
    # would rebuild the row Series for every single feature.
    row = data_r0.iloc[pos]
    inputs.append({"Property " + key: row[key] for key in profile_keys})

In [100]:
# Query the r0_sphere dataview once per sample and collect the raw predictions;
# they are unpacked into plain floats in the next cell.
r0_sphere = []

for pos in range(len(data_r0)):
    response = client.predict("27", inputs[pos])  # "27" is the ID of the dataview on Citrination
    candidate = response['candidates'][0]
    r0_sphere.append(candidate['Property r0_sphere'])

In [101]:
# Keep only the predicted value (first item) of every raw prediction, as float.
# Items that are not a list/tuple are assumed to be already unpacked, so
# re-running this cell is harmless; an empty list stays empty instead of
# raising on the 2-D numpy index.
r0_sphere = [
    float(item[0]) if isinstance(item, (list, tuple)) else float(item)
    for item in r0_sphere
]

In [102]:
# Standard deviation of the true r0_sphere values; used below to express the
# mean absolute error relative to the spread of the labels.
label_std = data_r0["r0_sphere"].std()

In [103]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the Citrination r0_sphere predictions. Arguments are
# passed by keyword in the documented (y_true, y_pred) roles; the value is the
# same either way because the absolute error is symmetric.
scores = mean_absolute_error(y_true=data_r0["r0_sphere"], y_pred=r0_sphere)

# Error relative to the standard deviation of the true labels.
scores / label_std

0.020377369240147946

## Using dataview sigma_sphere

In [106]:
# Extra features used by the sigma_sphere dataview in addition to the
# standard profile features.
additional_fatures = [
    'q_at_Iq4_min1',
    'pIq4_qwidth',
    'pI_qvertex',
    'pI_qwidth',
]

# Full input feature list: profile features first, then the extra ones.
features = list(profile_keys) + list(additional_fatures)

In [107]:
# Build one prediction payload per row of `data_r0`: a dict mapping
# "Property <feature>" to that row's value for every name in `features`.
inputs = []

for pos in range(len(data_r0)):
    # Materialise the row once; `data_r0.iloc[pos][key]` inside the inner loop
    # would rebuild the row Series for every single feature.
    row = data_r0.iloc[pos]
    inputs.append({"Property " + key: row[key] for key in features})

In [109]:
# Query the sigma_sphere dataview once per sample and collect the raw
# predictions; they are unpacked into plain floats in the next cell.
sigma_sphere = []

for pos in range(len(data_r0)):
    response = client.predict("28", inputs[pos])  # "28" is the ID of the dataview on Citrination
    candidate = response['candidates'][0]
    sigma_sphere.append(candidate['Property sigma_sphere'])

In [110]:
# Keep only the predicted value (first item) of every raw prediction, as float.
# Items that are not a list/tuple are assumed to be already unpacked, so
# re-running this cell is harmless; an empty list stays empty instead of
# raising on the 2-D numpy index.
sigma_sphere = [
    float(item[0]) if isinstance(item, (list, tuple)) else float(item)
    for item in sigma_sphere
]

In [111]:
# Standard deviation of the true sigma_sphere values; used below to express
# the mean absolute error relative to the spread of the labels.
label_std = data_r0["sigma_sphere"].std()

In [112]:
# Mean absolute error of the Citrination sigma_sphere predictions. Arguments
# are passed by keyword in the documented (y_true, y_pred) roles; the value is
# the same either way because the absolute error is symmetric.
scores = mean_absolute_error(y_true=data_r0["sigma_sphere"], y_pred=sigma_sphere)

# Error relative to the standard deviation of the true labels.
scores / label_std

0.12633425680627983

## Using dataview rg_gp

In [113]:
# Keep only the samples that have an rg_gp value; rows where it is missing
# cannot be scored against a prediction.
has_rg = data["rg_gp"].notnull()
data_rg = data[has_rg]
data_rg.shape

(518, 32)

In [114]:
# Extra features used by the rg_gp dataview in addition to the standard
# profile features.
additional_fatures = [
    'I0_over_Imean',
    'I0_curvature',
    'q_at_half_I0',
]

# Full input feature list: profile features first, then the extra ones.
features = list(profile_keys) + list(additional_fatures)

In [115]:
# Build one prediction payload per row of `data_rg`: a dict mapping
# "Property <feature>" to that row's value for every name in `features`.
inputs = []

for pos in range(len(data_rg)):
    # Materialise the row once; `data_rg.iloc[pos][key]` inside the inner loop
    # would rebuild the row Series for every single feature.
    row = data_rg.iloc[pos]
    inputs.append({"Property " + key: row[key] for key in features})

In [119]:
# Query the rg_gp dataview once per sample and collect the raw predictions;
# they are unpacked into plain floats in the next cell.
rg_gp = []

for pos in range(len(data_rg)):
    response = client.predict("29", inputs[pos])  # "29" is the ID of the dataview on Citrination
    candidate = response['candidates'][0]
    rg_gp.append(candidate['Property rg_gp'])

In [120]:
# Keep only the predicted value (first item) of every raw prediction, as float.
# Items that are not a list/tuple are assumed to be already unpacked, so
# re-running this cell is harmless; an empty list stays empty instead of
# raising on the 2-D numpy index.
rg_gp = [
    float(item[0]) if isinstance(item, (list, tuple)) else float(item)
    for item in rg_gp
]

In [121]:
# Standard deviation of the true rg_gp values; used below to express the mean
# absolute error relative to the spread of the labels.
label_std = data_rg["rg_gp"].std()

In [122]:
# Mean absolute error of the Citrination rg_gp predictions. Arguments are
# passed by keyword in the documented (y_true, y_pred) roles; the value is the
# same either way because the absolute error is symmetric.
scores = mean_absolute_error(y_true=data_rg["rg_gp"], y_pred=rg_gp)

# Error relative to the standard deviation of the true labels.
scores / label_std

0.12304372434238151

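### Regression error of Citrination Models vs Saxs_kit Models (Citrination column: mean absolute error divided by the label standard deviation; lower is better)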

| model           | Citrination (training) | saxs_kit (training) | saxs_kit (keep one experiment out) |
| ------------- | ------------- | ------------- | ------------- |
| r0_sphere  | 0.02  | - | 0.14
| sigma_sphere  | 0.12  | - | 0.64
| rg_gp | 0.12 | - | 0.23