### Getting data from Citrination

In [193]:
from citrination_client import CitrinationClient
from citrination_client import PifQuery
from pypif.pif import dumps
import json 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import yaml
from sklearn import preprocessing
from sklearn import linear_model
import sklearn

In [194]:
#client = CitrinationClient(site='my_site',api_key='my_key' ) 

In [196]:
list_of_datasets = [1,15,16]

In [197]:
features = ['q_Imax', 'Imax_over_Imean', 'Imax_sharpness','logI_fluctuation', 'logI_max_over_std']

In [199]:
# return dataframe with features and labels 
def get_data_from_Citrination(client, dataset_id_list):
    df = pd.DataFrame(columns= [[ 'q_Imax', 'Imax_over_Imean', 'Imax_sharpness','logI_fluctuation', 'logI_max_over_std', 'bad_data', 'form', 'precursor', 'structure']])
    for dataset in dataset_id_list:
        query_dataset = PifQuery(include_datasets=[dataset])
        query_result = client.search(query_dataset)
        pifs = [x.system for x in query_result.hits]
        for line in pifs: # every line of pifs is one sample; we need to extract labels and features from it
            try:
                my_str = dumps(line)
                obj = json.loads(my_str) # to transform the string to dictionary
                
                # default values for labels
                bad_data = False
                form = False
                precursor = False
                structure = False
                    
                for pr in obj['properties']:

                    # extract features                    
                    if pr['name'] == 'q_Imax':
                        q_Imax = np.float32(pr['scalars'][0]['value'])
                    if pr['name'] == 'Imax_over_Imean':
                        Imax_over_Imean = np.float32(pr['scalars'][0]['value'])
                    if pr['name'] == 'Imax_sharpness':
                        Imax_sharpness = np.float32(pr['scalars'][0]['value'])
                    if pr['name'] == 'logI_fluctuation':
                        logI_fluctuation = np.float32(pr['scalars'][0]['value'])
                    if pr['name'] == 'logI_max_over_std':
                        logI_max_over_std = np.float32(pr['scalars'][0]['value'])

                    # extract labels
                    if pr['name'] == 'bad_data':
                        bad_data = True
                        continue
                    if pr['name'] == 'form_factor_scattering':
                        form = True
                    if pr['name'] == 'diffraction_peaks':
                        structure = True
                    if pr['name'] == 'precursor scattering':
                        precursor = True

                df.loc[df.shape[0]] = [q_Imax, Imax_over_Imean, Imax_sharpness, logI_fluctuation, 
                                           logI_max_over_std, bad_data, form, precursor, structure]
            except:
                # May be in PAWS we need to put a custom exeption here
                my_str = dumps(line)
                obj = json.loads(my_str) # to transform the string to dictionary
                print(obj)
                continue
                                 
    return df.convert_objects(convert_numeric=True)

In [200]:
d = get_data_from_Citrination(client, list_of_datasets)

In [201]:
d.head()

Unnamed: 0,q_Imax,Imax_over_Imean,Imax_sharpness,logI_fluctuation,logI_max_over_std,bad_data,form,precursor,structure
0,0.488538,1.312717,1.271242,7.916098,6.26831,True,False,False,False
1,0.275225,1.234932,1.132456,9.904497,5.549794,True,False,False,False
2,0.357429,1.275898,1.166689,5.827428,3.660412,True,False,False,False
3,0.336618,1.240092,1.143083,7.461594,4.006636,True,False,False,False
4,0.22736,1.254006,1.22955,6.997641,5.272323,True,False,False,False


In [202]:
d.shape

(1750, 9)

In [203]:
shuffled_rows = np.random.permutation(d.index)
data = d.loc[shuffled_rows]

In [204]:
# we need to save the version of sklearn to use in PAWS
current_version = list(map(int,sklearn.__version__.split('.')))
major,minor,patch = current_version
current_version

[0, 19, 0]

In [205]:
# I am saving version of sklearn with all scalers and models. 
# Then I will dump them into a yaml file that will be used in PAWS app.
scalers = {} 
models = {}
scalers_and_models = {'version':current_version, 'scalers' : scalers, 'models': models}

### Bad Data model

In [224]:
# to train the model on all avalible data 
scaler = preprocessing.StandardScaler()
scaler.fit(data[features])
log = linear_model.SGDClassifier(alpha= 0.001,loss= 'log', l1_ratio= 0.95, penalty= 'elasticnet')
log.fit(scaler.transform(data[features]), data['bad_data'])

# save the scaler and model
scalers['bad_data'] = scaler.__dict__
models['bad_data'] = log.__dict__

In [225]:
score = log.score(scaler.transform(data[features]), data['bad_data']) # here I test on the same data just to be sure the model works
print("bad data score: ", score)

bad data score:  0.982857142857


In [226]:
#now we need only "good" data
data_good = data[data['bad_data']==False]

In [227]:
bad_data_model = log
bad_data_scaler = scaler

### Form Scattering model

In [247]:
# to train the model on all avalible data 
scaler = preprocessing.StandardScaler()
scaler.fit(data_good[features])
log = linear_model.SGDClassifier(alpha= 0.001,loss= 'log', penalty= 'none')
log.fit(scaler.transform(data_good[features]), data_good['form'])

# save the scaler and model
scalers['form_factor_scattering'] = scaler.__dict__
models['form_factor_scattering'] = log.__dict__

In [248]:
score = log.score(scaler.transform(data[features]), data['form']) # here I test on the same data just to be sure the model works
print("form score: ", score)

form score:  0.987428571429


In [249]:
form_model = log
form_scaler = scaler

### Precursor Scattering model

In [231]:
# to train the model on all avalible data 
scaler = preprocessing.StandardScaler()
scaler.fit(data_good[features])
log = linear_model.SGDClassifier(alpha= 0.01,loss= 'log', l1_ratio= 0.95, penalty= 'l1')
log.fit(scaler.transform(data_good[features]), data_good['precursor'])

# save the scaler and model
scalers['precursor_scattering'] = scaler.__dict__
models['precursor_scattering'] = log.__dict__

In [232]:
score = log.score(scaler.transform(data[features]), data['precursor']) # here I test on the same data just to be sure the model works
print("precursor score: ", score)

precursor score:  0.756571428571


In [233]:
precursor_model = log
precursor_scaler = scaler

### Diffraction Peaks model

In [234]:
# to train the model on all avalible data 
scaler = preprocessing.StandardScaler()
scaler.fit(data_good[features])
log = linear_model.SGDClassifier(alpha= 0.001,loss= 'log', penalty= 'l1')
log.fit(scaler.transform(data_good[features]), data_good['structure'])

# save the scaler and model
scalers['diffraction_peaks'] = scaler.__dict__
models['diffraction_peaks'] = log.__dict__

In [235]:
score = log.score(scaler.transform(data[features]), data['structure']) # here I test on the same data just to be sure the model works
print("structure score: ", score)

structure score:  0.950857142857


In [236]:
structure_model = log
structure_scaler = scaler

In [237]:
with open('scalers_and_models.yml', 'w') as yaml_file:
    yaml.dump(scalers_and_models, yaml_file)

In [238]:
with open('scalers_and_models.yml') as info:
      s_and_m = yaml.load(info)

In [239]:
s_and_m

{'models': {'bad_data': {'C': 1.0,
   '_expanded_class_weight': array([ 1.,  1.]),
   'alpha': 0.001,
   'average': False,
   'class_weight': None,
   'classes_': array([False,  True], dtype=bool),
   'coef_': array([[  1.80233753, -11.99343183,   5.52984411,   1.04371033,
            -0.14865201]]),
   'epsilon': 0.1,
   'eta0': 0.0,
   'fit_intercept': True,
   'intercept_': array([-12.72081737]),
   'l1_ratio': 0.95,
   'learning_rate': 'optimal',
   'loss': 'log',
   'loss_function_': <sklearn.linear_model.sgd_fast.Log at 0x1a14835f78>,
   'max_iter': 5,
   'n_iter_': 5,
   'n_jobs': 1,
   'penalty': 'elasticnet',
   'power_t': 0.5,
   'random_state': None,
   'shuffle': True,
   't_': 8751.0,
   'tol': None,
   'verbose': 0,
   'warm_start': False},
  'diffraction_peaks': {'C': 1.0,
   '_expanded_class_weight': array([ 1.,  1.]),
   'alpha': 0.001,
   'average': False,
   'class_weight': None,
   'classes_': array([False,  True], dtype=bool),
   'coef_': array([[ -7.93713426,   9.

### Updating the scalers and models

In [222]:
list_of_datasets_with_new_data = [16]

In [223]:
new_data = get_data_from_Citrination(client, list_of_datasets_with_new_data)
shuffled_rows = np.random.permutation(new_data.index)
new_data = new_data.loc[shuffled_rows]

In [250]:
#in paws we can use SaxsClassifier to recreate the old models and scalers and then we can update them using code below

# bad data
# updated the scaler:
bad_data_scaler.partial_fit(new_data[features])
# update the model
bad_data_model.partial_fit(bad_data_scaler.transform(new_data[features]), new_data['bad_data'], classes=[True, False])

scores = bad_data_model.score(bad_data_scaler.transform(new_data[features]), new_data['bad_data'])
print(scores)

0.997885835095


In [252]:
# for form, precursor, structure labels we will use only data with bad_data = False
new_data_good = new_data[new_data['bad_data']==False]

In [253]:
# form factor scattering
# updated the scaler:
form_scaler.partial_fit(new_data_good[features])
# update the model
form_model.partial_fit(form_scaler.transform(new_data_good[features]), new_data_good['form'], classes=[True, False])

scores = form_model.score(form_scaler.transform(new_data_good[features]), new_data_good['form'])
print(scores)

0.985138004246


In [256]:
# precursor scattering
# updated the scaler:
precursor_scaler.partial_fit(new_data_good[features])
# update the model
precursor_model.partial_fit(precursor_scaler.transform(new_data_good[features]), new_data_good['precursor'], classes=[True, False])

scores = precursor_model.score(precursor_scaler.transform(new_data_good[features]), new_data_good['precursor'])
print(scores)

0.71974522293


In [258]:
# diffraction peaks
# updated the scaler:
structure_scaler.partial_fit(new_data_good[features])
# update the model
structure_model.partial_fit(structure_scaler.transform(new_data_good[features]), new_data_good['structure'], classes=[True, False])

scores = structure_model.score(structure_scaler.transform(new_data_good[features]), new_data_good['structure'])
print(scores)

0.993630573248


### Save updated scalers and models

In [259]:
current_version = list(map(int,sklearn.__version__.split('.')))
major,minor,patch = current_version

scalers = {} 
models = {}
scalers_and_models = {'version':current_version, 'scalers' : scalers, 'models': models}

scalers['bad_data'] = bad_data_scaler.__dict__
models['bad_data'] = bad_data_model.__dict__

scalers['form_factor_scattering'] = form_scaler.__dict__
models['form_factor_scattering'] = form_model.__dict__

scalers['precursor_scattering'] = precursor_scaler.__dict__
models['precursor_scattering'] = precursor_model.__dict__

scalers['diffraction_peaks'] = structure_scaler.__dict__
models['diffraction_peaks'] = structure_model.__dict__

with open('scalers_and_models.yml', 'w') as yaml_file:
    yaml.dump(scalers_and_models, yaml_file)

In [260]:
with open('scalers_and_models.yml') as info:
      s_and_m = yaml.load(info)
s_and_m

{'models': {'bad_data': {'C': 1.0,
   '_expanded_class_weight': array([ 1.,  1.]),
   'alpha': 0.001,
   'average': False,
   'class_weight': None,
   'classes_': array([False,  True], dtype=bool),
   'coef_': array([[  2.23294798, -11.70177602,   5.25285292,   0.10739561,
             0.23208343]]),
   'epsilon': 0.1,
   'eta0': 0.0,
   'fit_intercept': True,
   'intercept_': array([-12.76920557]),
   'l1_ratio': 0.95,
   'learning_rate': 'optimal',
   'loss': 'log',
   'loss_function_': <sklearn.linear_model.sgd_fast.Log at 0x1a14835b88>,
   'max_iter': 5,
   'n_iter_': 1,
   'n_jobs': 1,
   'penalty': 'elasticnet',
   'power_t': 0.5,
   'random_state': None,
   'shuffle': True,
   't_': 10643.0,
   'tol': None,
   'verbose': 0,
   'warm_start': False},
  'diffraction_peaks': {'C': 1.0,
   '_expanded_class_weight': array([ 1.,  1.]),
   'alpha': 0.001,
   'average': False,
   'class_weight': None,
   'classes_': array([False,  True], dtype=bool),
   'coef_': array([[-9.50249827,  1.4