# Getting data from Citrination

In [325]:
from citrination_client import CitrinationClient
from citrination_client import PifQuery
from pypif.pif import dumps
import json 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import pickle

In [103]:
# Read the first line of the local key file; it still carries its line ending here.
with open("citrination_api_key_ssrl.txt", "r") as key_file:
    api_key = key_file.readline()

In [106]:
# Drop surrounding whitespace (including the trailing newline left by readline()).
a_key = api_key.strip()

In [107]:
# Client bound to the SLAC Citrination site, authenticated with the key read above.
client = CitrinationClient(site='https://slac.citrination.com',api_key=a_key ) 

In [260]:
# Ids of the datasets that are queried further down in this notebook.
list_of_datasets = [1,15,16]

In [307]:
def get_data_from_Citrination(client, dataset_id_list, expected_length=560):
    """Download the 'SAXS intensity' curves of the given datasets.

    Every dataset id is queried separately. Each returned record is
    serialized with ``dumps`` and parsed back with ``json.loads`` to obtain a
    plain dictionary, and every property named 'SAXS intensity' is turned
    into a two-column frame.

    Parameters
    ----------
    client
        Object exposing ``search(query)`` whose result has a ``hits``
        sequence of objects with a ``system`` attribute.
    dataset_id_list
        Iterable of dataset ids to query.
    expected_length : int, optional
        Number of points a curve must have (both q and I) to be kept;
        curves of any other length are skipped. Defaults to 560.

    Returns
    -------
    (list, dict)
        ``all_samples_names`` -- sample ids of the form
        ``"set_<dataset>_<uid>"`` in the order they were first seen, and
        ``data`` -- mapping from sample id to a float DataFrame with the
        columns 'q' and 'I'.
    """
    # NOTE(review): the query relies on the client's default paging; confirm
    # that a single search() call returns every record of a dataset.
    all_samples_names = []  # ordered ids, usable for random access into `data`
    data = {}               # sample id -> DataFrame('q', 'I')

    for dataset in dataset_id_list:
        query_result = client.search(PifQuery(include_datasets=[dataset]))
        for hit in query_result.hits:
            # Round trip through JSON to get a plain dictionary.
            obj = json.loads(dumps(hit.system))

            # Records without any properties are skipped instead of raising.
            for pr in obj.get('properties') or []:
                if pr.get('name') != 'SAXS intensity':
                    continue

                conditions = pr.get('conditions') or []
                if not conditions:
                    # No condition means no q axis for this property.
                    continue

                # Each scalar is a small dictionary; collect all its values.
                q_list = [value
                          for scalar in conditions[0].get('scalars') or []
                          for value in scalar.values()]
                I_list = [value
                          for scalar in pr.get('scalars') or []
                          for value in scalar.values()]
                if len(q_list) != expected_length or len(I_list) != expected_length:
                    continue

                sample_id = "set_" + str(dataset) + "_" + obj['uid']
                # Register a name only once so that a record carrying several
                # matching properties does not yield duplicate feature rows.
                if sample_id not in data:
                    all_samples_names.append(sample_id)
                frame = pd.DataFrame.from_dict({'q': q_list, 'I': I_list})
                data[sample_id] = frame.astype(float)

    return all_samples_names, data

In [314]:
def extract_features(df, name):
    """Compute the numeric profile of one intensity curve.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'q' and 'I' (same length).
    name
        Identifier of the sample; stored unchanged as the first element of
        the result.

    Returns
    -------
    tuple
        30 elements in this order: ``name``, ``q_Imax``, ``Imax_over_Imean``,
        ``Imax_over_Imean_local``, ``fluctuation_strength``, ``low_q_ratio``,
        ``high_q_ratio``, ``Imax_over_Ilowq``, ``Imax_over_Ihighq``,
        ``Ilowq_over_Ihighq`` and 20 bin strengths for the q intervals
        ``[i*0.05, (i+1)*0.05)``, ``i = 0..19``.

    Notes
    -----
    A bin strength stays 0.0 when the bin holds fewer than two points or
    spans zero width in q; previously a single-point bin produced NaN
    through a 0/0 division. Non-positive intensities still lead to
    NaN/-inf in the bin strengths because of the logarithm.
    """
    q = np.asarray(df['q'], dtype=float)
    I = np.asarray(df['I'], dtype=float)

    # Global maximum and its position.
    idxmax = np.argmax(I)
    Imax = I[idxmax]
    q_Imax = q[idxmax]

    Imean = np.mean(I)
    Imax_over_Imean = float(Imax) / float(Imean)

    # Maximum relative to the mean inside a +/-10 % window around its q value.
    near_max = (q > 0.9 * q_Imax) & (q < 1.1 * q_Imax)
    Imax_over_Imean_local = Imax / np.mean(I[near_max])

    # Fluctuation analysis: sum the absolute neighbour differences at the
    # first point and wherever the sign of the difference flips.
    nn_diff = np.diff(I)
    sign_flips = nn_diff[1:] * nn_diff[:-1] < 0
    idx_keep = np.hstack((np.array([True]), sign_flips))
    fluctuation_strength = np.sum(np.abs(nn_diff[idx_keep])) / Imean

    # Share of the summed intensity below / at-or-above q = 0.4.
    I_sum = np.sum(I)
    low_q_ratio = np.sum(I[q < 0.4]) / I_sum
    high_q_ratio = np.sum(I[q >= 0.4]) / I_sum

    # Curve shape: mean intensity at low q (< 0.1) and high q (> 0.4).
    I_lowq_mean = np.mean(I[q < 0.1])
    I_highq_mean = np.mean(I[q > 0.4])
    Imax_over_Ilowq = float(Imax) / I_lowq_mean
    Ilowq_over_Ihighq = I_lowq_mean / I_highq_mean
    Imax_over_Ihighq = float(Imax) / I_highq_mean

    # Per-bin strength: q-weighted average of log(I / Imax) over the bin,
    # evaluated on the midpoints of neighbouring samples.
    bin_strengths = np.zeros(20)
    for i in range(20):
        qmini, qmaxi = i * 0.05, (i + 1) * 0.05
        in_bin = (q >= qmini) & (q < qmaxi)
        if np.count_nonzero(in_bin) < 2:
            # Empty or single-point bin: no interval to integrate over.
            continue
        qi = q[in_bin]
        span = qi[-1] - qi[0]
        if span == 0:
            # All points share the same q; avoid dividing by zero.
            continue
        Ii = I[in_bin] / Imax
        dqi = np.diff(qi)
        I_mid = (Ii[1:] + Ii[:-1]) / 2
        bin_strengths[i] = np.sum(np.log(I_mid) * dqi) / span

    features = [
        name,
        q_Imax,
        Imax_over_Imean,
        Imax_over_Imean_local,
        fluctuation_strength,
        low_q_ratio,
        high_q_ratio,
        Imax_over_Ilowq,
        Imax_over_Ihighq,
        Ilowq_over_Ihighq,
    ]
    features.extend(bin_strengths)

    return tuple(features)

In [324]:
def create_data_frame(data, names):
    """Assemble the feature table for the requested samples.

    Parameters
    ----------
    data : dict
        Mapping from sample name to the frame handed to ``extract_features``.
    names : iterable
        Sample names to process; they determine the row order.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``names`` with the column 'name', nine scalar
        feature columns and the bin columns 'b_s_1' ... 'b_s_20'.
    """
    scalar_columns = [
        'name',
        'q_Imax',
        'Imax_over_Imean',
        'Imax_over_Imean_local',
        'fluctuation_strength',
        'low_q_ratio',
        'high_q_ratio',
        'Imax_over_Ilowq',
        'Imax_over_Ihighq',
        'Ilowq_over_Ihighq',
    ]
    # Bin columns are numbered from 1 to 20.
    bin_columns = ['b_s_' + str(number) for number in range(1, 21)]

    records = [extract_features(data[sample], sample) for sample in names]
    return pd.DataFrame.from_records(records, columns=scalar_columns + bin_columns)

## get_data_and_create_df(client, dataset_id_list)

In [326]:
from sklearn.cluster import MiniBatchKMeans
from sklearn import preprocessing

def get_data_and_create_df(client, dataset_id_list):
    """Download the curves of the given datasets and return their feature table.

    Parameters
    ----------
    client
        Client object passed through to ``get_data_from_Citrination``.
    dataset_id_list
        Dataset ids passed through to ``get_data_from_Citrination``.

    Returns
    -------
    pandas.DataFrame
        The frame built by ``create_data_frame``. The function used to fall
        off the end and return ``None`` although its result is used as a
        DataFrame further down in this notebook.
    """
    names, data = get_data_from_Citrination(client, dataset_id_list)
    df = create_data_frame(data, names)

    # Every column except the sample identifier is a numeric feature.
    # (`features` was previously an undefined name and raised NameError.)
    features = [column for column in df.columns if column != 'name']

    if not df.empty:
        # Scaffold for the clustering step; the fitted scaler is not used yet.
        scaler = preprocessing.StandardScaler()
        scaler.fit(df[features])
        # TODO: put my best unsupervised model here
        # for example:
        # clusterer = MiniBatchKMeans(n_clusters=n_clusters, random_state=10)
        # clusterer.fit(scaler.transform(df[features]))
        # model.to_pickle('unsupervised_model.pkl')

    return df

In [None]:
# Temporary helper used only for testing.
def create_and_save_model(client, dataset_id_list):
    """Fetch the curves of the given datasets and return their feature table.

    Despite its name, this function neither builds nor saves a model; it
    only chains ``get_data_from_Citrination`` and ``create_data_frame``.
    """
    sample_names, curves = get_data_from_Citrination(client, dataset_id_list)
    return create_data_frame(curves, sample_names)

In [327]:
# Build the feature table for the datasets listed in list_of_datasets.
data_frame = get_data_and_create_df(client, list_of_datasets)

In [328]:
# Preview the first rows of the feature table.
data_frame.head()

Unnamed: 0,name,q_Imax,Imax_over_Imean,Imax_over_Imean_local,fluctuation_strength,low_q_ratio,high_q_ratio,Imax_over_Ilowq,Imax_over_Ihighq,Ilowq_over_Ihighq,...,b_s_11,b_s_12,b_s_13,b_s_14,b_s_15,b_s_16,b_s_17,b_s_18,b_s_19,b_s_20
0,set_1_R1_1479582658,0.489,1.343762,1.302965,14.996414,0.633562,0.366438,1.725431,1.309675,0.759042,...,-0.29366,-0.332603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,set_1_R1_1479582669,0.275,1.234448,1.133807,17.833578,0.634872,0.365128,1.609803,1.20745,0.750061,...,-0.200915,-0.247893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,set_1_R1_1479582681,0.103,1.265933,1.338246,17.744758,0.636601,0.363399,1.630249,1.244138,0.763158,...,-0.218888,-0.29177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,set_1_R1_1479582692,0.447,1.233866,1.161025,20.241095,0.634705,0.365295,1.615438,1.206331,0.746752,...,-0.191737,-0.271467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,set_1_R1_1479582703,0.187,1.249696,1.234976,17.060467,0.63593,0.36407,1.59084,1.225917,0.77061,...,-0.205632,-0.275274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [329]:
# (number of samples, number of columns)
data_frame.shape

(2170, 30)

In [331]:
# Save the feature table to 'df_for_clustering.pkl' in the working directory.
data_frame.to_pickle('df_for_clustering.pkl')