In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import csv
import yaml
from os import listdir
from os.path import isfile, isdir, join, splitext

In [10]:
# Modified function from paws.
def _lookup_flag(flags, name):
    """Return the flag stored for spectrum `name` in one label mapping.

    The UTF-8 encoded (bytes) form of the name is tried first, which is the
    key form this function has always used; the plain string is accepted as
    a fallback so label mappings keyed by `str` work as well.
    Raises KeyError if neither form is present.
    """
    b_name = name.encode('utf-8')
    if b_name in flags:
        return flags[b_name]
    return flags[name]


def profile_spectrum(q_I, name, folder, labels_dictionary):
    """Numerical profiling of a SAXS spectrum, combined with its labels.

    Profile a saxs spectrum by taking several fast numerical metrics
    from the measured data.
    The metrics should be consistent for spectra
    with different intensity scaling
    or different q domains.

    Parameters
    ----------
    q_I : mapping of column name -> sequence (e.g. a pandas DataFrame)
        Must provide the columns '# q (1/Angstrom)' and
        ' Intensity (counts)' (note the leading space in the latter).
    name : str
        Spectrum name, used as the key into every label mapping.
    folder : str
        Experiment identifier; copied into the result unchanged.
    labels_dictionary : dict
        Must contain 'bad_data_flags', 'form_flags', 'precursor_flags'
        and 'structure_flags', each mapping a spectrum name to a flag.

    Returns
    -------
    list
        Twelve items, in this order:
        folder, name, q_Imax, Imax_over_Imean, Imax_sharpness,
        logI_fluctuation, logI_max_over_std, bad_data flag, form flag,
        precursor flag, structure flag, string label.
        The string label is 'bad_data' when the bad-data flag is set;
        otherwise it is the concatenation of 'form_', 'precursor_' and
        'structure' for each flag that is set (possibly empty).

    Raises
    ------
    KeyError
        If a required column, label mapping, or the spectrum's entry in a
        label mapping is missing.
    ValueError
        Raised by numpy when the intensity array is empty or contains no
        strictly positive value.
    """
    features_labels = [folder, name]  # experiment, file name

    q = np.array(q_I['# q (1/Angstrom)'])
    I = np.array(q_I[' Intensity (counts)'])

    # I metrics
    idxmax = np.argmax(I)
    I_max = I[idxmax]
    q_Imax = q[idxmax]
    Imax_over_Imean = I_max / np.mean(I)

    # The log metrics only use strictly positive intensities.
    logI_nz = np.log(I[I > 0])
    logI_max = np.max(logI_nz)
    logI_range = logI_max - np.min(logI_nz)

    # I_max peak shape analysis: compare the maximum with the mean
    # intensity inside a +/-10% window of q around the position of the maximum.
    idx_around_max = ((q > 0.9*q_Imax) & (q < 1.1*q_Imax))
    Imean_around_max = np.mean(I[idx_around_max])
    Imax_sharpness = I_max / Imean_around_max

    logI_std = np.std(logI_nz)
    logI_max_over_std = logI_max / logI_std

    ### fluctuation analysis
    # array of the difference between neighboring points:
    nn_diff = logI_nz[1:] - logI_nz[:-1]
    # keep indices where the sign of this difference changes.
    # also keep first index
    nn_diff_prod = nn_diff[1:] * nn_diff[:-1]
    idx_keep = np.hstack((np.array([True]), nn_diff_prod < 0))
    fluc = np.sum(np.abs(nn_diff[idx_keep]))
    logI_fluctuation = fluc / logI_range

    features_labels.extend([
        q_Imax,
        Imax_over_Imean,
        Imax_sharpness,
        logI_fluctuation,
        logI_max_over_std,
    ])

    # LABELS
    # All four mappings are fetched up front so that a missing mapping is
    # reported even for spectra flagged as bad data.
    bad_data_flags = labels_dictionary['bad_data_flags']
    form_flags = labels_dictionary['form_flags']
    precursor_flags = labels_dictionary['precursor_flags']
    structure_flags = labels_dictionary['structure_flags']

    bad_data_flag = _lookup_flag(bad_data_flags, name)
    features_labels.append(bad_data_flag)

    if bad_data_flag:
        # Bad data carries no further labels: form, precursor, structure.
        features_labels.extend([False, False, False])
        srt_label = 'bad_data'
    else:
        srt_label = ''
        labelled_flags = (
            (form_flags, 'form_'),
            (precursor_flags, 'precursor_'),
            (structure_flags, 'structure'),
        )
        for flags, label_part in labelled_flags:
            flag = _lookup_flag(flags, name)
            features_labels.append(flag)
            if flag:
                srt_label += label_part

    features_labels.append(srt_label)

    return features_labels

In [11]:
# Experiment folders found directly under the data directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the experiment order (and everything derived from it) reproducible.
sub_dirs = sorted(f for f in listdir('../2016_saxs_data') if isdir(join('../2016_saxs_data', f)))
print(len(sub_dirs))
sub_dirs

16


['R1',
 'R12',
 'R13',
 'R2',
 'R3',
 'R4',
 'R5',
 'R6',
 'R7',
 'Reaction_A',
 'Reaction_B',
 'Reaction_C',
 'Reaction_D',
 'Reaction_E',
 'Reaction_G',
 'Reaction_H']

In [12]:
# Build one feature/label row per spectrum file, for every experiment folder.
tr_data = []
# (experiment, file name, error) for every file that could not be profiled,
# so that failures can be inspected instead of disappearing silently.
skipped_files = []
for d in sub_dirs:
    count = 0
    exp_dir = join('../2016_saxs_data', d)
    # Sorted so the row order does not depend on the OS directory order.
    all_files = sorted(f for f in listdir(exp_dir) if isfile(join(exp_dir, f)))  # .csv and yaml included
    # NOTE(review): yaml.Loader can construct arbitrary Python objects, so it
    # must only be used on trusted label files. It is kept (rather than
    # safe_load) because the tags used in the label files are not visible
    # here; switch to yaml.safe_load if the files load correctly with it.
    with open(join(exp_dir, 'data_labels_chronological.yml')) as labels_file:
        labels_dictionary = yaml.load(labels_file, Loader=yaml.Loader)
    for f in all_files:
        try:
            df = pd.read_csv(join(exp_dir, f))
            name = splitext(f)[0]
            tr_data.append(profile_spectrum(df, name, d, labels_dictionary))
            count += 1
        except Exception as err:
            # Best effort: files that are not spectra (e.g. the labels file)
            # or have no label entry are skipped, but the reason is recorded.
            skipped_files.append((d, f, repr(err)))
            continue
    print(count, d)

293 R1
306 R12
103 R13
120 R2
145 R3
127 R4
183 R5
107 R6
89 R7
76 Reaction_A
65 Reaction_B
88 Reaction_C
70 Reaction_D
60 Reaction_E
51 Reaction_G
63 Reaction_H


In [13]:
# Total number of profiled spectra collected in tr_data.
len(tr_data)

1946

In [18]:
# Write the header and all feature/label rows to CSV.
# The file is opened in a `with` block so it is flushed and closed before
# any later cell reads it back, and with newline='' as the csv module
# requires to avoid spurious blank lines on some platforms.
with open("labels_and_features_by_experiments.csv", 'w', newline='') as out_file:
    writer50 = csv.writer(out_file)
    writer50.writerow(['experiment','name', 'q_Imax', 'Imax_over_Imean','Imax_sharpness', 'logI_fluctuation',
                       'logI_max_over_std', 'bad_data', 'form', 'precursor', 'structure', 'str_label'])
    writer50.writerows(tr_data)

In [19]:
# Read labels_and_features_by_experiments.csv back into a DataFrame and preview its first rows.
df_t2 = pd.read_csv('labels_and_features_by_experiments.csv')
df_t2.head()

Unnamed: 0,experiment,name,q_Imax,Imax_over_Imean,Imax_sharpness,logI_fluctuation,logI_max_over_std,bad_data,form,precursor,structure,str_label
0,R1,R1_1stcool_0001_dz_bgsub,0.04,18.752705,1.03987,5.330748,3.075002,False,True,False,False,form_
1,R1,R1_1stcool_0002_dz_bgsub,0.04,18.734222,1.038783,4.951954,3.026496,False,True,False,False,form_
2,R1,R1_1stcool_0003_dz_bgsub,0.04,18.7007,1.038475,4.222822,3.19609,False,True,False,False,form_
3,R1,R1_1stcool_0004_dz_bgsub,0.04,18.811386,1.041716,5.150165,2.970743,False,True,False,False,form_
4,R1,R1_1stcool_0005_dz_bgsub,0.04,18.69533,1.038356,5.528953,2.947531,False,True,False,False,form_


In [20]:
# Print the column dtypes and non-null counts of df_t2.
df_t2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 12 columns):
experiment           1946 non-null object
name                 1946 non-null object
q_Imax               1946 non-null float64
Imax_over_Imean      1946 non-null float64
Imax_sharpness       1946 non-null float64
logI_fluctuation     1946 non-null float64
logI_max_over_std    1946 non-null float64
bad_data             1946 non-null bool
form                 1946 non-null bool
precursor            1946 non-null bool
structure            1946 non-null bool
str_label            1946 non-null object
dtypes: bool(4), float64(5), object(3)
memory usage: 129.3+ KB
