In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import csv
import yaml
from os import listdir
from os.path import isfile, isdir, join, splitext

In [2]:
def extract_features_and_labels_20(d, name, labels_dictionary):
    """Build one row of curve features followed by the hand-assigned labels.

    Parameters
    ----------
    d : mapping or pandas.DataFrame
        Must provide the columns '# q (1/Angstrom)' and ' Intensity (counts)'
        (note the leading space in the second key).
    name : str
        Sample identifier. Stored as the first element of the row and, encoded
        to UTF-8 bytes, used as the key into the label tables.
    labels_dictionary : dict
        Must contain the tables 'bad_data_flags', 'form_flags',
        'precursor_flags' and 'structure_flags'. 'bad_data_flags' is always
        indexed by ``name.encode('utf-8')``; the other three only when the
        bad-data flag is falsy.

    Returns
    -------
    list
        35 items in this order: name, q_Imax, Imax_over_Imean,
        Imax_over_Imean_local, fluctuation_strength, low_q_ratio,
        high_q_ratio, Imax_over_Ilowq, Imax_over_Ihighq, Ilowq_over_Ihighq,
        20 bin strengths, bad_data flag, form flag, precursor flag,
        structure flag, and a string label built from the flags.

    Raises
    ------
    KeyError
        If a column is missing from `d`, a table is missing from
        `labels_dictionary`, or the encoded name is missing from a table that
        is looked up.
    ZeroDivisionError
        If the mean intensity is exactly zero.
    """
    q = np.array(d['# q (1/Angstrom)'])
    I = np.array(d[' Intensity (counts)'])

    # Global maximum of the curve and the q value where it occurs.
    idxmax = np.argmax(I)
    Imax = I[idxmax]
    q_Imax = q[idxmax]

    Imean = np.mean(I)
    Imax_over_Imean = float(Imax)/float(Imean)

    # Peak height relative to the mean intensity within +/-10% of the peak position.
    idx_around_max = ((q > 0.9*q_Imax) & (q < 1.1*q_Imax))
    Imean_around_max = np.mean(I[idx_around_max])
    Imax_over_Imean_local = Imax / Imean_around_max

    ### fluctuation analysis
    # array of the difference between neighboring points:
    nn_diff = I[1:]-I[:-1]
    # keep indices where the sign of this difference changes.
    # also keep first index
    nn_diff_prod = nn_diff[1:]*nn_diff[:-1]
    idx_keep = np.hstack((np.array([True]), nn_diff_prod < 0))
    fluc = np.sum(np.abs(nn_diff[idx_keep]))
    fluctuation_strength = fluc/Imean

    # Fraction of the summed intensity below / at-or-above q = 0.4.
    I_sum = np.sum(I)
    low_q_ratio = np.sum(I[(q < 0.4)])/I_sum
    high_q_ratio = np.sum(I[(q >= 0.4)])/I_sum

    ### curve shape analysis
    I_lowq_mean = np.mean(I[q < 0.1])
    I_highq_mean = np.mean(I[q > 0.4])
    Imax_over_Ilowq = float(Imax)/I_lowq_mean
    Ilowq_over_Ihighq = I_lowq_mean/I_highq_mean
    Imax_over_Ihighq = float(Imax)/I_highq_mean

    # Twenty bins of width 0.05 in q. Each bin strength is the q-weighted
    # average of log(I/Imax), evaluated at the midpoints of neighboring samples.
    bin_strengths = np.zeros(20)
    for i in range(20):
        qmini, qmaxi = i*0.05, (i+1)*0.05
        idxi = ((q >= qmini) & (q < qmaxi))
        qi = q[idxi]
        # At least two samples are needed to form an interval; with fewer the
        # average is undefined (0/0), so the bin keeps its 0.0 default exactly
        # like an empty bin.
        if qi.size < 2:
            continue
        q_span = qi[-1] - qi[0]
        if q_span == 0:
            continue
        Ii = I[idxi]/Imax  # normalize by the global maximum
        dqi = qi[1:] - qi[:-1]
        Ii_mid = (Ii[1:] + Ii[:-1])/2
        # NOTE(review): non-positive midpoint intensities yield -inf/nan here.
        bin_strengths[i] = np.sum(np.log(Ii_mid) * dqi) / q_span

    features = [
        name,
        q_Imax,
        Imax_over_Imean,
        Imax_over_Imean_local,
        fluctuation_strength,
        low_q_ratio,
        high_q_ratio,
        Imax_over_Ilowq,
        Imax_over_Ihighq,
        Ilowq_over_Ihighq,
    ]
    features.extend(bin_strengths)

    #LABELS
    bad_data_flags = labels_dictionary['bad_data_flags']
    form_flags = labels_dictionary['form_flags']
    precursor_flags = labels_dictionary['precursor_flags']
    structure_flags = labels_dictionary['structure_flags']

    srt_label = ''
    # The label tables are keyed by the UTF-8 encoded (bytes) sample name.
    b_name = name.encode('utf-8')
    bad_data_flag = bad_data_flags[b_name]
    features.append(bad_data_flag)

    if bad_data_flag:
        # Bad data carries no further labels: form, precursor, structure.
        features.extend([False, False, False])
        srt_label = 'bad_data'
    else:
        form_flag = form_flags[b_name]
        features.append(form_flag)
        if form_flag:
            srt_label += 'form_'
        precursor_flag = precursor_flags[b_name]
        features.append(precursor_flag)
        if precursor_flag:
            srt_label += 'precursor_'
        structure_flag = structure_flags[b_name]
        features.append(structure_flag)
        if structure_flag:
            srt_label += 'structure'

    features.append(srt_label)

    return features

In [9]:
# os.listdir returns entries in arbitrary order; sort them so the directory
# order (and the row order of everything built from it) is reproducible.
sub_dirs = sorted(f for f in listdir('2016_saxs_data') if isdir(join('2016_saxs_data', f)))
print(len(sub_dirs))
sub_dirs

16


['R1',
 'R12',
 'R13',
 'R2',
 'R3',
 'R4',
 'R5',
 'R6',
 'R7',
 'Reaction_A',
 'Reaction_B',
 'Reaction_C',
 'Reaction_D',
 'Reaction_E',
 'Reaction_G',
 'Reaction_H']

In [10]:
# Report how many regular files each sub-directory holds
# (CSV data files and YAML label files are both counted).
for d in sub_dirs:
    dir_path = join('2016_saxs_data', d)
    all_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]
    count = len(all_files)
    print(count, d)

587 R1
613 R12
207 R13
241 R2
291 R3
255 R4
370 R5
215 R6
179 R7
153 Reaction_A
131 Reaction_B
177 Reaction_C
141 Reaction_D
121 Reaction_E
103 Reaction_G
127 Reaction_H


In [27]:
tr_data = []  # hand-labeled data for training: one feature/label row per file
skipped_files = []  # (path, error) pairs for files that could not be processed
for d in sub_dirs:
    count = 0
    dir_path = join('2016_saxs_data', d)
    # SECURITY: yaml.Loader can construct arbitrary Python objects. It is kept
    # because it matches what the bare yaml.load() call did; only use it on
    # trusted label files, and switch to yaml.safe_load if they permit it.
    with open(join(dir_path, 'data_labels_chronological.yml')) as labels_file:
        labels_dictionary = yaml.load(labels_file, Loader=yaml.Loader)
    # YAML files live next to the data files; they are not curve data, so skip
    # them up front instead of relying on a parse failure.
    all_files = [f for f in listdir(dir_path)
                 if isfile(join(dir_path, f))
                 and splitext(f)[1].lower() not in ('.yml', '.yaml')]
    for f in all_files:
        file_path = join(dir_path, f)
        name = splitext(f)[0]
        try:
            df = pd.read_csv(file_path)
            row = extract_features_and_labels_20(df, name, labels_dictionary)
        except Exception as err:
            # Best effort: keep going, but remember what was skipped and why
            # (inspect `skipped_files` afterwards) instead of hiding it.
            skipped_files.append((file_path, repr(err)))
            continue
        tr_data.append(row)
        count += 1
    print(count, d)

293 R1
306 R12
103 R13
120 R2
145 R3
127 R4
183 R5
107 R6
89 R7
76 Reaction_A
65 Reaction_B
88 Reaction_C
70 Reaction_D
60 Reaction_E
51 Reaction_G
63 Reaction_H


In [28]:
# Total number of feature/label rows collected across all sub-directories.
len(tr_data)

1946

In [32]:
# Column names, in the exact order produced by extract_features_and_labels_20.
header = (['name', 'q_Imax', 'Imax_over_Imean', 'Imax_over_Imean_local',
           'fluctuation_strength', 'low_q_ratio', 'high_q_ratio',
           'Imax_over_Ilowq', 'Imax_over_Ihighq', 'Ilowq_over_Ihighq']
          + ['b_s_%d' % i for i in range(1, 21)]
          + ['bad_data', 'form', 'precursor', 'structure', 'str_label'])

# The context manager flushes and closes the file, so the complete table is on
# disk before it is read back. newline='' is what the csv module expects; it
# prevents blank lines between rows on Windows.
with open("labels_and_features_20.csv", 'w', newline='') as csv_file:
    writer20 = csv.writer(csv_file)
    writer20.writerow(header)
    writer20.writerows(tr_data)

In [33]:
# Read the exported table back from disk and preview its first rows.
df_t2 = pd.read_csv('labels_and_features_20.csv')
df_t2.head()

Unnamed: 0,name,q_Imax,Imax_over_Imean,Imax_over_Imean_local,fluctuation_strength,low_q_ratio,high_q_ratio,Imax_over_Ilowq,Imax_over_Ihighq,Ilowq_over_Ihighq,...,b_s_16,b_s_17,b_s_18,b_s_19,b_s_20,bad_data,form,precursor,structure,str_label
0,R1_1stcool_0001_dz_bgsub,0.04,18.752705,1.03987,0.811895,0.997521,0.002479,2.115803,2701.70799,1276.918254,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
1,R1_1stcool_0002_dz_bgsub,0.04,18.734222,1.038783,0.783708,0.997605,0.002395,2.113128,2794.106192,1322.260858,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
2,R1_1stcool_0003_dz_bgsub,0.04,18.7007,1.038475,0.776179,0.997004,0.002996,2.113709,2229.34667,1054.708362,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
3,R1_1stcool_0004_dz_bgsub,0.04,18.811386,1.041716,0.70675,0.998057,0.001943,2.120315,3457.665126,1630.731639,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
4,R1_1stcool_0005_dz_bgsub,0.04,18.69533,1.038356,0.757558,0.99809,0.00191,2.108819,3494.979851,1657.316125,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_


In [34]:
# Summarize the reloaded table: column dtypes and non-null counts.
df_t2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 35 columns):
name                     1946 non-null object
q_Imax                   1946 non-null float64
Imax_over_Imean          1946 non-null float64
Imax_over_Imean_local    1946 non-null float64
fluctuation_strength     1946 non-null float64
low_q_ratio              1946 non-null float64
high_q_ratio             1946 non-null float64
Imax_over_Ilowq          1946 non-null float64
Imax_over_Ihighq         1946 non-null float64
Ilowq_over_Ihighq        1946 non-null float64
b_s_1                    1946 non-null float64
b_s_2                    1946 non-null float64
b_s_3                    1946 non-null float64
b_s_4                    1946 non-null float64
b_s_5                    1946 non-null float64
b_s_6                    1946 non-null float64
b_s_7                    1946 non-null float64
b_s_8                    1946 non-null float64
b_s_9                    1946 non-null float64
b