In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn import metrics
import seaborn as sns
import random

from os import listdir
from os.path import isfile, isdir, join

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

import csv
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

In [34]:
df_work = pd.read_csv('labels_and_features_paws_with_exper.csv')
df_work.head()

Unnamed: 0,experiment,name,q_Imax,Imax_over_Imean,Imax_over_Ilowq,Imax_over_Ihighq,Imax_sharpness,low_q_ratio,high_q_ratio,log_fluctuation,...,95,96,97,98,99,bad_data,form,precursor,structure,str_label
0,R1,R1/R1_1stcool_0001_dz_bgsub,0.04,18.752705,12.085269,2701.70799,1.03987,0.997474,0.002511,7.17144,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
1,R1,R1/R1_1stcool_0002_dz_bgsub,0.04,18.734222,12.072337,2794.106192,1.038783,0.997568,0.002422,7.817512,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
2,R1,R1/R1_1stcool_0003_dz_bgsub,0.04,18.7007,12.058003,2229.34667,1.038475,0.996952,0.003033,5.276569,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
3,R1,R1/R1_1stcool_0004_dz_bgsub,0.04,18.811386,12.116577,3457.665126,1.041716,0.998025,0.001965,7.522117,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_
4,R1,R1/R1_1stcool_0005_dz_bgsub,0.04,18.69533,12.041431,3494.979851,1.038356,0.998059,0.001935,8.932939,...,0.0,0.0,0.0,0.0,0.0,False,True,False,False,form_


In [35]:
df_work.shape

(1946, 115)

In [36]:
# list of experiments
sub_dirs = [f for f in listdir('2016_saxs_data') if isdir(join('2016_saxs_data', f))]
sub_dirs

['R1',
 'R12',
 'R13',
 'R2',
 'R3',
 'R4',
 'R5',
 'R6',
 'R7',
 'Reaction_A',
 'Reaction_B',
 'Reaction_C',
 'Reaction_D',
 'Reaction_E',
 'Reaction_G',
 'Reaction_H']

In [37]:
shuffled_rows = np.random.permutation(df_work.index)
df_work = df_work.loc[shuffled_rows]

In [38]:
by_exp[10].head()

Unnamed: 0,experiment,name,q_Imax,Imax_over_Imean,Imax_over_Ilowq,Imax_over_Ihighq,Imax_sharpness,low_q_ratio,high_q_ratio,log_fluctuation,...,95,96,97,98,99,bad_data,form,precursor,structure,str_label
1613,Reaction_B,Reaction_B/B_90C_0212153333_0001_dz_bgsub,0.04,4.041671,3.05323,9.685882,1.073897,0.849986,0.14886,5.373821,...,0.0,0.0,0.0,0.0,0.0,False,False,True,False,precursor_
1575,Reaction_B,Reaction_B/B_216C_0212154915_0001_dz_bgsub,0.548,1.183388,1.288791,1.031533,1.020713,0.589912,0.408114,3.763672,...,0.0,0.0,0.0,0.0,0.0,True,False,False,False,bad_data
1570,Reaction_B,Reaction_B/B_205C_0212154525_0001_dz_bgsub,0.041,3.542007,2.780201,6.989246,1.226708,0.817933,0.180587,25.021186,...,0.0,0.0,0.0,0.0,0.0,False,False,True,False,precursor_
1568,Reaction_B,Reaction_B/B_200C_0212154406_0001_dz_bgsub,0.041,3.376568,2.751797,5.710163,1.249683,0.787431,0.21065,49.684293,...,0.0,0.0,0.0,0.0,0.0,False,False,True,False,precursor_
1593,Reaction_B,Reaction_B/B_240C_7min_0212160118_0001_dz_bgsub,0.097,54.194364,35.055062,3143.679496,3.064042,0.9938,0.006129,5.521469,...,0.0,0.0,0.0,0.0,0.0,False,True,True,True,form_precursor_structure


In [39]:
features_analytical = ['q_Imax', 'Imax_over_Imean', 'Imax_over_Ilowq',
       'Imax_over_Ihighq', 'Imax_sharpness', 'low_q_ratio', 'high_q_ratio',
       'log_fluctuation']

features60 = ['0',
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25',
       '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37',
       '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49',
       '50', '51', '52', '53', '54', '55', '56', '57', '58', '59'] 
        # since q is in range 0 - 0.6, we have only zeros in "60" - "99"

features_analytical_and_60 = ['q_Imax', 'Imax_over_Imean', 'Imax_over_Ilowq',
       'Imax_over_Ihighq', 'Imax_sharpness', 'low_q_ratio', 'high_q_ratio',
       'log_fluctuation','0',
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13',
       '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25',
       '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37',
       '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49',
       '50', '51', '52', '53', '54', '55', '56', '57', '58', '59' ]

features = [features_analytical, features60,  features_analytical_and_60]

## Bad Data

In [42]:
test_scores_by_ex = []
test_scores_by_ex_weighted = []
for d in sub_dirs:
    print("Experiment ", d)
    train = df_work[df_work['experiment']!= d]
    test = df_work[df_work['experiment']== d]
    print('train.shape = ', train.shape)
    print('test.shape = ', test.shape)

    scaler = preprocessing.StandardScaler()
    scaler.fit(train[features_analytical_and_60])
    log = linear_model.SGDClassifier(alpha= 0.01, l1_ratio= 0.15, loss= 'log', penalty= 'elasticnet')

    scores = cross_val_score(log, scaler.transform(train[features_analytical_and_60]), train['bad_data'], cv=5)
    print("cross validation score: ", scores.mean())

    log.fit(scaler.transform(train[features_analytical_and_60]), train['bad_data'])
    test_score = log.score(scaler.transform(test[features_analytical_and_60]), test['bad_data'])
    print('test score: ', test_score)
    test_scores_by_ex.append(test_score)
    test_scores_by_ex_weighted.append(test_score * len(test) / len(df_work))
    print()
    
print('Not weighted average test score: ', np.mean(test_scores_by_ex))
print('Weighted average test score: ', np.sum(test_scores_by_ex_weighted))

Experiment  R1
train.shape =  (1653, 115)
test.shape =  (293, 115)
cross validation score:  0.997581250572
test score:  0.931740614334

Experiment  R12
train.shape =  (1640, 115)
test.shape =  (306, 115)
cross validation score:  0.993292682927
test score:  0.996732026144

Experiment  R13
train.shape =  (1843, 115)
test.shape =  (103, 115)
cross validation score:  0.992398660591
test score:  0.980582524272

Experiment  R2
train.shape =  (1826, 115)
test.shape =  (120, 115)
cross validation score:  0.990690895301
test score:  1.0

Experiment  R3
train.shape =  (1801, 115)
test.shape =  (145, 115)
cross validation score:  0.990561694149
test score:  1.0

Experiment  R4
train.shape =  (1819, 115)
test.shape =  (127, 115)
cross validation score:  0.991756670063
test score:  1.0

Experiment  R5
train.shape =  (1763, 115)
test.shape =  (183, 115)
cross validation score:  0.996595728738
test score:  0.939890710383

Experiment  R6
train.shape =  (1839, 115)
test.shape =  (107, 115)
cross valida

## Form

In [43]:
# now we will use only "good data"
df_work = df_work[df_work['bad_data']==False]

In [46]:
test_scores_by_ex = []
test_scores_by_ex_weighted = []

for d in sub_dirs:
    print("Experiment ", d)
    train_good = df_work[df_work['experiment']!= d]
    test_good = df_work[df_work['experiment']== d]
   
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_good[features60])
    log = linear_model.SGDClassifier(alpha= 0.001, l1_ratio= 0, loss= 'log', penalty= 'l2')

    scores = cross_val_score(log, scaler.transform(train_good[features60]), train_good['form'], cv=5)
    print("cross validation score: ", scores.mean())

    log.fit(scaler.transform(train_good[features60]), train_good['form'])
    test_score = log.score(scaler.transform(test_good[features60]), test_good['form'])
    print("test score: ", test_score)
    test_scores_by_ex.append(test_score)
    test_scores_by_ex_weighted.append(test_score * len(test_good) / len(df_work))
    print()
    
print('Not weighted average test score: ', np.mean(test_scores_by_ex))
print('Weighted average test score: ', np.sum(test_scores_by_ex_weighted))

Experiment  R1
cross validation score:  0.996719140345
test score:  1.0

Experiment  R12
cross validation score:  0.996164073135
test score:  1.0

Experiment  R13
cross validation score:  0.995899572979
test score:  1.0

Experiment  R2
cross validation score:  0.994592785557
test score:  0.991666666667

Experiment  R3
cross validation score:  0.989611769971
test score:  0.986206896552

Experiment  R4
cross validation score:  0.995774032687
test score:  1.0

Experiment  R5
cross validation score:  0.995007788162
test score:  0.994535519126

Experiment  R6
cross validation score:  0.995835067984
test score:  1.0

Experiment  R7
cross validation score:  0.996480938416
test score:  0.9875

Experiment  Reaction_A
cross validation score:  0.996482653359
test score:  1.0

Experiment  Reaction_B
cross validation score:  0.973257499157
test score:  1.0

Experiment  Reaction_C
cross validation score:  0.984695440306
test score:  1.0

Experiment  Reaction_D
cross validation score:  0.996501457726

## Precursor

In [47]:
test_scores_by_ex = []
test_scores_by_ex_weighted = []

for d in sub_dirs:
    print("Experiment ", d)
    train_good = df_work[df_work['experiment']!= d]
    test_good = df_work[df_work['experiment']== d]
   
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_good[features_analytical_and_60])
    log = linear_model.SGDClassifier(alpha= 0.001, l1_ratio= 0, loss= 'log', penalty= 'none')

    scores = cross_val_score(log, scaler.transform(train_good[features_analytical_and_60]), train_good['precursor'], cv=5)
    print("cross validation score: ", scores.mean())

    log.fit(scaler.transform(train_good[features_analytical_and_60]), train_good['precursor'])
    test_score = log.score(scaler.transform(test_good[features_analytical_and_60]), test_good['precursor'])
    print("test score: ", test_score)
    test_scores_by_ex.append(test_score)
    test_scores_by_ex_weighted.append(test_score * len(test_good) / len(df_work))
    print()
    
print('Not weighted average test score: ', np.mean(test_scores_by_ex))
print('Weighted average test score: ', np.sum(test_scores_by_ex_weighted))

Experiment  R1
cross validation score:  0.931161977589
test score:  0.984555984556

Experiment  R12
cross validation score:  0.938668100009
test score:  0.552511415525

Experiment  R13
cross validation score:  0.93323200042
test score:  0.909090909091

Experiment  R2
cross validation score:  0.945337838379
test score:  0.408333333333

Experiment  R3
cross validation score:  0.944474901171
test score:  0.351724137931

Experiment  R4
cross validation score:  0.949927201252
test score:  0.913385826772

Experiment  R5
cross validation score:  0.931300623053
test score:  0.96174863388

Experiment  R6
cross validation score:  0.935172036174
test score:  0.922330097087

Experiment  R7
cross validation score:  0.931350406088
test score:  0.7

Experiment  Reaction_A
cross validation score:  0.927977308564
test score:  0.526315789474

Experiment  Reaction_B
cross validation score:  0.926209976407
test score:  0.888888888889

Experiment  Reaction_C
cross validation score:  0.929248172642
test sco

## Structure

In [48]:
test_scores_by_ex = []
test_scores_by_ex_weighted = []

for d in sub_dirs:
    print("Experiment ", d)
    train_good = df_work[df_work['experiment']!= d]
    test_good = df_work[df_work['experiment']== d]
   
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_good[features_analytical_and_60])
    log = linear_model.SGDClassifier(alpha= 0.0001, l1_ratio= 0.95, loss= 'log', penalty= 'none')

    scores = cross_val_score(log, scaler.transform(train_good[features_analytical_and_60]), train_good['structure'], cv=5)
    print("cross validation score: ", scores.mean())

    log.fit(scaler.transform(train_good[features_analytical_and_60]), train_good['structure'])
    test_score = log.score(scaler.transform(test_good[features_analytical_and_60]), test_good['structure'])
    print("test score: ", test_score)
    test_scores_by_ex.append(test_score)
    test_scores_by_ex_weighted.append(test_score * len(test_good) / len(df_work))
    print()
    
print('Not weighted average test score: ', np.mean(test_scores_by_ex))
print('Weighted average test score: ', np.sum(test_scores_by_ex_weighted))

Experiment  R1
cross validation score:  0.982286312857
test score:  0.972972972973

Experiment  R12
cross validation score:  0.98147365326
test score:  0.977168949772

Experiment  R13
cross validation score:  0.978906149808
test score:  1.0

Experiment  R2
cross validation score:  0.965756720576
test score:  0.908333333333

Experiment  R3
cross validation score:  0.960874859411
test score:  0.455172413793

Experiment  R4
cross validation score:  0.943340734539
test score:  1.0

Experiment  R5
cross validation score:  0.986255841121
test score:  0.879781420765

Experiment  R6
cross validation score:  0.984530811394
test score:  0.990291262136

Experiment  R7
cross validation score:  0.983563912368
test score:  0.9875

Experiment  Reaction_A
cross validation score:  0.972457976789
test score:  1.0

Experiment  Reaction_B
cross validation score:  0.987214357937
test score:  0.984126984127

Experiment  Reaction_C
cross validation score:  0.959890681936
test score:  1.0

Experiment  Reactio