In [14]:
import os
import sys
from datetime import datetime
import keras

import argparse
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping, TensorBoard
from keras.optimizers import Adam, SGD, Adamax
import keras.backend as K

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from func import * 

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()

In [15]:
#### parser
def _build_parser():
    """Create the argument parser that holds every tunable setting of this notebook.

    Each option is one row of ``(short flag, long flag, type, default, help)``;
    rows are registered in the order listed, so ``--help`` output and the
    attribute names on the parsed namespace are the same as with one explicit
    ``add_argument`` call per option.
    """
    option_table = [
        ('-d', '--data', str, '/home/katieyth/gynecology/data/data_cmu_ctu.csv', 'data'),
        ('-s', '--model_save', str, '/home/katieyth/gynecology/model_save/', 'model save path'),
        ('-y', '--target', str, 'multi', 'prediction target'),
        # variability / UA / deceleration / management

        # input parameter
        ('-th', '--acceptable_zeros_threshold', float, 200, 'acceptable number of missing values in raw data'),
        ('-l', '--length', int, 600, 'length of input'),
        ('-ks', '--k_slice', int, 1, 'a input will be sliced into k_slice segments when testing'),
        ('-c', '--n_channel', int, 2, 'number of input channels'),
        ('-rn', '--random_noise', int, 0, 'add Gaussian noise (mean=0, std=0.01) into inputs'),
        ('-nm', '--normalized', int, 1, 'whether conduct channel-wise normalization'),
        ('-ctu_cmu', '--ctu_cmu', str, 'cmu', 'train_ctu_test_cmu'),

        # data augmentation
        ('-aug_fliplr', '--aug_fliplr', int, 0, 'reverse time series'),
        ('-shift', '--DA_Shift', int, 1, ''),
        ('-scale', '--DA_Scale', int, 1, ''),
        ('-randsamp', '--DA_RandSampling', int, 1, ''),

        # model parameters
        ('-struc', '--struc', str, 'mimic_previous_FHB', 'deeper or shallower'),
        ('-k', '--kernel_size', int, 3, 'kernel size'),
        ('-f', '--filters', int, 64, 'base number of filters'),
        ('-ly', '--layers', int, 5, 'number of residual layers'),
        ('-a', '--activation', str, 'relu', 'activation function'),
        ('-i', '--kernel_initializer', str, 'RandomNormal', 'kernel initialization method'),
        ('-l2', '--l2', float, 0.01, 'coefficient of l2 regularization'),

        # hyper-parameters
        ('-lr', '--learning_rate', float, 1e-4, 'learning_rate'),
        ('-reduce_lr_patience', '--reduce_lr_patience', int, 50, 'reduce_lr_patience'),
        ('-bs', '--batch_size', int, 27, 'batch_size'),
        ('-ep', '--epoch', int, 15, 'epoch'),
        ('-wb', '--weight_balance', int, 1, 'whether weight balancing or not'),
        ('-mntr', '--monitor', str, 'val_man_acc', 'val_acc or val_loss'),

        # runtime / bookkeeping
        ('-g', '--gpu_id', str, '7', 'GPU ID'),
        ('-rs', '--random_state', int, 13, 'random state when train_test_split'),
        ('-fn', '--summary_file', str, None, 'summary filename'),
    ]

    built = argparse.ArgumentParser()
    for short_flag, long_flag, value_type, default_value, help_text in option_table:
        built.add_argument(short_flag, long_flag, type=value_type, default=default_value, help=help_text)
    return built


parser = _build_parser()

# An empty argv is parsed on purpose: inside a notebook sys.argv belongs to the
# kernel launcher, so FLAG simply carries the defaults declared above.
FLAG = parser.parse_args([])
# Restrict CUDA to the selected device.
os.environ['CUDA_VISIBLE_DEVICES'] = FLAG.gpu_id

In [16]:
def data_preprocess_test(Xvalid, Yvalid, length=600):
    """Crop every validation recording to its first `length` samples and normalize it.

    Parameters
    ----------
    Xvalid : array indexed as [record, time, channel]
        Validation signals; each record needs at least `length` time steps.
    Yvalid : object with a ``.copy()`` method
        Validation labels; returned as ``Yvalid.copy()`` (for a list this is a
        shallow copy).
    length : int, default 600
        Number of leading time steps kept from each record.

    Returns
    -------
    (Xtest, Ytest)
        ``Xtest`` has shape ``(n_records, length, n_channels)``.

    Raises
    ------
    ValueError
        If the recordings are shorter than `length`.

    Notes
    -----
    The crop used to be hard-coded as ``0:600``, so any other `length` failed on
    a shape mismatch; the slice now follows `length`.
    `data_normalize` is not defined in this notebook -- presumably it comes from
    ``from func import *``; it is applied to one record at a time.
    """
    if Xvalid.shape[1] < length:
        raise ValueError(
            'recordings have %d time steps, fewer than the requested length %d'
            % (Xvalid.shape[1], length))
    Xtest = np.empty((Xvalid.shape[0], length, Xvalid.shape[2]))
    for i in range(Xvalid.shape[0]):
        Xtest[i, :, :] = data_normalize(Xvalid[i, 0:length, :])
    Ytest = Yvalid.copy()
    return Xtest, Ytest

In [17]:
def func_FHB2class(x):
    """Map an FHB value to a class label string.

    Values in the half-open range (110, 160] give "0"; everything else
    (<= 110, > 160, or a value that fails both comparisons such as NaN)
    gives "1".
    """
    inside_band = 110 < x <= 160
    return "0" if inside_band else "1"

In [18]:
### data preparing
def _signal_matrix(frame, tag):
    """Return the columns of `frame` whose name contains `tag` as a float array.

    Missing readings are filled row-wise (axis=1) by interpolation in both
    directions before conversion. The builtin `float` replaces the `np.float`
    alias, which no longer exists in current NumPy.
    """
    cols = [k for k in frame.columns if tag in k]
    return np.array(frame[cols].interpolate(limit_direction='both', axis=1), dtype=float)


d = pd.read_csv(FLAG.data)
d['FHB_class'] = d['FHB'].apply(func_FHB2class)


# replace 0 (no readings) with np.nan for later substitution
for k in d.columns:
    if 'b-' in k or 'm-' in k:
        print(k, end='\r')
        d.loc[d[k]==0, k] = np.nan

# choose dataset type
if FLAG.ctu_cmu == 'trans':
    # train on CTU, validate on CMU
    train_d = d[d['ID'].str.contains('CTU_')]
    valid_d = d[d['ID'].str.contains('CMU_')]
elif FLAG.ctu_cmu == 'mix':
    train_d, valid_d = train_test_split(d, test_size=0.3, random_state=FLAG.random_state, stratify=d['management'])
elif FLAG.ctu_cmu in ('cmu', 'ctu'):
    # Single-site hold-out split, stratified on the joint label of all targets.
    site_tag = 'CMU_' if FLAG.ctu_cmu == 'cmu' else 'CTU_'
    # .copy() makes dd an independent frame, so adding columns below does not
    # raise pandas' SettingWithCopyWarning.
    dd = d[d['ID'].str.contains(site_tag)].copy()
    dd['multi_label'] = (dd['management'].map(str) + '_' + dd['UA'].map(str) + '_'
                         + dd['variability'].map(str) + '_' + dd['deceleration'].map(str) + '_'
                         + dd['FHB_class'].map(str))
    dd['multi_label_count'] = dd.groupby('multi_label')['multi_label'].transform('count')
    # A label combination seen only once cannot be stratified: force it into training.
    dd_split = dd[dd['multi_label_count'] != 1]
    dd_force_train = dd[dd['multi_label_count'] == 1]
    train_d, valid_d = train_test_split(dd_split, test_size=0.3, random_state=FLAG.random_state, stratify=dd_split['multi_label'])
    train_d = pd.concat([train_d, dd_force_train])
elif FLAG.ctu_cmu == 'trans_inv':
    # train on CMU, validate on CTU
    train_d = d[d['ID'].str.contains('CMU_')]
    valid_d = d[d['ID'].str.contains('CTU_')]
else:
    # Without this branch an unknown value only surfaced later as a NameError on train_d.
    raise ValueError("unknown --ctu_cmu value: %r (expected 'trans', 'mix', 'cmu', 'ctu' or 'trans_inv')" % (FLAG.ctu_cmu,))

# interpolate missing values
train_db = _signal_matrix(train_d, 'b-')
train_dm = _signal_matrix(train_d, 'm-')
valid_db = _signal_matrix(valid_d, 'b-')
valid_dm = _signal_matrix(valid_d, 'm-')
# combine signals from baby and mom
Xtrain = np.stack([train_db, train_dm], axis=2)
Xvalid = np.stack([valid_db, valid_dm], axis=2)

# convert labels to one-hot encodings
# NOTE(review): 'management' is listed twice, so six label arrays are produced -- confirm the model expects that.
target_list = ['management', 'UA', 'variability', 'deceleration', 'FHB_class', 'management']
# NOTE(review): FHB_class is encoded with 4 classes although func_FHB2class only emits "0"/"1" -- confirm.
target_dict = {'management':3,
               'UA':2,
               'variability':2,
               'deceleration':4,
               'FHB_class':4}
Ytrain = [(keras.utils.to_categorical(np.array(train_d[c_name]),num_classes=target_dict[c_name])) for c_name in target_list]
Yvalid = [(keras.utils.to_categorical(np.array(valid_d[c_name]),num_classes=target_dict[c_name])) for c_name in target_list]

#weight balancing or not
weight_list = []
for idx, c_name in enumerate(target_list):
    if FLAG.weight_balance:
        y_integers = np.argmax(Ytrain[idx], axis=1)
        present_classes = np.unique(y_integers)
        # Keyword arguments work across scikit-learn versions (newer ones reject positional use).
        balanced = compute_class_weight(class_weight='balanced', classes=present_classes, y=y_integers)
        # Key by the real class id: enumerate() mislabels weights when a class is absent from training.
        class_weight = dict(zip(present_classes.tolist(), balanced))
    else:
        class_weight = {i: 1 for i in range(target_dict[c_name])}
    print('class weight: {0}'.format(class_weight))
    weight_list.append(class_weight)

Xtest, Ytest =data_preprocess_test(Xvalid, Yvalid)


if FLAG.aug_fliplr:
    # Append a time-reversed copy (axis 1) of every training series.
    Xtrain = np.vstack((Xtrain, Xtrain[:, ::-1, :]))
    # Ytrain is a list of per-target arrays with different widths, so each one
    # is duplicated on its own rather than stacking the whole list at once.
    Ytrain = [np.vstack((y_onehot, y_onehot)) for y_onehot in Ytrain]

print('train:', len(train_d))
print('test:', len(valid_d))

  interactivity=interactivity, compiler=compiler, result=result)


class weight: {0: 0.5232974910394266, 1: 1.0354609929078014, 2: 8.11111111111111}
class weight: {0: 5.615384615384615, 1: 0.5488721804511278}
class weight: {0: 0.5658914728682171, 1: 4.294117647058823}
class weight: {0: 0.4397590361445783, 1: 1.9210526315789473, 2: 0.8690476190476191, 3: 18.25}
class weight: {0: 0.5069444444444444, 1: 36.5}
class weight: {0: 0.5232974910394266, 1: 1.0354609929078014, 2: 8.11111111111111}
train: 146
test: 61


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# cross validation

In [109]:
# Tally how many rows of dd share each multi_label value.
dd.groupby('multi_label')['multi_label'].agg(['count'])

Unnamed: 0_level_0,count
multi_label,Unnamed: 1_level_1
0_0_0_2_0,1
0_1_0_0_0,14
0_1_0_0_1,1
0_1_0_1_0,21
0_1_0_2_0,130
0_1_0_2_1,8
1_0_0_0_0,1
1_0_0_2_0,5
1_0_1_0_0,1
1_1_0_2_0,77


## CTU

In [21]:
from sklearn.model_selection import StratifiedKFold

# Build 5 shuffled stratified folds from the CTU records and write one
# train/test CSV pair per fold.
# .copy() makes dd an independent frame, so adding the derived columns below
# does not raise pandas' SettingWithCopyWarning.
dd = d[d['ID'].str.contains('CTU_')].copy()
dd['multi_label'] = (dd['management'].map(str) + '_' + dd['UA'].map(str) + '_'
                     + dd['variability'].map(str) + '_' + dd['deceleration'].map(str) + '_'
                     + dd['FHB_class'].map(str))
dd['multi_label_count'] = dd.groupby('multi_label')['multi_label'].transform('count')
# Label combinations that occur once are kept out of the folds and always trained on.
dd_split = dd[dd['multi_label_count'] != 1]
dd_force_train = dd[dd['multi_label_count'] == 1]
# (forced sampling of management==2 rows is disabled in this cell)

X = dd_split.reset_index(drop=True)
# NOTE(review): folds are stratified on how often a joint label occurs, not on
# the joint label itself -- confirm this is intended.
y = X['multi_label_count']
skf = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
print(skf)

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print("[",fold,"]",len(train_index), len(test_index))
    # X carries a fresh 0..n-1 index, so positional selection picks the same rows
    # as an index-membership mask.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    print("[",fold,"]", len(X_train), len(X_test))
    train_d = pd.concat([X_train, dd_force_train])
    valid_d = X_test
    print("[",fold,"]", len(train_d), len(valid_d) , '\n')
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    print("=" *30)
    print("=" *30)
    print("=" *30)

    # NOTE(review): the file name encodes only the fold number, so any other run
    # that writes this pattern into ./data overwrites these files.
    train_d.to_csv('./data/5_fold_0%d_train.csv' %(fold),index=False)
    valid_d.to_csv('./data/5_fold_0%d_test.csv' %(fold),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
[ 1 ] 288 78
[ 1 ] 288 78
[ 1 ] 298 78 

            count
management       
0             139
1              86
2              73
            count
management       
0              36
1              23
2              19
[ 2 ] 290 76
[ 2 ] 290 76
[ 2 ] 300 76 

            count
management       
0             140
1              87
2              73
            count
management       
0              35
1              22
2              19
[ 3 ] 296 70
[ 3 ] 296 70
[ 3 ] 306 70 

            count
management       
0             140
1              89
2              77
            count
management       
0              35
1              20
2              15
[ 4 ] 294 72
[ 4 ] 294 72
[ 4 ] 304 72 

            count
management       
0             141
1              88
2              75
            count
management       
0              34
1              21
2              17
[ 5 ] 296 70
[ 5 ] 296 70
[ 5 ] 306 70 

            coun

## CMU

In [201]:
from sklearn.model_selection import StratifiedKFold

# Build 5 shuffled stratified folds (seed 13) from the CMU records and write one
# train/test CSV pair per fold.
# .copy() makes dd an independent frame, so adding the derived columns below
# does not raise pandas' SettingWithCopyWarning.
dd = d[d['ID'].str.contains('CMU_')].copy()
dd['multi_label'] = (dd['management'].map(str) + '_' + dd['UA'].map(str) + '_'
                     + dd['variability'].map(str) + '_' + dd['deceleration'].map(str) + '_'
                     + dd['FHB_class'].map(str))
dd['multi_label_count'] = dd.groupby('multi_label')['multi_label'].transform('count')
# Label combinations that occur once are kept out of the folds and always trained on.
dd_split = dd[dd['multi_label_count'] != 1]
dd_force_train = dd[dd['multi_label_count'] == 1]
# management==2 rows are pulled out of the stratified folds and placed by hand
# inside the loop (two of them go to each test fold).
dd_split_force_to_sample = dd_split[dd_split['management'] == 2]
dd_split = dd_split[dd_split['management'] != 2]

X = dd_split.reset_index(drop=True)
# NOTE(review): folds are stratified on how often a joint label occurs, not on
# the joint label itself -- confirm this is intended.
y = X['multi_label_count']
skf = StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
print(skf)

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print("[",fold,"]",len(train_index), len(test_index))
    # X carries a fresh 0..n-1 index, so positional selection picks the same rows
    # as an index-membership mask.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    print("[",fold,"]", len(X_train), len(X_test))
    train_d = pd.concat([X_train, dd_force_train])
    valid_d = X_test
    print("[",fold,"]", len(train_d), len(valid_d) , '\n')
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    # Two management==2 rows go to the test fold, the remainder to training;
    # the draw is seeded with the fold number.
    sample_to_test = dd_split_force_to_sample.sample(n=2, random_state=fold)
    sample_to_train = dd_split_force_to_sample[~dd_split_force_to_sample['ID'].isin(sample_to_test['ID'])]
    print(sample_to_test.index)
    print(sample_to_train.index)
    train_d = pd.concat([train_d, sample_to_train])
    valid_d = pd.concat([valid_d, sample_to_test])
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    print("=" *30)
    print("=" *30)
    print("=" *30)

    # NOTE(review): the file name encodes only the fold number, so any other run
    # that writes this pattern into ./data overwrites these files.
    train_d.to_csv('./data/5_fold_0%d_train.csv' %(fold),index=False)
    valid_d.to_csv('./data/5_fold_0%d_test.csv' %(fold),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


StratifiedKFold(n_splits=5, random_state=13, shuffle=True)
[ 1 ] 153 43
[ 1 ] 153 43
[ 1 ] 159 43 

            count
management       
0             105
1              52
2               2
            count
management       
0              28
1              15
Int64Index([140, 93], dtype='int64')
Int64Index([92, 194, 195], dtype='int64')
            count
management       
0             105
1              52
2               5
            count
management       
0              28
1              15
2               2
[ 2 ] 156 40
[ 2 ] 156 40
[ 2 ] 162 40 

            count
management       
0             105
1              55
2               2
            count
management       
0              28
1              12
Int64Index([140, 195], dtype='int64')
Int64Index([92, 93, 194], dtype='int64')
            count
management       
0             105
1              55
2               5
            count
management       
0              28
1              12
2               2
[ 3 ] 158 38
[ 3 

In [202]:
from sklearn.model_selection import StratifiedKFold

# Build 5 shuffled stratified folds (seed 14) from the CMU records and write one
# train/test CSV pair per fold.
# .copy() makes dd an independent frame, so adding the derived columns below
# does not raise pandas' SettingWithCopyWarning.
dd = d[d['ID'].str.contains('CMU_')].copy()
dd['multi_label'] = (dd['management'].map(str) + '_' + dd['UA'].map(str) + '_'
                     + dd['variability'].map(str) + '_' + dd['deceleration'].map(str) + '_'
                     + dd['FHB_class'].map(str))
dd['multi_label_count'] = dd.groupby('multi_label')['multi_label'].transform('count')
# Label combinations that occur once are kept out of the folds and always trained on.
dd_split = dd[dd['multi_label_count'] != 1]
dd_force_train = dd[dd['multi_label_count'] == 1]
# management==2 rows are pulled out of the stratified folds and placed by hand
# inside the loop (two of them go to each test fold).
dd_split_force_to_sample = dd_split[dd_split['management'] == 2]
dd_split = dd_split[dd_split['management'] != 2]

X = dd_split.reset_index(drop=True)
# NOTE(review): folds are stratified on how often a joint label occurs, not on
# the joint label itself -- confirm this is intended.
y = X['multi_label_count']
skf = StratifiedKFold(n_splits=5, random_state=14, shuffle=True)
print(skf)

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print("[",fold,"]",len(train_index), len(test_index))
    # X carries a fresh 0..n-1 index, so positional selection picks the same rows
    # as an index-membership mask.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    print("[",fold,"]", len(X_train), len(X_test))
    train_d = pd.concat([X_train, dd_force_train])
    valid_d = X_test
    print("[",fold,"]", len(train_d), len(valid_d) , '\n')
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    # Two management==2 rows go to the test fold, the remainder to training;
    # the draw is seeded with the fold number.
    sample_to_test = dd_split_force_to_sample.sample(n=2, random_state=fold)
    sample_to_train = dd_split_force_to_sample[~dd_split_force_to_sample['ID'].isin(sample_to_test['ID'])]
    print(sample_to_test.index)
    print(sample_to_train.index)
    train_d = pd.concat([train_d, sample_to_train])
    valid_d = pd.concat([valid_d, sample_to_test])
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    print("=" *30)
    print("=" *30)
    print("=" *30)

    # NOTE(review): the file name encodes only the fold number, so any other run
    # that writes this pattern into ./data overwrites these files.
    train_d.to_csv('./data/5_fold_0%d_train.csv' %(fold),index=False)
    valid_d.to_csv('./data/5_fold_0%d_test.csv' %(fold),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


StratifiedKFold(n_splits=5, random_state=14, shuffle=True)
[ 1 ] 153 43
[ 1 ] 153 43
[ 1 ] 159 43 

            count
management       
0             105
1              52
2               2
            count
management       
0              28
1              15
Int64Index([140, 93], dtype='int64')
Int64Index([92, 194, 195], dtype='int64')
            count
management       
0             105
1              52
2               5
            count
management       
0              28
1              15
2               2
[ 2 ] 155 41
[ 2 ] 155 41
[ 2 ] 161 41 

            count
management       
0             105
1              54
2               2
            count
management       
0              28
1              13
Int64Index([140, 195], dtype='int64')
Int64Index([92, 93, 194], dtype='int64')
            count
management       
0             105
1              54
2               5
            count
management       
0              28
1              13
2               2
[ 3 ] 158 38
[ 3 

In [19]:
from sklearn.model_selection import StratifiedKFold

# Build 5 shuffled stratified folds (seed 15) from the CMU records and write one
# train/test CSV pair per fold.
# .copy() makes dd an independent frame, so adding the derived columns below
# does not raise pandas' SettingWithCopyWarning.
dd = d[d['ID'].str.contains('CMU_')].copy()
dd['multi_label'] = (dd['management'].map(str) + '_' + dd['UA'].map(str) + '_'
                     + dd['variability'].map(str) + '_' + dd['deceleration'].map(str) + '_'
                     + dd['FHB_class'].map(str))
dd['multi_label_count'] = dd.groupby('multi_label')['multi_label'].transform('count')
# Label combinations that occur once are kept out of the folds and always trained on.
dd_split = dd[dd['multi_label_count'] != 1]
dd_force_train = dd[dd['multi_label_count'] == 1]
# management==2 rows are pulled out of the stratified folds and placed by hand
# inside the loop (two of them go to each test fold).
dd_split_force_to_sample = dd_split[dd_split['management'] == 2]
dd_split = dd_split[dd_split['management'] != 2]

X = dd_split.reset_index(drop=True)
# NOTE(review): folds are stratified on how often a joint label occurs, not on
# the joint label itself -- confirm this is intended.
y = X['multi_label_count']
skf = StratifiedKFold(n_splits=5, random_state=15, shuffle=True)
print(skf)

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print("[",fold,"]",len(train_index), len(test_index))
    # X carries a fresh 0..n-1 index, so positional selection picks the same rows
    # as an index-membership mask.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    print("[",fold,"]", len(X_train), len(X_test))
    train_d = pd.concat([X_train, dd_force_train])
    valid_d = X_test
    print("[",fold,"]", len(train_d), len(valid_d) , '\n')
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    # Two management==2 rows go to the test fold, the remainder to training;
    # the draw is seeded with the fold number.
    sample_to_test = dd_split_force_to_sample.sample(n=2, random_state=fold)
    sample_to_train = dd_split_force_to_sample[~dd_split_force_to_sample['ID'].isin(sample_to_test['ID'])]
    print(sample_to_test.index)
    print(sample_to_train.index)
    train_d = pd.concat([train_d, sample_to_train])
    valid_d = pd.concat([valid_d, sample_to_test])
    print(train_d['management'].groupby(train_d['management']).agg(['count']))
    print(valid_d['management'].groupby(valid_d['management']).agg(['count']))

    print("=" *30)
    print("=" *30)
    print("=" *30)

    # NOTE(review): the file name encodes only the fold number, so any other run
    # that writes this pattern into ./data overwrites these files.
    train_d.to_csv('./data/5_fold_0%d_train.csv' %(fold),index=False)
    valid_d.to_csv('./data/5_fold_0%d_test.csv' %(fold),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


StratifiedKFold(n_splits=5, random_state=15, shuffle=True)
[ 1 ] 154 42
[ 1 ] 154 42
[ 1 ] 160 42 

            count
management       
0             105
1              53
2               2
            count
management       
0              28
1              14
Int64Index([140, 93], dtype='int64')
Int64Index([92, 194, 195], dtype='int64')
            count
management       
0             105
1              53
2               5
            count
management       
0              28
1              14
2               2
[ 2 ] 156 40
[ 2 ] 156 40
[ 2 ] 162 40 

            count
management       
0             105
1              55
2               2
            count
management       
0              28
1              12
Int64Index([140, 195], dtype='int64')
Int64Index([92, 93, 194], dtype='int64')
            count
management       
0             105
1              55
2               5
            count
management       
0              28
1              12
2               2
[ 3 ] 158 38
[ 3 