In [1]:
!pip install pyedflib
!pip install lightgbm
!pip install tqdm
!pip install scikit-learn
!pip install matplotlib
!pip install pandas




In [2]:
import numpy as np
import pyedflib
import os 
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from scipy.interpolate import interp1d
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from sklearn.metrics import confusion_matrix,accuracy_score
import lightgbm

%matplotlib inline


# Sub-directory (appended to base_path) that is walked for the .edf recordings.
edf_path = './edf/'
# Sub-directory (appended to base_path) that holds the matching .ans annotation files.
annot_path = './annotation/'
# Root directory of the dataset; the two sub-paths above are concatenated to it.
base_path = './data/MIT-BIH/'

# Data Preparation

In [3]:
def preprocessing(data) : 
    """Scale ``data`` by the upper Tukey fence (Q3 + 1.5 * IQR) of its values.

    The lower and upper fences are printed as a diagnostic, exactly as before.

    Parameters
    ----------
    data : array-like
        Numeric samples. Lists are accepted and converted to a float array.
        NaN values are ignored when the quartiles are computed.

    Returns
    -------
    list
        A one-element list holding ``data / (Q3 + 1.5 * IQR)`` as an ndarray
        (the list wrapper is kept for backward compatibility).

    Notes
    -----
    If the upper fence is 0 the division yields inf/nan, as in the original code.
    """
    values = np.asarray(data, dtype=float)

    # Quartiles straight from NumPy: avoids the DataFrame round trip and the
    # deprecated float(Series) conversion, with the same linear interpolation.
    q1, q3 = (float(v) for v in np.nanpercentile(values, [25, 75]))
    iqr = q3 - q1

    allow_min = q1 - 1.5 * iqr
    allow_max = q3 + 1.5 * iqr
    print(allow_min, allow_max)

    return [values / allow_max]

def std_4sigma(data, n_sigma=3) : 
    """Clip ``data`` to ``mean +/- n_sigma * std`` and return the clipped copy.

    Despite the function name, the historical (and default) limit is 3 standard
    deviations; ``n_sigma`` makes the limit explicit and adjustable.

    Parameters
    ----------
    data : array-like
        Numeric samples.
    n_sigma : float, optional
        Number of standard deviations kept on each side of the mean (default 3).

    Returns
    -------
    numpy.ndarray
        A new float array with outliers replaced by the nearest limit.
        The input is never modified.
    """
    values = np.asarray(data, dtype=float)
    center = np.mean(values)
    spread = np.std(values)

    # With NaN/inf statistics no sample compares above/below the limits, so the
    # data is returned unchanged (np.clip would otherwise propagate NaN).
    if not (np.isfinite(center) and np.isfinite(spread)) :
        return values.copy()

    return np.clip(values, center - n_sigma * spread, center + n_sigma * spread)

def normalized_data(data, fs=360.0, lowpass_hz=15.0, highpass_hz=5.0) : 
    """Band-limit, outlier-clip and rescale one ECG signal.

    Steps:
      1. zero-phase first-order Butterworth low-pass, then high-pass;
      2. outlier clipping through ``std_4sigma`` (defined in this notebook);
      3. mean removal and scaling so that the value range spans 2 units.

    Parameters
    ----------
    data : array-like
        1-D signal samples.
    fs : float, optional
        Sampling frequency in Hz used to normalise the cut-offs (default 360).
    lowpass_hz, highpass_hz : float, optional
        Cut-off frequencies in Hz of the two filters (defaults 15 and 5).

    Returns
    -------
    numpy.ndarray
        The filtered, clipped and rescaled signal. A completely flat signal is
        returned mean-centred (all zeros) instead of dividing by zero.
    """
    import scipy.signal

    nyquist = fs / 2.0
    lowpass = scipy.signal.butter(1, lowpass_hz / nyquist, 'low')
    highpass = scipy.signal.butter(1, highpass_hz / nyquist, 'high')
    # TODO: Could use an actual bandpass filter
    ecg_low = scipy.signal.filtfilt(*lowpass, x=data)
    data = scipy.signal.filtfilt(*highpass, x=ecg_low)

    data = std_4sigma(data)

    # ndarray reductions instead of the builtin max()/min(), which iterate the
    # whole signal element by element in Python.
    max_v = np.max(data)
    min_v = np.min(data)
    span = max_v - min_v

    # Diagnostic for signals whose amplitude is still unusually large.
    if span > 3 : 
        print(max_v, min_v)

    ave = np.average(data)

    if span == 0 :
        return data - ave

    return (data - ave) / span * 2


                                                       

In [4]:
# Per-record containers, keyed by the record id taken from the file name:
#   sigbufs_data -> samples of the MLII channel
#   peaks_data   -> annotated sample positions (kept as strings)
#   labels_data  -> annotation symbols
sigbufs_data={}
peaks_data={}
labels_data={}

# Channel labels under which the wanted lead is looked up.
mlii_labels = ('MLII', 'ML2')

for (path, _dirs, files) in os.walk(base_path+edf_path):
    for filename in files:
        if os.path.splitext(filename)[-1] != '.edf':
            continue

        # os.path.join keeps the path valid for files found in sub-directories.
        edf_file = os.path.join(path, filename)
        file_num = filename.split('.')[0]

        # None (not a string sentinel) marks "channel not found": comparing an
        # ndarray with a string is ambiguous and fails on current NumPy.
        sigbufs = None
        with pyedflib.EdfReader(edf_file) as f:
            for i in range(f.signals_in_file) : 
                if f.getLabel(i) in mlii_labels : 
                    sigbufs = f.readSignal(i)
        if sigbufs is None : 
            print("cannot find MLII in " + str(edf_file))
            continue
        sigbufs_data[file_num] = sigbufs

        seq_num=[]
        labels=[]
        # The annotation file is only opened for usable records and is closed
        # deterministically by the with-block.
        with open(base_path+annot_path+file_num+'.ans') as ann:
            for line in ann:
                sep = line.split()
                if len(sep) < 3 :
                    # blank or truncated line: no position/symbol to read
                    continue
                seq_num.append(sep[1])
                labels.append(sep[2])
        peaks_data[file_num] = seq_num
        labels_data[file_num] = labels

print(sigbufs_data['100'][0:20])
print(peaks_data['100'][0:20])
print(labels_data['100'][0:20])
            



cannot find MLII in ./data/MIT-BIH/./edf/102.edf
cannot find MLII in ./data/MIT-BIH/./edf/104.edf
[-0.145 -0.145 -0.145 -0.145 -0.145 -0.145 -0.145 -0.145 -0.12  -0.135
 -0.145 -0.15  -0.16  -0.155 -0.16  -0.175 -0.18  -0.185 -0.17  -0.155]
['18', '77', '370', '662', '946', '1231', '1515', '1809', '2044', '2402', '2706', '2998', '3282', '3560', '3862', '4170', '4466', '4764', '5060', '5346']
['+', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']


In [5]:
# Show which record ids were loaded, followed by how many there are.
print(sigbufs_data.keys())
print(len(sigbufs_data))

dict_keys(['100', '101', '103', '105', '106', '107', '108', '109', '111', '112', '113', '114', '115', '116', '117', '118', '119', '121', '122', '123', '124', '200', '201', '202', '203', '205', '207', '208', '209', '210', '212', '213', '214', '215', '217', '219', '220', '221', '222', '223', '228', '230', '231', '232', '233', '234'])
46


### Split into train data and test data

In [6]:
# Earlier hold-out split by hand-picked test records (disabled, kept for reference):
# test_person=['105','109','118','119','200','202','210','214','221','223']
# train_person = []
# for item in list(sigbufs_data.keys()):
#     if item not in test_person : 
#         train_person.append(item)
        
# print(train_person)
# Record split actually used below: ds1 feeds the training table, ds2 the test table.
# Source: Chap. 7 in "Classification Techniques for Medical Image.." by Nilanjan Dey
ds1=['101','106','108','109','112','114','115','116','118','119','122','124','201','203','205','207','208','209','215','220','223','230']
ds2=['100','103','105','111','113','117','121','123','200','202','210','212','213','214','219','221','222','228','231','232','233','234']


In [7]:
# for key in sigbufs_data.keys() :
#     for peak in peaks_data[key] : 
#          print(str(peak), sigbufs_data[key][int(peak) - 12 :int(peak) + 12 ])

### Map beat labels to numeric classes (0: N, 1: S, 2: V, 3: F, 4: Q)

In [8]:
# label_change = {'N' : 0, 'L' : 0, 'R' : 0, 
#                 'A' : 1 , 'a' : 1, 'J' : 1, 'S' : 1 , 'e' : 1 , 'j' : 1 , 'n' : 1 ,
#                 'V' : 2, 'r' : 2, 'E' : 2, '!' : 2, 
#                 'F' : 3, 
#                 '/' : 4, 'f':4, 'Q': 4,
#                 '+' : -1, '~' : -1 , '|' : -1, 'x' : -1 , '[' : -1, ']' : -1 , '"': -1 }

In [9]:
# Maps every annotation symbol to a numeric class code.
#    0..4 : beat classes that end up in the 'annot' column of the feature tables
#    5    : beats that the feature-extraction loops skip (`annots[idx] == 5`)
#   -1    : annotations that are removed when new_peaks / new_label are built
# NOTE(review): 'e' and 'j' are mapped to class 1 here, whereas the AAMI grouping
# is usually described with them in class 0 (N) -- confirm this is intentional.
label_change = {'N' : 0, 'L' : 0, 'R' : 0, 
                'A' : 1 , 'a' : 1, 'J' : 1, 'S' : 1 , 'e' : 1 , 'j' : 1 , 'n' : 5 ,
                'V' : 2, 'r' : 5, 'E' : 2, '!' : 5, 
                'F' : 3, 
                '/' : 4, 'f':4, 'Q': 4, 'u' : 4, 'P' : 4,
                '+' : -1, '~' : -1 , '|' : -1, 'x' : -1 , '[' : -1, ']' : -1 , '"': -1 }

In [10]:
# Translate every annotation symbol into its numeric class code.
# Each symbol is looked up as a whole string: iterating over the characters of a
# label would emit several codes for a multi-character symbol and silently break
# the one-to-one alignment with peaks_data. Unknown symbols raise KeyError.
change_label = {
    key: [label_change[label] for label in record_labels]
    for key, record_labels in labels_data.items()
}

# Preview the first 20 symbols and their codes for every record.
for key in labels_data.keys():
    print(labels_data[key][0:20])
    print(change_label[key][0:20])

['+', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'A', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['+', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['+', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['+', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['~', '+', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N']
[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['+', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/']
[-1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
['+', 'N', 'N', 'N', 

# Pandas Preparation

In [11]:
# Drop every annotation whose class code is -1, keeping positions and codes
# aligned per record.
new_peaks={}
new_label={}
for key in peaks_data.keys():
    record_peaks = peaks_data[key]
    record_codes = change_label[key]
    if len(record_peaks) != len(record_codes) : 
        # Fail loudly: silently stopping here would leave this and all later
        # records missing and only surface as a KeyError in a later cell.
        raise ValueError(
            "record %s: %d peak positions but %d labels"
            % (key, len(record_peaks), len(record_codes))
        )
    kept = [(peak, code) for peak, code in zip(record_peaks, record_codes) if code != -1]
    new_peaks[key] = [peak for peak, _ in kept]
    new_label[key] = [code for _, code in kept]

In [12]:
# Per record: number of kept annotations, number of raw annotations, and a marker
# that is "O" when nothing was dropped and "X" otherwise.
for key in peaks_data.keys():
    kept_count = len(new_peaks[key])
    raw_count = len(peaks_data[key])
    marker = "O" if kept_count == raw_count else "X"
    print(kept_count, raw_count, marker)
    

2273 2274 X
1865 1874 X
2084 2091 X
2572 2691 X
2027 2098 X
2137 2140 X
1763 1824 X
2532 2535 X
2124 2133 X
2539 2550 X
1795 1796 X
1879 1890 X
1953 1962 X
2412 2421 X
1535 1539 X
2278 2301 X
1987 2094 X
1863 1876 X
2476 2479 X
1518 1519 X
1619 1634 X
2601 2792 X
1963 2039 X
2136 2146 X
2980 3108 X
2656 2672 X
2332 2385 X
2955 3040 X
3005 3052 X
2650 2685 X
2748 2763 X
3251 3294 X
2262 2297 X
3363 3400 X
2208 2280 X
2154 2312 X
2048 2069 X
2427 2462 X
2483 2634 X
2605 2643 X
2053 2141 X
2256 2466 X
1571 2011 X
1780 1816 X
3079 3152 X
2753 2764 X


In [13]:
# Move every annotated position onto the largest sample in its neighbourhood.
# The window is [peak - radius, peak + radius), as before, but clamped to the
# signal so that a position near the start cannot wrap around through negative
# indices and a position near the end cannot run past the last sample.
peak_search_radius = 12

modified_peaks = {}
for key in new_peaks.keys():
    signal = sigbufs_data[key]
    modified_peaks[key] = []
    for peak in new_peaks[key] :
        peak=int(peak)
        start = max(peak - peak_search_radius, 0)
        stop = min(peak + peak_search_radius, len(signal))
        max_peak = peak
        if start < stop :
            # argmax returns the first maximum; the annotated position is only
            # replaced by a strictly larger sample (same tie rule as before).
            candidate = start + int(np.argmax(signal[start:stop]))
            if signal[candidate] > signal[peak] :
                max_peak = candidate
        modified_peaks[key].append(max_peak)
        
print(new_peaks['100'][0:20])
print(modified_peaks['100'][0:20])
print(len(modified_peaks['100']),len(new_label['100']))

print("==========================")

['77', '370', '662', '946', '1231', '1515', '1809', '2044', '2402', '2706', '2998', '3282', '3560', '3862', '4170', '4466', '4764', '5060', '5346', '5633']
[77, 370, 663, 947, 1231, 1515, 1809, 2045, 2403, 2706, 2998, 3283, 3560, 3863, 4171, 4466, 4765, 5061, 5347, 5634]
2273 2273


In [14]:
# Column layout of the per-beat feature tables; the last two columns hold the
# class code ('annot') and the binary flag derived from it ('annot2').
columns = [
    'q', 'q_idx', 'r', 's', 's_idx',
    'rr_interval', 'rr_over_L5', 'rr_interval_all',
    'peak_over_L5', 'peak_over_all',
    'annot', 'annot2',
]
print(columns)

['q', 'q_idx', 'r', 's', 's_idx', 'rr_interval', 'rr_over_L5', 'rr_interval_all', 'peak_over_L5', 'peak_over_all', 'annot', 'annot2']


In [15]:
train_pandas_path = './train_person_1209_std_4.csv'

# Samples searched on each side of the R peak for the Q and S minima.
qs_window = 48
# Number of preceding beats used for the local ("L5") ratios.
beat_history = 5
# Set to True to reuse a previously written CSV instead of recomputing.
reuse_cached_csv = False

train_pandas_data = None
pandas_idx = 0

# if (os.path.exists(train_pandas_path)) : 
if reuse_cached_csv : 
    train_pandas_data = pd.read_csv(train_pandas_path,index_col=0)
else : 
    # Rows are collected in a plain list and turned into a DataFrame once;
    # growing a DataFrame row by row re-allocates it on every insertion.
    feature_rows = []

    for edf_filename in ds1 : 
        peaks = modified_peaks[edf_filename]
        annots = new_label[edf_filename]
        sigbuf = normalized_data(sigbufs_data[edf_filename])
        #sigbuf = std_4sigma(sigbuf)

        if len(peaks) < beat_history + 2 :
            # too few beats to produce a single feature row
            continue

        # Record-wide references (first and last beat excluded), computed once
        # per record instead of once per beat.
        inner = range(1, len(peaks) - 1)
        mean_rr = np.average([peaks[idx] - peaks[idx-1] for idx in inner])
        mean_peak = np.average([sigbuf[peaks[idx]] for idx in inner])

        for idx in tqdm(range(len(peaks)-1)) : 
            if idx < beat_history or annots[idx] == 5 : 
                continue

            peak = peaks[idx]
            q_start = max(peak - qs_window, 0)      # never wrap to a negative index
            before = sigbuf[q_start : peak]
            after = sigbuf[peak : peak + qs_window]
            if len(before) == 0 or len(after) == 0 :
                # R peak sits on the record boundary: no Q or S window to search
                continue

            r = sigbuf[peak]
            q = before.min()
            # distance of the Q minimum from the R peak, in units of the window
            q_idx = ((peak - q_start) - np.argmin(before)) / float(qs_window)
            s = after.min()
            s_idx = np.argmin(after) / float(qs_window)

            rr_interval = peak - peaks[idx-1]
            rr_over_L5 = rr_interval / ((peak - peaks[idx-beat_history]) / beat_history)
            rr_over_all = rr_interval / mean_rr
            # same summation order as before: oldest beat first
            local_peak_mean = sum(sigbuf[peaks[idx-k]] for k in range(beat_history, 0, -1)) / beat_history
            peak_over_L5 = r / local_peak_mean
            peak_over_all = r / mean_peak

            feature_rows.append([q, q_idx, r, s, s_idx,
                                 rr_interval, rr_over_L5, rr_over_all,
                                 peak_over_L5, peak_over_all,
                                 annots[idx], 1 if annots[idx] == 0.0 else 0])
            pandas_idx += 1

    # All columns as float, matching the table previously produced row by row.
    train_pandas_data = pd.DataFrame(feature_rows, columns=columns, dtype=float)

    print(train_pandas_data)
    train_pandas_data.to_csv(train_pandas_path, mode='w')
    train_pandas_data = pd.read_csv(train_pandas_path,index_col=0)

  b = a[a_slice]
100%|█████████████████████████████████████████████████████████████████████████████| 1864/1864 [00:01<00:00, 970.42it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2026/2026 [00:02<00:00, 956.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1762/1762 [00:01<00:00, 982.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2531/2531 [00:02<00:00, 921.27it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2538/2538 [00:02<00:00, 918.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1878/1878 [00:01<00:00, 968.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1952/1952 [00:02<00:00, 959.63it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2411/2411 [00:02<00:00, 925.37it/s]
100%|██████████████████

              q     q_idx         r         s     s_idx  rr_interval  \
0     -0.146885  0.291667  0.637103 -0.131476  0.354167        344.0   
1     -0.163234  0.291667  0.704428 -0.151494  0.333333        324.0   
2     -0.186917  0.291667  0.767824 -0.147500  0.375000        313.0   
3     -0.175878  0.270833  0.716078 -0.139979  0.541667        312.0   
4     -0.170045  0.291667  0.719258 -0.141607  0.416667        311.0   
...         ...       ...       ...       ...       ...          ...   
50884 -0.149599  0.354167  0.890019 -0.710607  0.208333        243.0   
50885 -0.175149  0.375000  0.923847 -0.658806  0.187500        244.0   
50886 -0.160607  0.395833  0.883755 -0.655392  0.208333        234.0   
50887 -0.207656  0.375000  0.976432 -0.642695  0.208333        232.0   
50888 -0.192327  0.395833  0.976436 -0.686411  0.187500        234.0   

       rr_over_L5  rr_interval_all  peak_over_L5  peak_over_all  annot  annot2  
0        1.055215         0.987037      0.970119      

In [16]:
test_pandas_path = './test_person_1209_std_4.csv'

# Samples searched on each side of the R peak for the Q and S minima.
qs_window = 48
# Number of preceding beats used for the local ("L5") ratios.
beat_history = 5
# Set to True to reuse a previously written CSV instead of recomputing.
reuse_cached_csv = False

test_pandas_data = None
pandas_idx = 0

# if (os.path.exists(test_pandas_path)) : 
if reuse_cached_csv : 
    test_pandas_data = pd.read_csv(test_pandas_path,index_col=0)
else : 
    # Rows are collected in a plain list and turned into a DataFrame once;
    # growing a DataFrame row by row re-allocates it on every insertion.
    feature_rows = []

    for edf_filename in ds2 : 
        peaks = modified_peaks[edf_filename]
        annots = new_label[edf_filename]
        sigbuf = normalized_data(sigbufs_data[edf_filename])
        #sigbuf = std_4sigma(sigbuf)

        if len(peaks) < beat_history + 2 :
            # too few beats to produce a single feature row
            continue

        # Record-wide references (first and last beat excluded), computed once
        # per record instead of once per beat.
        inner = range(1, len(peaks) - 1)
        mean_rr = np.average([peaks[idx] - peaks[idx-1] for idx in inner])
        mean_peak = np.average([sigbuf[peaks[idx]] for idx in inner])

        for idx in tqdm(range(len(peaks)-1)) : 
            if idx < beat_history or annots[idx] == 5 : 
                continue

            peak = peaks[idx]
            q_start = max(peak - qs_window, 0)      # never wrap to a negative index
            before = sigbuf[q_start : peak]
            after = sigbuf[peak : peak + qs_window]
            if len(before) == 0 or len(after) == 0 :
                # R peak sits on the record boundary: no Q or S window to search
                continue

            r = sigbuf[peak]
            q = before.min()
            # distance of the Q minimum from the R peak, in units of the window
            q_idx = ((peak - q_start) - np.argmin(before)) / float(qs_window)
            s = after.min()
            s_idx = np.argmin(after) / float(qs_window)

            rr_interval = peak - peaks[idx-1]
            rr_over_L5 = rr_interval / ((peak - peaks[idx-beat_history]) / beat_history)
            rr_over_all = rr_interval / mean_rr
            # same summation order as before: oldest beat first
            local_peak_mean = sum(sigbuf[peaks[idx-k]] for k in range(beat_history, 0, -1)) / beat_history
            peak_over_L5 = r / local_peak_mean
            peak_over_all = r / mean_peak

            feature_rows.append([q, q_idx, r, s, s_idx,
                                 rr_interval, rr_over_L5, rr_over_all,
                                 peak_over_L5, peak_over_all,
                                 annots[idx], 1 if annots[idx] == 0.0 else 0])
            pandas_idx += 1

    # All columns as float, matching the table previously produced row by row.
    test_pandas_data = pd.DataFrame(feature_rows, columns=columns, dtype=float)

    print(test_pandas_data)
    test_pandas_data.to_csv(test_pandas_path, mode='w')
    test_pandas_data = pd.read_csv(test_pandas_path,index_col=0)

  b = a[a_slice]
100%|█████████████████████████████████████████████████████████████████████████████| 2272/2272 [00:03<00:00, 733.25it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2083/2083 [00:02<00:00, 892.86it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2571/2571 [00:02<00:00, 884.62it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2123/2123 [00:02<00:00, 915.68it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1794/1794 [00:01<00:00, 956.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1534/1534 [00:01<00:00, 987.26it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1862/1862 [00:01<00:00, 957.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1517/1517 [00:01<00:00, 994.68it/s]
100%|██████████████████

              q     q_idx         r         s     s_idx  rr_interval  \
0     -0.164335  0.250000  0.570754 -0.105373  0.250000        284.0   
1     -0.180710  0.250000  0.617327 -0.113496  0.270833        294.0   
2     -0.183023  0.250000  0.576815 -0.107303  0.250000        236.0   
3     -0.181602  0.270833  0.630844 -0.146964  0.229167        358.0   
4     -0.177556  0.250000  0.597671 -0.116347  0.250000        303.0   
...         ...       ...       ...       ...       ...          ...   
49575 -0.248560  0.312500  0.916344 -0.194696  0.312500        241.0   
49576 -0.246322  0.291667  0.927760 -0.194603  0.354167        240.0   
49577 -0.244871  0.333333  0.927357 -0.208613  0.354167        243.0   
49578 -0.233126  0.312500  0.871498 -0.194633  0.375000        252.0   
49579 -0.232939  0.312500  0.914509 -0.208317  0.291667        245.0   

       rr_over_L5  rr_interval_all  peak_over_L5  peak_over_all  annot  annot2  
0        0.987483         0.992776      0.978384      

### Make individual data

In [17]:
# indi_pandas = './y256_data_indi_std_4sigma'

# # pandas_data = pd.DataFrame(columns=columns ) 


# indi_pandas_data = None
# pandas_idx = 0

# for edf_filename in modified_peaks.keys() : 
#     peaks = modified_peaks[edf_filename]
#     annots = new_label[edf_filename]
#     sigbuf= sigbufs_data[edf_filename]
#     sigbuf= normalized_data(sigbuf)
# #     sigbuf= std_4sigma(sigbuf)
#     temp = pd.DataFrame(columns=columns )
#     for idx in tqdm(range(len(peaks)-1)) : 
#         if(idx < 5) : 
#             continue
#         if(annots[idx] == 5) : 
#             continue
#         data = sigbuf[peaks[idx-1] : peaks[idx+1]]
#         x = np.linspace(0, peaks[idx+1]-peaks[idx-1], num=peaks[idx+1]-peaks[idx-1])
#         y = data
#         f = interp1d(x, y)
#         xnew = np.linspace(0, len(x), num=256,endpoint=True)
#         data_int256 = f(xnew)
#         data_int256 = data_int256.tolist()

#         r_idx = int((peaks[idx] - peaks[idx-1])/(peaks[idx+1]-peaks[idx-1]) * 256)
# #         print(r_idx)
# #         r = sigbuf[peaks[idx]]
# #         q = min(sigbuf[peaks[idx]-12 : peaks[idx]])
# #         s = min(sigbuf[peaks[idx] : peaks[idx] + 12])
#         try : 
#             r_idx += np.argmax(data_int256[r_idx -5 :r_idx +5 ]) + 5
# #             if(r_idx > len(data_int256)) : 
# #                 continue
#             r = data_int256[r_idx]
#         except : 
#             print (peaks[idx-1], peaks[idx], peaks[idx+1])
#             print (annots[idx-1], annots[idx], annots[idx+1])
#             print(r_idx)
#             print(data_int256[r_idx -5 :r_idx +5 ])
# #         print(len(data_int60),data_int60)
# #         print(r_idx-12, r_idx)
# #         print(data_int60[r_idx -12 : r_idx])
#         q = min(data_int256[r_idx -12 : r_idx])
#         s = min(data_int256[r_idx : r_idx+12])
#         rr_interval = peaks[idx] - peaks[idx-1]
#         rr_over_L5 = (peaks[idx] - peaks[idx-1]) / ((peaks[idx] - peaks[idx-5]) /5 )
#         peak_over_L5 = sigbuf[peaks[idx]] /\
#                        ((sigbuf[peaks[idx-5]]+sigbuf[peaks[idx-4]]+sigbuf[peaks[idx-3]]+sigbuf[peaks[idx-2]]+sigbuf[peaks[idx-1]])/5)
#         data_int256.extend([q,r,s,rr_interval,rr_over_L5,peak_over_L5,annots[idx],0 if annots[idx]==0.0 else 1 ])
#         temp.loc[pandas_idx] = data_int256
#         pandas_idx += 1
# #         print(rr_interval, peaks[idx], peaks[idx-1])
# #         print(q,r,s,rr_interval,peak_over_L5,annots[idx])
# #         print(pandas_data)
#     temp.to_csv(indi_pandas+'_'+edf_filename+'.csv', mode='w')


In [18]:
# Plain-text dump of the training feature table as reloaded from its CSV file.
print(train_pandas_data)

              q     q_idx         r         s     s_idx  rr_interval  \
0     -0.146885  0.291667  0.637103 -0.131476  0.354167        344.0   
1     -0.163234  0.291667  0.704428 -0.151494  0.333333        324.0   
2     -0.186917  0.291667  0.767824 -0.147500  0.375000        313.0   
3     -0.175878  0.270833  0.716078 -0.139979  0.541667        312.0   
4     -0.170045  0.291667  0.719258 -0.141607  0.416667        311.0   
...         ...       ...       ...       ...       ...          ...   
50884 -0.149599  0.354167  0.890019 -0.710607  0.208333        243.0   
50885 -0.175149  0.375000  0.923847 -0.658806  0.187500        244.0   
50886 -0.160607  0.395833  0.883755 -0.655392  0.208333        234.0   
50887 -0.207656  0.375000  0.976432 -0.642695  0.208333        232.0   
50888 -0.192327  0.395833  0.976436 -0.686411  0.187500        234.0   

       rr_over_L5  rr_interval_all  peak_over_L5  peak_over_all  annot  annot2  
0        1.055215         0.987037      0.970119      

In [19]:
# Plain-text dump of the test feature table as reloaded from its CSV file.
print(test_pandas_data)

              q     q_idx         r         s     s_idx  rr_interval  \
0     -0.164335  0.250000  0.570754 -0.105373  0.250000        284.0   
1     -0.180710  0.250000  0.617327 -0.113496  0.270833        294.0   
2     -0.183023  0.250000  0.576815 -0.107303  0.250000        236.0   
3     -0.181602  0.270833  0.630844 -0.146964  0.229167        358.0   
4     -0.177556  0.250000  0.597671 -0.116347  0.250000        303.0   
...         ...       ...       ...       ...       ...          ...   
49575 -0.248560  0.312500  0.916344 -0.194696  0.312500        241.0   
49576 -0.246322  0.291667  0.927760 -0.194603  0.354167        240.0   
49577 -0.244871  0.333333  0.927357 -0.208613  0.354167        243.0   
49578 -0.233126  0.312500  0.871498 -0.194633  0.375000        252.0   
49579 -0.232939  0.312500  0.914509 -0.208317  0.291667        245.0   

       rr_over_L5  rr_interval_all  peak_over_L5  peak_over_all  annot  annot2  
0        0.987483         0.992776      0.978384      

In [20]:
# Build a class-balanced copy of the test table.
# Rows are visited in order: a row with annot2 == 0 is always kept, a row with
# annot2 != 0 is kept only while fewer such rows than annot2 == 0 rows have been
# kept so far. The decision is recorded in a boolean mask and applied once,
# instead of appending to a DataFrame row by row.
cnt_1 = 0 
cnt_0 = 0
keep_row = []
for flag in test_pandas_data["annot2"].values : 
    if flag == 0 :
        keep_row.append(True)
        cnt_0 += 1 
    elif cnt_1 < cnt_0 :
        keep_row.append(True)
        cnt_1 += 1 
    else : 
        keep_row.append(False)

new_test_data = (
    test_pandas_data
    .loc[np.array(keep_row, dtype=bool), columns]
    .reset_index(drop=True)
)

print(len(new_test_data))
print(new_test_data.loc[1])
new_test_data.to_csv("./new_test_data.csv", mode='w')

11310
q                   -0.181602
q_idx                0.270833
r                    0.630844
s                   -0.146964
s_idx                0.229167
rr_interval        358.000000
rr_over_L5           1.229396
rr_interval_all      1.251457
peak_over_L5         1.091676
peak_over_all        0.989801
annot                0.000000
annot2               1.000000
Name: 1, dtype: float64


In [21]:
# Turn the feature tables into model inputs.
#   'annot'  : class code of the beat
#   'annot2' : binary flag derived from it (1 when the class code is 0)
label_columns = ["annot", "annot2"]

# The label columns come back from CSV as floats; make them integers.
for frame in (train_pandas_data, test_pandas_data, new_test_data) :
    for label_column in label_columns :
        frame[label_column] = frame[label_column].astype(int)

y_train = train_pandas_data["annot"].values
y_test = test_pandas_data["annot"].values
y_train2 = train_pandas_data["annot2"].values
y_test2 = test_pandas_data["annot2"].values

# `columns=` instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop is no longer accepted by pandas 2.x.
X_train = train_pandas_data.drop(columns=label_columns)
X_test = test_pandas_data.drop(columns=label_columns)

new_X_test = new_test_data.drop(columns=label_columns)
new_y_test = new_test_data["annot2"].values

# The binary flag is the training target; the test table is the evaluation set.
lgb_train = lgb.Dataset(X_train, y_train2)
lgb_eval = lgb.Dataset(X_test, y_test2)
# lgb_eval = lgb.Dataset(new_X_test, new_y_test)

In [22]:
# Random-forest style LightGBM model on the binary 'annot2' target (bagging seed 1).
# NOTE(review): per the LightGBM parameter docs, pos_bagging_fraction and
# neg_bagging_fraction only apply to the "binary" objective; with "multiclass"
# they are presumably ignored -- confirm before relying on the class re-balancing.
# NOTE(review): lgb_eval is built from the test table, so early stopping is
# tuned on the same data the model is later evaluated on.
params = {
          "objective" : "multiclass",
          "num_class" : 2,
          "metric" : "multi_logloss",
          "num_leaves" : 256,
          "max_depth": -1,
          "learning_rate" : 0.0001,
          "bagging_fraction" : 0.3, 
          "feature_fraction" : 0.3,  
          "bagging_freq" : 5,        
          "bagging_seed" : 1,
          "pos_bagging_fraction" : 0.113,
          "neg_bagging_fraction" : 1.0,
          "boosting" : "rf", 
          "verbosity" : -1
           }

print('Starting training...')
# train
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=500)])
#                 init_model='./model_DS1_DS2_190928.txt')

print('Saving model...')
# save model to file
gbm.save_model('./model_train_1209_1.txt')

print('Starting predicting...')
# predict


Starting training...
[1]	valid_0's multi_logloss: 0.221687
Training until validation scores don't improve for 500 rounds
[2]	valid_0's multi_logloss: 0.189512
[3]	valid_0's multi_logloss: 0.175317
[4]	valid_0's multi_logloss: 0.173274
[5]	valid_0's multi_logloss: 0.168705
[6]	valid_0's multi_logloss: 0.172068
[7]	valid_0's multi_logloss: 0.17448
[8]	valid_0's multi_logloss: 0.178049
[9]	valid_0's multi_logloss: 0.176291
[10]	valid_0's multi_logloss: 0.178386
[11]	valid_0's multi_logloss: 0.180394
[12]	valid_0's multi_logloss: 0.177017
[13]	valid_0's multi_logloss: 0.17453
[14]	valid_0's multi_logloss: 0.176266
[15]	valid_0's multi_logloss: 0.174529
[16]	valid_0's multi_logloss: 0.174953
[17]	valid_0's multi_logloss: 0.177472
[18]	valid_0's multi_logloss: 0.174887
[19]	valid_0's multi_logloss: 0.174709
[20]	valid_0's multi_logloss: 0.173639
[21]	valid_0's multi_logloss: 0.174506
[22]	valid_0's multi_logloss: 0.173758
[23]	valid_0's multi_logloss: 0.174164
[24]	valid_0's multi_logloss: 0

[207]	valid_0's multi_logloss: 0.175937
[208]	valid_0's multi_logloss: 0.175674
[209]	valid_0's multi_logloss: 0.175591
[210]	valid_0's multi_logloss: 0.175741
[211]	valid_0's multi_logloss: 0.175768
[212]	valid_0's multi_logloss: 0.175495
[213]	valid_0's multi_logloss: 0.175563
[214]	valid_0's multi_logloss: 0.175875
[215]	valid_0's multi_logloss: 0.175877
[216]	valid_0's multi_logloss: 0.175736
[217]	valid_0's multi_logloss: 0.175648
[218]	valid_0's multi_logloss: 0.175509
[219]	valid_0's multi_logloss: 0.175595
[220]	valid_0's multi_logloss: 0.175715
[221]	valid_0's multi_logloss: 0.175638
[222]	valid_0's multi_logloss: 0.175574
[223]	valid_0's multi_logloss: 0.175154
[224]	valid_0's multi_logloss: 0.175046
[225]	valid_0's multi_logloss: 0.174916
[226]	valid_0's multi_logloss: 0.174716
[227]	valid_0's multi_logloss: 0.17456
[228]	valid_0's multi_logloss: 0.174465
[229]	valid_0's multi_logloss: 0.174503
[230]	valid_0's multi_logloss: 0.174438
[231]	valid_0's multi_logloss: 0.174254
[

[412]	valid_0's multi_logloss: 0.172737
[413]	valid_0's multi_logloss: 0.172782
[414]	valid_0's multi_logloss: 0.172806
[415]	valid_0's multi_logloss: 0.172601
[416]	valid_0's multi_logloss: 0.172542
[417]	valid_0's multi_logloss: 0.172611
[418]	valid_0's multi_logloss: 0.172569
[419]	valid_0's multi_logloss: 0.172438
[420]	valid_0's multi_logloss: 0.17239
[421]	valid_0's multi_logloss: 0.172334
[422]	valid_0's multi_logloss: 0.172325
[423]	valid_0's multi_logloss: 0.17245
[424]	valid_0's multi_logloss: 0.17251
[425]	valid_0's multi_logloss: 0.172397
[426]	valid_0's multi_logloss: 0.172408
[427]	valid_0's multi_logloss: 0.172373
[428]	valid_0's multi_logloss: 0.172268
[429]	valid_0's multi_logloss: 0.172342
[430]	valid_0's multi_logloss: 0.172421
[431]	valid_0's multi_logloss: 0.172344
[432]	valid_0's multi_logloss: 0.17227
[433]	valid_0's multi_logloss: 0.172179
[434]	valid_0's multi_logloss: 0.172114
[435]	valid_0's multi_logloss: 0.172113
[436]	valid_0's multi_logloss: 0.172058
[437

In [23]:
# Random-forest style LightGBM model on the binary 'annot2' target (bagging seed 2).
# NOTE(review): per the LightGBM parameter docs, pos_bagging_fraction and
# neg_bagging_fraction only apply to the "binary" objective; with "multiclass"
# they are presumably ignored -- confirm before relying on the class re-balancing.
# NOTE(review): lgb_eval is built from the test table, so early stopping is
# tuned on the same data the model is later evaluated on.
params = {
          "objective" : "multiclass",
          "num_class" : 2,
          "metric" : "multi_logloss",
          "num_leaves" : 256,
          "max_depth": -1,
          "learning_rate" : 0.0001,
          "bagging_fraction" : 0.3, 
          "feature_fraction" : 0.3,  
          "bagging_freq" : 5,        
          "bagging_seed" : 2,
          "pos_bagging_fraction" : 0.113,
          "neg_bagging_fraction" : 1.0,
          "boosting" : "rf", 
          "verbosity" : -1
           }

print('Starting training...')
# train
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=500)])
#                 init_model='./model_DS1_DS2_190928.txt')

print('Saving model...')
# save model to file
gbm.save_model('./model_train_1209_2.txt')

print('Starting predicting...')
# predict

Starting training...
[1]	valid_0's multi_logloss: 0.243367
Training until validation scores don't improve for 500 rounds
[2]	valid_0's multi_logloss: 0.196437
[3]	valid_0's multi_logloss: 0.191971
[4]	valid_0's multi_logloss: 0.188015
[5]	valid_0's multi_logloss: 0.183321
[6]	valid_0's multi_logloss: 0.184451
[7]	valid_0's multi_logloss: 0.185998
[8]	valid_0's multi_logloss: 0.189113
[9]	valid_0's multi_logloss: 0.185098
[10]	valid_0's multi_logloss: 0.185735
[11]	valid_0's multi_logloss: 0.185813
[12]	valid_0's multi_logloss: 0.182332
[13]	valid_0's multi_logloss: 0.179516
[14]	valid_0's multi_logloss: 0.180036
[15]	valid_0's multi_logloss: 0.17888
[16]	valid_0's multi_logloss: 0.178558
[17]	valid_0's multi_logloss: 0.180963
[18]	valid_0's multi_logloss: 0.179175
[19]	valid_0's multi_logloss: 0.177352
[20]	valid_0's multi_logloss: 0.176253
[21]	valid_0's multi_logloss: 0.177777
[22]	valid_0's multi_logloss: 0.17679
[23]	valid_0's multi_logloss: 0.177321
[24]	valid_0's multi_logloss: 0

[207]	valid_0's multi_logloss: 0.176849
[208]	valid_0's multi_logloss: 0.176604
[209]	valid_0's multi_logloss: 0.176599
[210]	valid_0's multi_logloss: 0.176781
[211]	valid_0's multi_logloss: 0.176778
[212]	valid_0's multi_logloss: 0.176591
[213]	valid_0's multi_logloss: 0.176665
[214]	valid_0's multi_logloss: 0.176921
[215]	valid_0's multi_logloss: 0.176962
[216]	valid_0's multi_logloss: 0.176842
[217]	valid_0's multi_logloss: 0.176744
[218]	valid_0's multi_logloss: 0.176623
[219]	valid_0's multi_logloss: 0.176534
[220]	valid_0's multi_logloss: 0.17656
[221]	valid_0's multi_logloss: 0.176372
[222]	valid_0's multi_logloss: 0.1763
[223]	valid_0's multi_logloss: 0.175818
[224]	valid_0's multi_logloss: 0.175678
[225]	valid_0's multi_logloss: 0.175468
[226]	valid_0's multi_logloss: 0.175328
[227]	valid_0's multi_logloss: 0.175191
[228]	valid_0's multi_logloss: 0.17517
[229]	valid_0's multi_logloss: 0.175166
[230]	valid_0's multi_logloss: 0.175002
[231]	valid_0's multi_logloss: 0.174798
[232

[412]	valid_0's multi_logloss: 0.173106
[413]	valid_0's multi_logloss: 0.173167
[414]	valid_0's multi_logloss: 0.173168
[415]	valid_0's multi_logloss: 0.172978
[416]	valid_0's multi_logloss: 0.172904
[417]	valid_0's multi_logloss: 0.172981
[418]	valid_0's multi_logloss: 0.172922
[419]	valid_0's multi_logloss: 0.17281
[420]	valid_0's multi_logloss: 0.17272
[421]	valid_0's multi_logloss: 0.172644
[422]	valid_0's multi_logloss: 0.172629
[423]	valid_0's multi_logloss: 0.172743
[424]	valid_0's multi_logloss: 0.172761
[425]	valid_0's multi_logloss: 0.172663
[426]	valid_0's multi_logloss: 0.172683
[427]	valid_0's multi_logloss: 0.172586
[428]	valid_0's multi_logloss: 0.172424
[429]	valid_0's multi_logloss: 0.172478
[430]	valid_0's multi_logloss: 0.172555
[431]	valid_0's multi_logloss: 0.172491
[432]	valid_0's multi_logloss: 0.172466
[433]	valid_0's multi_logloss: 0.172351
[434]	valid_0's multi_logloss: 0.172287
[435]	valid_0's multi_logloss: 0.172297
[436]	valid_0's multi_logloss: 0.172252
[4

[617]	valid_0's multi_logloss: 0.172043
[618]	valid_0's multi_logloss: 0.172018
[619]	valid_0's multi_logloss: 0.171869
[620]	valid_0's multi_logloss: 0.171843
[621]	valid_0's multi_logloss: 0.171892
[622]	valid_0's multi_logloss: 0.171812
[623]	valid_0's multi_logloss: 0.171788
[624]	valid_0's multi_logloss: 0.171775
[625]	valid_0's multi_logloss: 0.171804
[626]	valid_0's multi_logloss: 0.171817
[627]	valid_0's multi_logloss: 0.171774
[628]	valid_0's multi_logloss: 0.171753
[629]	valid_0's multi_logloss: 0.171791
[630]	valid_0's multi_logloss: 0.171819
[631]	valid_0's multi_logloss: 0.171858
[632]	valid_0's multi_logloss: 0.171906
[633]	valid_0's multi_logloss: 0.171892
[634]	valid_0's multi_logloss: 0.171931
[635]	valid_0's multi_logloss: 0.171939
[636]	valid_0's multi_logloss: 0.171954
[637]	valid_0's multi_logloss: 0.171935
[638]	valid_0's multi_logloss: 0.171874
[639]	valid_0's multi_logloss: 0.171851
[640]	valid_0's multi_logloss: 0.171788
[641]	valid_0's multi_logloss: 0.171771


[822]	valid_0's multi_logloss: 0.171803
[823]	valid_0's multi_logloss: 0.171839
[824]	valid_0's multi_logloss: 0.17181
[825]	valid_0's multi_logloss: 0.171801
[826]	valid_0's multi_logloss: 0.171822
[827]	valid_0's multi_logloss: 0.171777
[828]	valid_0's multi_logloss: 0.171785
[829]	valid_0's multi_logloss: 0.17184
[830]	valid_0's multi_logloss: 0.171816
[831]	valid_0's multi_logloss: 0.171802
[832]	valid_0's multi_logloss: 0.17173
[833]	valid_0's multi_logloss: 0.171755
[834]	valid_0's multi_logloss: 0.171745
[835]	valid_0's multi_logloss: 0.171777
[836]	valid_0's multi_logloss: 0.171709
[837]	valid_0's multi_logloss: 0.171678
[838]	valid_0's multi_logloss: 0.171672
[839]	valid_0's multi_logloss: 0.171738
[840]	valid_0's multi_logloss: 0.171709
[841]	valid_0's multi_logloss: 0.171719
[842]	valid_0's multi_logloss: 0.171784
[843]	valid_0's multi_logloss: 0.17176
[844]	valid_0's multi_logloss: 0.171808
[845]	valid_0's multi_logloss: 0.171811
[846]	valid_0's multi_logloss: 0.171838
[847

[1026]	valid_0's multi_logloss: 0.172087
[1027]	valid_0's multi_logloss: 0.172123
[1028]	valid_0's multi_logloss: 0.172112
[1029]	valid_0's multi_logloss: 0.172112
[1030]	valid_0's multi_logloss: 0.172072
[1031]	valid_0's multi_logloss: 0.172102
[1032]	valid_0's multi_logloss: 0.172157
[1033]	valid_0's multi_logloss: 0.172159
[1034]	valid_0's multi_logloss: 0.172117
[1035]	valid_0's multi_logloss: 0.172126
[1036]	valid_0's multi_logloss: 0.172081
[1037]	valid_0's multi_logloss: 0.172131
[1038]	valid_0's multi_logloss: 0.172115
[1039]	valid_0's multi_logloss: 0.172142
[1040]	valid_0's multi_logloss: 0.172192
[1041]	valid_0's multi_logloss: 0.172244
[1042]	valid_0's multi_logloss: 0.172231
[1043]	valid_0's multi_logloss: 0.172251
[1044]	valid_0's multi_logloss: 0.172241
[1045]	valid_0's multi_logloss: 0.172243
[1046]	valid_0's multi_logloss: 0.172213
[1047]	valid_0's multi_logloss: 0.172249
[1048]	valid_0's multi_logloss: 0.172208
[1049]	valid_0's multi_logloss: 0.172243
[1050]	valid_0's

[1226]	valid_0's multi_logloss: 0.172621
[1227]	valid_0's multi_logloss: 0.172651
[1228]	valid_0's multi_logloss: 0.17265
[1229]	valid_0's multi_logloss: 0.172619
[1230]	valid_0's multi_logloss: 0.172624
[1231]	valid_0's multi_logloss: 0.172628
[1232]	valid_0's multi_logloss: 0.172637
[1233]	valid_0's multi_logloss: 0.172634
[1234]	valid_0's multi_logloss: 0.172621
[1235]	valid_0's multi_logloss: 0.172621
[1236]	valid_0's multi_logloss: 0.172607
[1237]	valid_0's multi_logloss: 0.172575
[1238]	valid_0's multi_logloss: 0.172596
[1239]	valid_0's multi_logloss: 0.172631
[1240]	valid_0's multi_logloss: 0.172607
[1241]	valid_0's multi_logloss: 0.17264
[1242]	valid_0's multi_logloss: 0.17268
[1243]	valid_0's multi_logloss: 0.172735
[1244]	valid_0's multi_logloss: 0.172689
[1245]	valid_0's multi_logloss: 0.172701
[1246]	valid_0's multi_logloss: 0.172744
[1247]	valid_0's multi_logloss: 0.172747
[1248]	valid_0's multi_logloss: 0.172769
[1249]	valid_0's multi_logloss: 0.172762
[1250]	valid_0's mu

In [24]:
# Train one random-forest-mode LightGBM model (bagging seed 3) and save it to
# './model_train_1209_3.txt', one of the model files loaded later for voting.
params = {
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_logloss",
    "num_leaves": 256,
    "max_depth": -1,
    "learning_rate": 0.0001,
    "bagging_fraction": 0.3,
    "feature_fraction": 0.3,
    "bagging_freq": 5,
    "bagging_seed": 3,
    # NOTE(review): the LightGBM parameter docs describe pos_/neg_bagging_fraction
    # as used only by the binary objective, so they are probably ignored under
    # "multiclass" -- confirm before relying on them for class balancing.
    "pos_bagging_fraction": 0.113,
    "neg_bagging_fraction": 1.0,
    "boosting": "rf",
    "verbosity": -1,
}

# LightGBM 4.0 removed the `early_stopping_rounds` keyword from `lgb.train`;
# early stopping is requested through a callback instead, which older releases
# accept as well.
callbacks = [lgb.early_stopping(stopping_rounds=500)]
# `log_evaluation` only exists in newer releases; older ones already print the
# validation metric every round by default, so it is added only when available.
if hasattr(lgb, "log_evaluation"):
    callbacks.append(lgb.log_evaluation(period=1))

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100000,
                valid_sets=[lgb_eval],
                callbacks=callbacks)
# Optional warm start, currently disabled:
#                 init_model='./model_DS1_DS2_190928.txt')

print('Saving model...')
# save model to file
gbm.save_model('./model_train_1209_3.txt')

# No prediction happens in this cell; the saved model is reloaded later.
print('Starting predicting...')

Starting training...
[1]	valid_0's multi_logloss: 0.26218
Training until validation scores don't improve for 500 rounds
[2]	valid_0's multi_logloss: 0.220487
[3]	valid_0's multi_logloss: 0.205643
[4]	valid_0's multi_logloss: 0.21371
[5]	valid_0's multi_logloss: 0.200539
[6]	valid_0's multi_logloss: 0.19988
[7]	valid_0's multi_logloss: 0.19868
[8]	valid_0's multi_logloss: 0.201902
[9]	valid_0's multi_logloss: 0.19778
[10]	valid_0's multi_logloss: 0.195914
[11]	valid_0's multi_logloss: 0.197218
[12]	valid_0's multi_logloss: 0.191133
[13]	valid_0's multi_logloss: 0.1865
[14]	valid_0's multi_logloss: 0.186118
[15]	valid_0's multi_logloss: 0.182823
[16]	valid_0's multi_logloss: 0.183304
[17]	valid_0's multi_logloss: 0.184864
[18]	valid_0's multi_logloss: 0.183462
[19]	valid_0's multi_logloss: 0.182827
[20]	valid_0's multi_logloss: 0.181172
[21]	valid_0's multi_logloss: 0.182561
[22]	valid_0's multi_logloss: 0.181703
[23]	valid_0's multi_logloss: 0.181987
[24]	valid_0's multi_logloss: 0.1847

[207]	valid_0's multi_logloss: 0.176557
[208]	valid_0's multi_logloss: 0.176334
[209]	valid_0's multi_logloss: 0.176341
[210]	valid_0's multi_logloss: 0.176504
[211]	valid_0's multi_logloss: 0.176513
[212]	valid_0's multi_logloss: 0.176287
[213]	valid_0's multi_logloss: 0.176394
[214]	valid_0's multi_logloss: 0.176661
[215]	valid_0's multi_logloss: 0.176664
[216]	valid_0's multi_logloss: 0.176537
[217]	valid_0's multi_logloss: 0.176547
[218]	valid_0's multi_logloss: 0.176417
[219]	valid_0's multi_logloss: 0.176554
[220]	valid_0's multi_logloss: 0.176574
[221]	valid_0's multi_logloss: 0.176533
[222]	valid_0's multi_logloss: 0.176501
[223]	valid_0's multi_logloss: 0.176083
[224]	valid_0's multi_logloss: 0.175943
[225]	valid_0's multi_logloss: 0.175766
[226]	valid_0's multi_logloss: 0.175555
[227]	valid_0's multi_logloss: 0.175392
[228]	valid_0's multi_logloss: 0.175385
[229]	valid_0's multi_logloss: 0.175364
[230]	valid_0's multi_logloss: 0.175212
[231]	valid_0's multi_logloss: 0.17506
[

[412]	valid_0's multi_logloss: 0.173394
[413]	valid_0's multi_logloss: 0.173467
[414]	valid_0's multi_logloss: 0.173473
[415]	valid_0's multi_logloss: 0.173288
[416]	valid_0's multi_logloss: 0.173231
[417]	valid_0's multi_logloss: 0.173301
[418]	valid_0's multi_logloss: 0.173246
[419]	valid_0's multi_logloss: 0.173119
[420]	valid_0's multi_logloss: 0.173042
[421]	valid_0's multi_logloss: 0.172995
[422]	valid_0's multi_logloss: 0.172932
[423]	valid_0's multi_logloss: 0.173055
[424]	valid_0's multi_logloss: 0.17313
[425]	valid_0's multi_logloss: 0.173063
[426]	valid_0's multi_logloss: 0.173079
[427]	valid_0's multi_logloss: 0.173015
[428]	valid_0's multi_logloss: 0.172875
[429]	valid_0's multi_logloss: 0.172906
[430]	valid_0's multi_logloss: 0.172968
[431]	valid_0's multi_logloss: 0.17289
[432]	valid_0's multi_logloss: 0.172833
[433]	valid_0's multi_logloss: 0.172726
[434]	valid_0's multi_logloss: 0.172694
[435]	valid_0's multi_logloss: 0.172689
[436]	valid_0's multi_logloss: 0.172651
[4

[617]	valid_0's multi_logloss: 0.172682
[618]	valid_0's multi_logloss: 0.172698
[619]	valid_0's multi_logloss: 0.172574
[620]	valid_0's multi_logloss: 0.172555
[621]	valid_0's multi_logloss: 0.172581
[622]	valid_0's multi_logloss: 0.172467
[623]	valid_0's multi_logloss: 0.172496
[624]	valid_0's multi_logloss: 0.172462
[625]	valid_0's multi_logloss: 0.172467
[626]	valid_0's multi_logloss: 0.172489
[627]	valid_0's multi_logloss: 0.172463
[628]	valid_0's multi_logloss: 0.172461
[629]	valid_0's multi_logloss: 0.172504
[630]	valid_0's multi_logloss: 0.172528
[631]	valid_0's multi_logloss: 0.172545
[632]	valid_0's multi_logloss: 0.172567
[633]	valid_0's multi_logloss: 0.172539
[634]	valid_0's multi_logloss: 0.172574
[635]	valid_0's multi_logloss: 0.172602
[636]	valid_0's multi_logloss: 0.172633
[637]	valid_0's multi_logloss: 0.172593
[638]	valid_0's multi_logloss: 0.172511
[639]	valid_0's multi_logloss: 0.172491
[640]	valid_0's multi_logloss: 0.172426
[641]	valid_0's multi_logloss: 0.172365


[822]	valid_0's multi_logloss: 0.17201
[823]	valid_0's multi_logloss: 0.172029
[824]	valid_0's multi_logloss: 0.171977
[825]	valid_0's multi_logloss: 0.171969
[826]	valid_0's multi_logloss: 0.172
[827]	valid_0's multi_logloss: 0.171935
[828]	valid_0's multi_logloss: 0.171944
[829]	valid_0's multi_logloss: 0.172011
[830]	valid_0's multi_logloss: 0.171985
[831]	valid_0's multi_logloss: 0.171978
[832]	valid_0's multi_logloss: 0.171894
[833]	valid_0's multi_logloss: 0.171918
[834]	valid_0's multi_logloss: 0.171924
[835]	valid_0's multi_logloss: 0.171953
[836]	valid_0's multi_logloss: 0.171896
[837]	valid_0's multi_logloss: 0.171901
[838]	valid_0's multi_logloss: 0.171884
[839]	valid_0's multi_logloss: 0.171967
[840]	valid_0's multi_logloss: 0.171933
[841]	valid_0's multi_logloss: 0.171929
[842]	valid_0's multi_logloss: 0.171984
[843]	valid_0's multi_logloss: 0.171959
[844]	valid_0's multi_logloss: 0.172002
[845]	valid_0's multi_logloss: 0.172
[846]	valid_0's multi_logloss: 0.17203
[847]	va

[1026]	valid_0's multi_logloss: 0.172325
[1027]	valid_0's multi_logloss: 0.172339
[1028]	valid_0's multi_logloss: 0.172332
[1029]	valid_0's multi_logloss: 0.172352
[1030]	valid_0's multi_logloss: 0.172288
[1031]	valid_0's multi_logloss: 0.172308
[1032]	valid_0's multi_logloss: 0.172346
[1033]	valid_0's multi_logloss: 0.172326
[1034]	valid_0's multi_logloss: 0.172268
[1035]	valid_0's multi_logloss: 0.172255
[1036]	valid_0's multi_logloss: 0.172205
[1037]	valid_0's multi_logloss: 0.172244
[1038]	valid_0's multi_logloss: 0.172234
[1039]	valid_0's multi_logloss: 0.172239
[1040]	valid_0's multi_logloss: 0.172274
[1041]	valid_0's multi_logloss: 0.172329
[1042]	valid_0's multi_logloss: 0.172335
[1043]	valid_0's multi_logloss: 0.172307
[1044]	valid_0's multi_logloss: 0.172293
[1045]	valid_0's multi_logloss: 0.172299
[1046]	valid_0's multi_logloss: 0.172277
[1047]	valid_0's multi_logloss: 0.17232
[1048]	valid_0's multi_logloss: 0.172272
[1049]	valid_0's multi_logloss: 0.172315
[1050]	valid_0's 

[1226]	valid_0's multi_logloss: 0.17271
[1227]	valid_0's multi_logloss: 0.172723
[1228]	valid_0's multi_logloss: 0.172737
[1229]	valid_0's multi_logloss: 0.172713
[1230]	valid_0's multi_logloss: 0.172721
[1231]	valid_0's multi_logloss: 0.172734
[1232]	valid_0's multi_logloss: 0.172743
[1233]	valid_0's multi_logloss: 0.172726
[1234]	valid_0's multi_logloss: 0.172745
[1235]	valid_0's multi_logloss: 0.172754
[1236]	valid_0's multi_logloss: 0.172743
[1237]	valid_0's multi_logloss: 0.172715
[1238]	valid_0's multi_logloss: 0.172732
[1239]	valid_0's multi_logloss: 0.172756
[1240]	valid_0's multi_logloss: 0.172732
[1241]	valid_0's multi_logloss: 0.172767
[1242]	valid_0's multi_logloss: 0.172816
[1243]	valid_0's multi_logloss: 0.172875
[1244]	valid_0's multi_logloss: 0.172857
[1245]	valid_0's multi_logloss: 0.17287
[1246]	valid_0's multi_logloss: 0.172894
[1247]	valid_0's multi_logloss: 0.172897
[1248]	valid_0's multi_logloss: 0.17292
[1249]	valid_0's multi_logloss: 0.172916
[1250]	valid_0's mu

In [25]:
# Train one random-forest-mode LightGBM model (bagging seed 4) and save it to
# './model_train_1209_4.txt', one of the model files loaded later for voting.
params = {
    "objective": "multiclass",
    "num_class": 2,
    "metric": "multi_logloss",
    "num_leaves": 256,
    "max_depth": -1,
    "learning_rate": 0.0001,
    "bagging_fraction": 0.3,
    "feature_fraction": 0.3,
    "bagging_freq": 5,
    "bagging_seed": 4,
    # NOTE(review): the LightGBM parameter docs describe pos_/neg_bagging_fraction
    # as used only by the binary objective, so they are probably ignored under
    # "multiclass" -- confirm before relying on them for class balancing.
    "pos_bagging_fraction": 0.113,
    "neg_bagging_fraction": 1.0,
    "boosting": "rf",
    "verbosity": -1,
}

# LightGBM 4.0 removed the `early_stopping_rounds` keyword from `lgb.train`;
# early stopping is requested through a callback instead, which older releases
# accept as well.
callbacks = [lgb.early_stopping(stopping_rounds=500)]
# `log_evaluation` only exists in newer releases; older ones already print the
# validation metric every round by default, so it is added only when available.
if hasattr(lgb, "log_evaluation"):
    callbacks.append(lgb.log_evaluation(period=1))

print('Starting training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100000,
                valid_sets=[lgb_eval],
                callbacks=callbacks)
# Optional warm start, currently disabled:
#                 init_model='./model_DS1_DS2_190928.txt')

print('Saving model...')
# save model to file
gbm.save_model('./model_train_1209_4.txt')

# No prediction happens in this cell; the saved model is reloaded later.
print('Starting predicting...')

Starting training...
[1]	valid_0's multi_logloss: 0.265673
Training until validation scores don't improve for 500 rounds
[2]	valid_0's multi_logloss: 0.208609
[3]	valid_0's multi_logloss: 0.195668
[4]	valid_0's multi_logloss: 0.196706
[5]	valid_0's multi_logloss: 0.186286
[6]	valid_0's multi_logloss: 0.188376
[7]	valid_0's multi_logloss: 0.186759
[8]	valid_0's multi_logloss: 0.191673
[9]	valid_0's multi_logloss: 0.188031
[10]	valid_0's multi_logloss: 0.187918
[11]	valid_0's multi_logloss: 0.189854
[12]	valid_0's multi_logloss: 0.184737
[13]	valid_0's multi_logloss: 0.181804
[14]	valid_0's multi_logloss: 0.1815
[15]	valid_0's multi_logloss: 0.179918
[16]	valid_0's multi_logloss: 0.179108
[17]	valid_0's multi_logloss: 0.181108
[18]	valid_0's multi_logloss: 0.177535
[19]	valid_0's multi_logloss: 0.176917
[20]	valid_0's multi_logloss: 0.175336
[21]	valid_0's multi_logloss: 0.175359
[22]	valid_0's multi_logloss: 0.174802
[23]	valid_0's multi_logloss: 0.175172
[24]	valid_0's multi_logloss: 0

[206]	valid_0's multi_logloss: 0.175024
[207]	valid_0's multi_logloss: 0.174937
[208]	valid_0's multi_logloss: 0.17468
[209]	valid_0's multi_logloss: 0.174652
[210]	valid_0's multi_logloss: 0.174749
[211]	valid_0's multi_logloss: 0.174767
[212]	valid_0's multi_logloss: 0.174586
[213]	valid_0's multi_logloss: 0.174634
[214]	valid_0's multi_logloss: 0.174882
[215]	valid_0's multi_logloss: 0.174932
[216]	valid_0's multi_logloss: 0.174823
[217]	valid_0's multi_logloss: 0.174805
[218]	valid_0's multi_logloss: 0.174692
[219]	valid_0's multi_logloss: 0.174715
[220]	valid_0's multi_logloss: 0.174761
[221]	valid_0's multi_logloss: 0.174717
[222]	valid_0's multi_logloss: 0.174689
[223]	valid_0's multi_logloss: 0.174204
[224]	valid_0's multi_logloss: 0.174071
[225]	valid_0's multi_logloss: 0.173892
[226]	valid_0's multi_logloss: 0.173693
[227]	valid_0's multi_logloss: 0.173543
[228]	valid_0's multi_logloss: 0.173501
[229]	valid_0's multi_logloss: 0.173505
[230]	valid_0's multi_logloss: 0.173426
[

[411]	valid_0's multi_logloss: 0.172818
[412]	valid_0's multi_logloss: 0.172765
[413]	valid_0's multi_logloss: 0.172801
[414]	valid_0's multi_logloss: 0.172811
[415]	valid_0's multi_logloss: 0.172628
[416]	valid_0's multi_logloss: 0.172547
[417]	valid_0's multi_logloss: 0.17263
[418]	valid_0's multi_logloss: 0.172583
[419]	valid_0's multi_logloss: 0.172476
[420]	valid_0's multi_logloss: 0.172377
[421]	valid_0's multi_logloss: 0.172322
[422]	valid_0's multi_logloss: 0.172333
[423]	valid_0's multi_logloss: 0.172482
[424]	valid_0's multi_logloss: 0.172489
[425]	valid_0's multi_logloss: 0.172434
[426]	valid_0's multi_logloss: 0.172448
[427]	valid_0's multi_logloss: 0.172405
[428]	valid_0's multi_logloss: 0.172278
[429]	valid_0's multi_logloss: 0.17232
[430]	valid_0's multi_logloss: 0.172408
[431]	valid_0's multi_logloss: 0.172323
[432]	valid_0's multi_logloss: 0.172355
[433]	valid_0's multi_logloss: 0.172233
[434]	valid_0's multi_logloss: 0.172212
[435]	valid_0's multi_logloss: 0.172206
[4

[616]	valid_0's multi_logloss: 0.172601
[617]	valid_0's multi_logloss: 0.172618
[618]	valid_0's multi_logloss: 0.172596
[619]	valid_0's multi_logloss: 0.172419
[620]	valid_0's multi_logloss: 0.172355
[621]	valid_0's multi_logloss: 0.172384
[622]	valid_0's multi_logloss: 0.172285
[623]	valid_0's multi_logloss: 0.172302
[624]	valid_0's multi_logloss: 0.172254
[625]	valid_0's multi_logloss: 0.172253
[626]	valid_0's multi_logloss: 0.172261
[627]	valid_0's multi_logloss: 0.172209
[628]	valid_0's multi_logloss: 0.172182
[629]	valid_0's multi_logloss: 0.172198
[630]	valid_0's multi_logloss: 0.172204
[631]	valid_0's multi_logloss: 0.172244
[632]	valid_0's multi_logloss: 0.172247
[633]	valid_0's multi_logloss: 0.172247
[634]	valid_0's multi_logloss: 0.172292
[635]	valid_0's multi_logloss: 0.172268
[636]	valid_0's multi_logloss: 0.172279
[637]	valid_0's multi_logloss: 0.172283
[638]	valid_0's multi_logloss: 0.172223
[639]	valid_0's multi_logloss: 0.172215
[640]	valid_0's multi_logloss: 0.172167


[821]	valid_0's multi_logloss: 0.172023
[822]	valid_0's multi_logloss: 0.172001
[823]	valid_0's multi_logloss: 0.172008
[824]	valid_0's multi_logloss: 0.171976
[825]	valid_0's multi_logloss: 0.171964
[826]	valid_0's multi_logloss: 0.171957
[827]	valid_0's multi_logloss: 0.171894
[828]	valid_0's multi_logloss: 0.171898
[829]	valid_0's multi_logloss: 0.171939
[830]	valid_0's multi_logloss: 0.171919
[831]	valid_0's multi_logloss: 0.171903
[832]	valid_0's multi_logloss: 0.171804
[833]	valid_0's multi_logloss: 0.171843
[834]	valid_0's multi_logloss: 0.171832
[835]	valid_0's multi_logloss: 0.171865
[836]	valid_0's multi_logloss: 0.171825
[837]	valid_0's multi_logloss: 0.171802
[838]	valid_0's multi_logloss: 0.171795
[839]	valid_0's multi_logloss: 0.17185
[840]	valid_0's multi_logloss: 0.171834
[841]	valid_0's multi_logloss: 0.171837
[842]	valid_0's multi_logloss: 0.171913
[843]	valid_0's multi_logloss: 0.171896
[844]	valid_0's multi_logloss: 0.171946
[845]	valid_0's multi_logloss: 0.171954
[

[1025]	valid_0's multi_logloss: 0.172497
[1026]	valid_0's multi_logloss: 0.172523
[1027]	valid_0's multi_logloss: 0.172541
[1028]	valid_0's multi_logloss: 0.172527
[1029]	valid_0's multi_logloss: 0.172558
[1030]	valid_0's multi_logloss: 0.172489
[1031]	valid_0's multi_logloss: 0.172503
[1032]	valid_0's multi_logloss: 0.17253
[1033]	valid_0's multi_logloss: 0.172538
[1034]	valid_0's multi_logloss: 0.172491
[1035]	valid_0's multi_logloss: 0.172501
[1036]	valid_0's multi_logloss: 0.172444
[1037]	valid_0's multi_logloss: 0.172485
[1038]	valid_0's multi_logloss: 0.172456
[1039]	valid_0's multi_logloss: 0.172488
[1040]	valid_0's multi_logloss: 0.172536
[1041]	valid_0's multi_logloss: 0.172566
[1042]	valid_0's multi_logloss: 0.172552
[1043]	valid_0's multi_logloss: 0.172557
[1044]	valid_0's multi_logloss: 0.172537
[1045]	valid_0's multi_logloss: 0.172526
[1046]	valid_0's multi_logloss: 0.172494
[1047]	valid_0's multi_logloss: 0.172528
[1048]	valid_0's multi_logloss: 0.172496
[1049]	valid_0's 

In [26]:
# Reload each saved booster in turn and collect its predictions on X_test.
# `gbm` ends up bound to the last booster loaded, as before.
model_paths = (
    './model_train_1209_1.txt',
    './model_train_1209_2.txt',
    './model_train_1209_3.txt',
    './model_train_1209_4.txt',
)
per_model_preds = []
for model_path in model_paths:
    gbm = lightgbm.Booster(model_file=model_path)
    per_model_preds.append(gbm.predict(X_test))
y_pred1, y_pred2, y_pred3, y_pred4 = per_model_preds
# eval

In [27]:
# Score model 1 alone: a sample is labelled 1 when the arg-max of its row in
# y_pred1 is index 1, otherwise 0.
new_ypred = [1 if np.argmax(row) == 1 else 0 for row in y_pred1]

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, new_ypred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

[[ 4581  1074]
 [ 1436 42489]]
94.93747478822105
acc 94.93747478822105
recall 96.73079112122936
precision 97.5346050547483
F1 97.13103511338697


In [28]:
# Score model 2 alone: a sample is labelled 1 when the arg-max of its row in
# y_pred2 is index 1, otherwise 0.
new_ypred = [1 if np.argmax(row) == 1 else 0 for row in y_pred2]

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, new_ypred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

[[ 4383  1272]
 [ 1311 42614]]
94.79023799919322
acc 94.79023799919322
recall 97.0153671030165
precision 97.10158136991296
F1 97.05845509104782


In [29]:
# Score model 3 alone: a sample is labelled 1 when the arg-max of its row in
# y_pred3 is index 1, otherwise 0.
new_ypred = [1 if np.argmax(row) == 1 else 0 for row in y_pred3]

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, new_ypred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

[[ 4388  1267]
 [ 1319 42606]]
94.78418717224687
acc 94.78418717224687
recall 96.99715424018213
precision 97.1121190709548
F1 97.05460261053783


In [30]:
# Score model 4 alone: a sample is labelled 1 when the arg-max of its row in
# y_pred4 is index 1, otherwise 0.
new_ypred = [1 if np.argmax(row) == 1 else 0 for row in y_pred4]

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, new_ypred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

[[ 4386  1269]
 [ 1319 42606]]
94.78015328761597
acc 94.78015328761597
recall 96.99715424018213
precision 97.1076923076923
F1 97.05239179954441


In [31]:
# Majority vote over the four LightGBM prediction arrays: a sample is labelled
# 1 when at least two of the four models pick index 1 (so a 2-2 tie goes to 1).
vote_cont = [y_pred1, y_pred2, y_pred3, y_pred4]
new_ypred = []
for i in range(len(y_pred1)):
    # Number of models whose arg-max for sample i is class 1.
    cnt_1 = sum(1 for cont in vote_cont if np.argmax(cont[i]) == 1)
    new_ypred.append(1 if cnt_1 >= 2 else 0)

In [32]:
# Score whatever labels are currently held in `new_ypred` against y_test2.
# The confusion matrix is built once and reused for printing and unpacking.
cm = confusion_matrix(y_test2, new_ypred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

# Previously recorded figure, kept for reference (the metric is not stated):
#model_train_test_0928_leave_64 94.60467930617185

[[ 4383  1272]
 [ 1285 42640]]
94.84267849939492
acc 94.84267849939492
recall 97.07455890722822
precision 97.1032975040991
F1 97.08892607898721


In [33]:
# Show the most recently unpacked confusion-matrix counts in the order
# tp, fp, fn, tn (note: not the tn, fp, fn, tp order returned by ravel()).
print(tp, fp, fn, tn) 

42640 1272 1285 4383


#  SVM

In [None]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Feature standardisation is currently switched off; the classifier is fitted
# on the raw X_train / X_test features.
# sc = StandardScaler()
# sc.fit(X_train)
# X_train_std = sc.transform(X_train)
# X_test_std = sc.transform(X_test)

# Linear-kernel SVM baseline; y_pred5 holds its predicted labels for X_test.
ml = SVC(kernel='linear', C=1.0, random_state=0)
ml.fit(X_train, y_train2)
y_pred5 = ml.predict(X_test)
print(y_pred5)

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, y_pred5)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

#  XGBoost

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier

# XGBoost baseline with the library's default settings; y_pred6 holds its
# predicted labels for X_test.
model = XGBClassifier()
model.fit(X_train, y_train2)
y_pred6 = model.predict(X_test)
print(y_pred6)

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, y_pred6)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))

# Ensemble

In [None]:
# Vote across the four LightGBM prediction arrays plus y_pred5 and y_pred6.
# The LightGBM entries vote via the arg-max of each row; y_pred5 and y_pred6
# are compared directly against 1.
vote_cont = [y_pred1, y_pred2, y_pred3, y_pred4]
# The "+ 2" accounts for the two extra voters (y_pred5, y_pred6). A sample is
# labelled 1 when at least half of all voters choose 1, so an even split
# resolves to 1.
votes_needed = (len(vote_cont) + 2) / 2

new_ypred = []
for i in range(len(y_pred1)):
    cnt_1 = sum(1 for cont in vote_cont if np.argmax(cont[i]) == 1)
    cnt_1 += int(y_pred5[i] == 1) + int(y_pred6[i] == 1)
    new_ypred.append(1 if cnt_1 >= votes_needed else 0)

# Build the confusion matrix once and reuse it for printing and unpacking.
cm = confusion_matrix(y_test2, new_ypred)
print(cm)
tn, fp, fn, tp = cm.ravel()

# All metrics are expressed in percent.
accuracy = (tn + tp) / (tn + fp + fn + tp) * 100
recall = tp / (tp + fn) * 100
preci = tp / (tp + fp) * 100

print(accuracy)

print('acc', accuracy)
print('recall', recall)
print('precision', preci)
print('F1', 2 * recall * preci / (recall + preci))