In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from scipy.stats.mstats import gmean

from matplotlib import pyplot as plt
from tqdm import tqdm

from sklearn.cross_validation import train_test_split as sk_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Masking, BatchNormalization, GlobalAveragePooling1D, GaussianDropout,\
                         TimeDistributed, GlobalMaxPooling1D, GaussianNoise, Dropout
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l1, l2
from keras.optimizers import Adam, Nadam

rnd_state = 2016
%matplotlib inline

def cross_validation(x, y, kfold, algo):
    """Run K-fold cross-validation of ``algo`` and report the ROC AUC per fold.

    For every ``(train_index, test_index)`` pair yielded by ``kfold`` the
    features are standardised with a scaler fitted on the training part only,
    ``algo`` is refitted on that part, and the ROC AUC of the second
    ``predict_proba`` column is computed on the held-out part and printed.

    Parameters
    ----------
    x : DataFrame or array-like, shape (n_samples, n_features)
        Feature matrix.
    y : Series or array-like, shape (n_samples,)
        Labels passed to ``algo.fit`` and ``roc_auc_score``.
    kfold : iterable of (train_index, test_index)
        Index arrays used to slice ``x`` and ``y`` row-wise.
    algo : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is refitted
        in place on every fold, so after the call it holds the last fold's fit.

    Returns
    -------
    float
        Mean of the per-fold AUC values.
    """
    auc = []
    scaler = StandardScaler()

    # np.asarray accepts DataFrames/Series as well as plain NumPy arrays,
    # unlike DataFrame.as_matrix(), which was removed from pandas and never
    # existed on ndarrays.
    x_mat = np.asarray(x)
    y_mat = np.asarray(y)

    for train_index, test_index in kfold:
        x_train, x_test = x_mat[train_index], x_mat[test_index]
        y_train, y_test = y_mat[train_index], y_mat[test_index]

        # fit_transform re-estimates mean/std on this fold's training rows, so
        # no statistics leak from the held-out rows.
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        algo.fit(x_train, y_train)
        pred = algo.predict_proba(x_test)[:, 1]

        current_auc = roc_auc_score(y_test, pred)
        auc.append(current_auc)

        print("current auc = {}" .format(current_auc))

    mean_auc = np.mean(auc)
    print("mean auc = {0}" .format(mean_auc))

    return mean_auc
    
def plot_series(series):
    """Show ``series`` as a bar chart, one bar per entry, labelled by its index."""
    positions = range(len(series))
    figure, axes = plt.subplots()
    plt.bar(positions, series)
    plt.xticks(positions, series.index)
    plt.show()
    
def load_data(file_path, keep_cols=('class',)):
    """Read a feature CSV, fill missing values and drop constant columns.

    Missing numeric values are filled by ``slinear`` interpolation along the
    rows; whatever is still missing afterwards is replaced with ``0.0``.
    Numeric columns whose variance is below ``1e-9`` are then removed, except
    those named in ``keep_cols``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file.
    keep_cols : iterable of str, optional
        Columns that are never dropped even when they are constant. Defaults
        to ``('class',)``, which reproduces the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The cleaned table. Because constant columns are detected per file,
        different files may come back with different column sets.
    """
    print("Loading " + file_path)
    data = pd.read_csv(file_path, index_col=False)

    # Interpolation and variance are only defined for numeric columns; feeding
    # text columns (e.g. file names) to them fails on recent pandas releases.
    num_cols = data.select_dtypes(include=[np.number]).columns
    if len(num_cols):
        data[num_cols] = data[num_cols].interpolate(method='slinear', limit_direction='both')
    data.fillna(0.0, inplace=True)

    var = data.var(axis=0, numeric_only=True)
    protected = set(keep_cols)
    zvar_cols = [col for col in var[var < 1e-9].index if col not in protected]
    data = data.drop(zvar_cols, axis=1)
    if zvar_cols:
        print("Columns {} haves zero variance.\n" .format(zvar_cols))

    return data

def _pick_file_window(files, frac):
    """Return a random contiguous run of about ``frac`` of the unique values in ``files``."""
    uniq = files.drop_duplicates()
    n_pick = min(int(frac * len(uniq)), len(uniq))
    if n_pick <= 0:
        # Nothing to pick (empty pool or fraction too small): avoid calling
        # randint with an empty range, which raises ValueError.
        return uniq.iloc[:0]
    # randint's upper bound is exclusive, hence the +1: without it the window
    # ending at the last file could never be selected.
    start = np.random.randint(0, len(uniq) - n_pick + 1)
    return uniq.iloc[start:start + n_pick]


def train_test_split(data, test_frac):
    """Split ``data`` into train and test parts by whole files, per class.

    The unique ``file`` values of the rows with ``class == 1`` and of the
    remaining rows are handled separately. From each pool a contiguous window
    (in order of first appearance) holding ``int(test_frac * n_files)`` files is
    chosen at a random offset drawn from ``np.random``. Every row whose ``file``
    falls into either window goes to the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least the columns ``class`` and ``file``.
    test_frac : float
        Fraction of the files of each class to put into the test part.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_rows, test_rows)``; no file appears in both.
    """
    is_target_cls = (data['class'] == 1)

    # pd.concat replaces Series.append, which newer pandas no longer provides.
    test_cols = pd.concat([
        _pick_file_window(data.loc[is_target_cls, 'file'], test_frac),
        _pick_file_window(data.loc[~is_target_cls, 'file'], test_frac),
    ])

    is_test = data['file'].isin(test_cols)

    return data[~is_test], data[is_test]
    
def train_test_validation_split(data_list, test_frac, val_frac):
    """Split every table in ``data_list`` into train/test/validation and pool them.

    Each table is first split with ``train_test_split(table, test_frac)``; the
    remaining training rows are split again with ``val_frac``, so the
    validation fraction is relative to what is left after the test split.

    Returns a tuple ``(train, test, val)`` of DataFrames, each the row-wise
    concatenation over all tables with a fresh integer index.
    """
    parts = {'train': [], 'test': [], 'val': []}

    for table in data_list:
        remainder, held_out = train_test_split(table, test_frac)
        fit_rows, val_rows = train_test_split(remainder, val_frac)

        parts['train'].append(fit_rows)
        parts['test'].append(held_out)
        parts['val'].append(val_rows)

    return tuple(pd.concat(parts[key], ignore_index=True)
                 for key in ('train', 'test', 'val'))
    
def seq_transform(x_data, y_data, info_data, seq_len=10):
    """Cut consecutive rows into non-overlapping windows of ``seq_len`` rows.

    Windows start at rows ``0, seq_len, 2 * seq_len, ...``; trailing rows that
    do not fill a whole window are discarded. Each window is labelled with the
    label of its last row and described by the ``info_data`` row at that same
    position.

    Parameters
    ----------
    x_data : array-like, shape (n_rows, n_features)
        Feature rows.
    y_data : array-like or pandas.Series of length n_rows, or None
        Per-row labels. When ``None`` the returned label array is empty.
    info_data : pandas.DataFrame with n_rows rows
        Per-row metadata.
    seq_len : int, optional
        Window length, 10 by default.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, pandas.DataFrame)
        Windows of shape ``(n_windows, seq_len, n_features)``, labels with one
        single-element row per window, and the matching ``info_data`` rows.

    NOTE(review): windows are cut purely by row position; nothing here checks
    that a window stays inside one file/channel - confirm that is intended.
    """
    x_arr = np.asarray(x_data)
    # Convert to an array so that labels are picked by position; indexing a
    # Series with an integer looks the value up by index label instead.
    y_arr = None if y_data is None else np.asarray(y_data)

    X = []
    Y = []
    info_idxs = []
    for start in range(0, x_arr.shape[0] - seq_len + 1, seq_len):
        last = start + seq_len - 1
        X.append(x_arr[start:start + seq_len])
        info_idxs.append(last)
        if y_arr is not None:
            Y.append([y_arr[last]])

    # .iloc selects by position; the former .ix accessor was removed from
    # pandas and resolved rows by label, which breaks on non-unique indexes.
    return np.array(X), \
           np.array(Y), \
           info_data.iloc[info_idxs, :]
    
def feats_target_info_split(data):
    """Separate a table into feature columns, target column and file metadata.

    Returns ``(feats, target, info)`` where ``feats`` holds the columns accepted
    by ``feats_predicat``, ``target`` is the ``class`` column (``None`` when the
    table has no such column) and ``info`` holds the ``file`` and ``channel``
    columns.
    """
    feat_cols = [col for col in data.columns if feats_predicat(col)]
    feats = data[feat_cols]

    if 'class' in data.columns:
        target = data['class']
    else:
        target = None

    info = data[['file', 'channel']]
    return feats, target, info

def drop_zero(data):
    """Return only the rows of ``data`` whose ``is_zero`` column equals 0."""
    keep_rows = data['is_zero'] == 0
    return data[keep_rows]

def repeat_target(x_data, y_data, x):
    """Oversample the rows labelled 1 by appending ``x`` extra copies of them.

    ``y_data`` is compared with 1 and flattened to build the row mask, so it is
    expected to hold one label per row of ``x_data``. Returns the stacked
    ``(features, labels)`` pair: the original rows followed by ``x`` copies of
    the rows whose label is 1.
    """
    target_rows = (y_data == 1).flatten()
    extra_x = x_data[target_rows]
    extra_y = y_data[target_rows]

    x_parts = [x_data] + [extra_x for _ in range(x)]
    y_parts = [y_data] + [extra_y for _ in range(x)]

    return np.vstack(tuple(x_parts)), np.vstack(tuple(y_parts))

def augumentation_seq(data, n_obj):
    """Create synthetic sequences by splicing halves of two different sequences.

    For every sequence ``i`` in ``data``, ``n_obj`` partner sequences ``j != i``
    are drawn at random (``np.random``). Each new sequence takes its first
    ``seq_len // 2`` steps from ``i`` and the remaining steps from ``j``.

    Parameters
    ----------
    data : numpy.ndarray, shape (n_seq, seq_len, vec_len)
        Source sequences.
    n_obj : int
        Number of synthetic sequences to create per source sequence.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(n_seq * n_obj, seq_len, vec_len)``. It has
        zero rows when ``n_obj`` is not positive or when fewer than two
        sequences are available (no partner exists).
    """
    n_seq, seq_len, vec_len = data.shape[0], data.shape[1], data.shape[2]
    half = seq_len // 2

    if n_seq < 2 or n_obj <= 0:
        return np.zeros((0, seq_len, vec_len), dtype=np.float32)

    result = np.zeros((n_seq * n_obj, seq_len, vec_len), dtype=np.float32)

    pos = 0
    for i in range(n_seq):
        # Draw from the n_seq - 1 "other" sequences: values in [0, n_seq - 2]
        # are shifted up by one from i onwards, so j is uniform over all
        # indices except i. Every output slot is therefore filled (no leftover
        # all-zero rows) and the last sequence can be chosen as a partner.
        partners = np.random.randint(0, n_seq - 1, size=n_obj)
        partners = partners + (partners >= i)
        for j in partners:
            result[pos, :half, :] = data[i, :half, :]
            # Second half comes from the partner; previously it was taken from
            # i again, which merely duplicated the source sequence.
            result[pos, half:, :] = data[j, half:, :]
            pos += 1

    return result

def gen_seq(x_data, y_data, batch_size):
    """Endlessly yield class-balanced, randomly augmented training batches.

    Each batch holds ``batch_size // 2`` sequences built from the rows labelled
    1 and ``batch_size - batch_size // 2`` sequences built from the other rows.
    Independently for each class and with probability 0.5, the sequences are
    spliced (first half of one random sequence, second half of another random
    sequence of the same class); otherwise random sequences are used unchanged.
    The batch is shuffled before it is yielded.

    Parameters
    ----------
    x_data : numpy.ndarray, shape (n_seq, seq_len, vec_len)
        Sequences to sample from.
    y_data : array-like with one label per sequence
        Labels; entries equal to 1 mark the target class.
    batch_size : int
        Number of sequences per yielded batch.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``(batch, labels)`` with shapes ``(batch_size, seq_len, vec_len)``
        (float32) and ``(batch_size, 1)``.

    Raises
    ------
    ValueError
        On the first ``next()`` call if either class has no sequences.
    """
    seq_len = x_data.shape[1]
    vec_len = x_data.shape[2]
    half = seq_len // 2

    is_target = (np.asarray(y_data) == 1).flatten()

    target_data = x_data[is_target, :, :]
    non_target_data = x_data[~is_target, :, :]
    if target_data.shape[0] == 0 or non_target_data.shape[0] == 0:
        raise ValueError("gen_seq needs at least one sequence of each class")

    # Explicit per-class counts keep batch and label shapes consistent when
    # batch_size is odd.
    n_target = batch_size // 2
    n_non_target = batch_size - n_target

    def sample(pool, count):
        # randint's upper bound is exclusive, so pool.shape[0] (not
        # pool.shape[0] - 1) is needed for the last sequence to be reachable
        # and for a single-sequence pool to work.
        first = np.random.randint(0, pool.shape[0], size=count)
        second = np.random.randint(0, pool.shape[0], size=count)
        if np.random.rand() < 0.5:
            return np.concatenate((pool[first, :half, :],
                                   pool[second, half:, :]), axis=1)
        return pool[first, :, :]

    batch = np.zeros((batch_size, seq_len, vec_len), dtype=np.float32)
    labels = np.vstack((np.ones((n_target, 1)),
                        np.zeros((n_non_target, 1))))

    while True:
        batch[:n_target, :, :] = sample(target_data, n_target)
        batch[n_target:, :, :] = sample(non_target_data, n_non_target)

        perm = np.random.permutation(batch_size)

        # Fancy indexing already returns fresh arrays, so the reused `batch`
        # buffer is never handed out to the consumer.
        yield batch[perm, :, :], labels[perm]
        
                

# Column names accepted as features when they match exactly.
_FEAT_EXACT_NAMES = (
    'max_mag_freq', 'pitch_salience', 'flatnessSFX', 'strong_peak', 'zcr',
    'percspread', 'percsharpness', 'flux', 'roll_off', 'kurtosis',
    'compl', 'leq', 'larm', 'geo_mean', 'corr', 'skew', 'hfc',
    'loudness', 'derAvAfterMax', 'min_to_total', 'maxDerBeforeMax',
    'max_to_total', 'specflat', 'specslope', 'entropy', 'rms', 'spec_cent',
)

# Column-name prefixes accepted as features. The 'order' and 'log_freq'
# prefixes are deliberately left out (they were disabled in the original
# predicate). 'obsi' also covers every name starting with 'obsir'.
_FEAT_PREFIXES = (
    'freq_energy', 'gfcc', 'ref', 'obsi', 'lpc', 'lsf',
    'distr', 'moment', 'channel',
)


def feats_predicat(x):
    """Return True when column name ``x`` should be used as a model feature."""
    return x in _FEAT_EXACT_NAMES or x.startswith(_FEAT_PREFIXES)
                        

Using Theano backend.
Using gpu device 3: Tesla M2075 (CNMeM is enabled with initial size: 80.0% of memory, cuDNN not available)


In [2]:
# Locations of the three training feature tables.
data_path_1 = r"../data/feats_1.csv"
data_path_2 = r"../data/feats_2.csv"
data_path_3 = r"../data/feats_3.csv"

# Load the tables in order; load_data fills gaps and drops constant columns.
data_1, data_2, data_3 = [
    load_data(csv_path)
    for csv_path in (data_path_1, data_path_2, data_path_3)
]

# Report the size of every table once all of them are loaded.
print("data_1 shape = {}" .format(data_1.shape))
print("data_2 shape = {}" .format(data_2.shape))
print("data_3 shape = {}" .format(data_3.shape))

Loading ../data/feats_1.csv
Columns ['specslope'] haves zero variance.

Loading ../data/feats_2.csv
Columns ['specslope'] haves zero variance.

Loading ../data/feats_3.csv
Columns ['specslope'] haves zero variance.

data_1 shape = (114768, 169)
data_2 shape = (291888, 169)
data_3 shape = (310752, 169)


## Neural network

In [3]:
# Fractions of files held out for testing and, from the remainder, validation.
test_frac = 0.1
val_frac = 0.1

data_list = [data_1, data_2, data_3]

# Columns that are one-hot encoded instead of being standardised.
categorical_cols = ['channel']

train, test, val = train_test_validation_split(data_list, test_frac, val_frac)
x_train, y_train, info_train = feats_target_info_split(train)
x_test, y_test, info_test = feats_target_info_split(test)
x_val, y_val, info_val = feats_target_info_split(val)

# The training split defines the dummy columns. The other splits are encoded
# without drop_first and then aligned to those columns, so a category that is
# missing from a split can neither shift nor remove a column.
x_train_categor = pd.get_dummies(x_train[categorical_cols], columns=categorical_cols, drop_first=True)
x_test_categor = pd.get_dummies(x_test[categorical_cols], columns=categorical_cols) \
                   .reindex(columns=x_train_categor.columns, fill_value=0)
x_val_categor = pd.get_dummies(x_val[categorical_cols], columns=categorical_cols) \
                  .reindex(columns=x_train_categor.columns, fill_value=0)

cols = list(filter(lambda x: x not in categorical_cols, x_train.columns))

# Standardise the numeric features with statistics from the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train[cols])
x_test = scaler.transform(x_test[cols])
x_val = scaler.transform(x_val[cols])

# .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
x_train = np.hstack([x_train, x_train_categor.values])
x_test = np.hstack([x_test, x_test_categor.values])
x_val = np.hstack([x_val, x_val_categor.values])

#x_train, y_train = repeat_target(x_train, y_train.reshape(-1, 1), 2)

print("train shape = {}" .format(x_train.shape))
print("test shape = {}" .format(x_test.shape))
print("val shape = {}" .format(x_val.shape))

train shape = (582048, 177)
test shape = (71280, 177)
val shape = (64080, 177)


In [None]:
# Histogram of the training labels.
plt.hist(y_train)
plt.show()

In [None]:
# Activation used by all hidden layers.
activation = 'tanh'

# Fully connected classifier: hidden layers of 128, 64 and 32 units followed by
# a single sigmoid output unit. The Dropout layers are currently disabled.
net = Sequential()
net.add(Dense(128, input_dim=x_train.shape[1], init='he_normal', activation=activation))
#net.add(Dropout(0.2))
net.add(Dense(64, init='he_normal', activation=activation))
#net.add(Dropout(0.2))
net.add(Dense(32, init='he_normal', activation=activation))
#net.add(Dropout(0.2))
# NOTE(review): input_dim on this non-first layer looks like a copy-paste
# leftover; presumably it is ignored - confirm and drop it.
net.add(Dense(1, input_dim=x_train.shape[1], init='he_normal', activation='sigmoid'))
opt = Adam()
net.compile(loss='binary_crossentropy', optimizer=opt)

# Callback that monitors val_loss and, with save_best_only, keeps only the best
# model seen so far in ../net/best_net.hdf5.
checkpoint = ModelCheckpoint('../net/best_net.hdf5', 
                              monitor='val_loss', 
                              verbose=0, 
                              save_best_only=True, 
                              mode='auto')

In [None]:
# Restore weights from the file the ModelCheckpoint callback writes to.
# This cell needs that file to exist already (i.e. a previous training run).
net.load_weights("../net/best_net.hdf5")

In [None]:
# Train on the training split and evaluate on the validation split after each
# epoch; the checkpoint callback saves the best weights. nb_epoch is very
# large - presumably training is meant to be interrupted by hand (confirm).
net.fit(x_train, y_train, 
        validation_data=(x_val, y_val), 
        shuffle=True,
        callbacks=[checkpoint],
        batch_size=256, 
        nb_epoch=10000)

Train on 582048 samples, validate on 64080 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
105472/582048 [====>.........................] - ETA: 5s - loss: 0.3350

In [None]:
# Loss of the current network on the held-out test split.
score = net.evaluate(x_test, y_test)
print("\nloss = {}" .format(score))

# ROC AUC of the predicted scores on the same split.
pred = net.predict(x_test)
current_auc = roc_auc_score(y_test, pred)
print("auc = {}" .format(current_auc))

In [None]:
# Column-shaped copy of the test labels so the mask matches `pred`, which is
# indexed here as a 2-D array. np.asarray is used because y_test is a pandas
# Series in this section and Series.reshape is not available in newer pandas.
y_test_col = np.asarray(y_test).reshape(-1, 1)

# Distribution of predicted scores for the rows labelled 1 ...
plt.hist(pred[y_test_col == 1])
plt.show()
# ... and for the rows labelled 0.
plt.hist(pred[y_test_col == 0])
plt.show()

In [None]:
# Save the current weights of the dense network, replacing any existing file.
weights_file = "../net/net1.hdf5"
net.save_weights(weights_file, overwrite=True)

## RNN

In [None]:
# Fractions of files held out for testing and, from the remainder, validation.
test_frac = 0.1
val_frac = 0.2

data_list = [data_1, data_2, data_3]

# NOTE: this cell overwrites train/test/val, x_*/y_*/info_*, cols and scaler
# from the dense-network section with a freshly drawn random split.
train, test, val = train_test_validation_split(data_list, test_frac, val_frac)
x_train, y_train, info_train = feats_target_info_split(train)
x_test, y_test, info_test = feats_target_info_split(test)
x_val, y_val, info_val = feats_target_info_split(val)

categorical_cols = ['channel']
# The triple-quoted block below is a bare string expression, i.e. disabled
# code: the categorical columns are not one-hot encoded in this section.
'''x_train_categor = pd.get_dummies(x_train[categorical_cols], columns=categorical_cols, drop_first=True)
x_test_categor = pd.get_dummies(x_test[categorical_cols], columns=categorical_cols, drop_first=True)
x_val_categor = pd.get_dummies(x_val[categorical_cols], columns=categorical_cols, drop_first=True)'''

cols = list(filter(lambda x: x not in categorical_cols, x_train.columns))

# Standardise the non-categorical features with training-split statistics.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train[cols])
x_test = scaler.transform(x_test[cols])
x_val = scaler.transform(x_val[cols])

# Also disabled: appending the one-hot columns to the scaled features.
'''x_train = np.hstack([x_train, x_train_categor.as_matrix()])
x_test = np.hstack([x_test, x_test_categor.as_matrix()])
x_val = np.hstack([x_val, x_val_categor.as_matrix()])'''

# Group consecutive rows into fixed-length windows (seq_transform's default
# seq_len); the label and info of a window come from its last row.
x_train, y_train, info_train = seq_transform(x_train, y_train, info_train)
x_test, y_test, info_test = seq_transform(x_test, y_test, info_test)
x_val, y_val, info_val = seq_transform(x_val, y_val, info_val)

print("train shape = {}" .format(x_train.shape))
print("test shape = {}" .format(x_test.shape))
print("val shape = {}" .format(x_val.shape))

In [None]:
# Mask of the training sequences labelled 1.
is_target = (y_train == 1).flatten()

# Synthetic sequences generated from the class-1 sequences (3 per source
# sequence); all of them are labelled 1.
aug_x_train_target = augumentation_seq(x_train[is_target, :, :], 3)
aug_y_train_target = np.ones((aug_x_train_target.shape[0], 1), dtype=np.float32)

# With n_obj=0 augumentation_seq returns an array with zero rows, so no
# synthetic sequences are added for the other class.
aug_x_train = augumentation_seq(x_train[~is_target, :, :], 0)
aug_y_train = np.zeros((aug_x_train.shape[0], 1), dtype=np.float32)


# Append the synthetic sequences to the training set. Re-running this cell
# augments the already augmented data again.
x_train = np.vstack((x_train, aug_x_train_target, aug_x_train))
y_train = np.vstack((y_train, aug_y_train_target, aug_y_train))

In [None]:
# Histogram of the sequence labels in the current training set.
plt.hist(y_train)
plt.show()

In [None]:
# Activations and weight initialiser shared by the layers below.
activation = 'tanh'
inner_activation = 'tanh'
initialize = 'he_normal'

# Sequence classifier: a per-timestep Dense(256) with dropout, an LSTM(128)
# returning the whole sequence, average pooling over time and a sigmoid output.
net = Sequential()
#net.add(TimeDistributed(GaussianNoise(0.3), input_shape=(x_train.shape[1], x_train.shape[2])))
#net.add(TimeDistributed(Dense(512, init=initialize, activation=activation), input_shape=(x_train.shape[1], x_train.shape[2])))
#net.add(TimeDistributed(Dropout(0.1)))
net.add(TimeDistributed(Dense(256, init=initialize, activation=activation), input_shape=(x_train.shape[1], x_train.shape[2])))
net.add(TimeDistributed(Dropout(0.1)))
# NOTE(review): input_shape on this non-first layer does not match the 256
# features produced by the previous layer; presumably it is ignored - confirm.
net.add(LSTM(128, return_sequences=True,
             input_shape=(x_train.shape[1], x_train.shape[2]), 
             init=initialize, activation=activation, dropout_W=0.1, dropout_U=0.1, 
             inner_activation=inner_activation))
net.add(GlobalAveragePooling1D())
net.add(Dense(1, init=initialize, activation='sigmoid'))
net.compile(loss='binary_crossentropy', optimizer='nadam')

# Callback that monitors val_loss and, with save_best_only, keeps only the best
# model seen so far in ../net/best_net_rnn.hdf5.
checkpoint = ModelCheckpoint('../net/best_net_rnn.hdf5', 
                          monitor='val_loss', 
                          verbose=0, 
                          save_best_only=True, 
                          mode='auto')

In [None]:
# Restore weights from the file the RNN ModelCheckpoint callback writes to.
# This cell needs that file to exist already (i.e. a previous training run).
net.load_weights("../net/best_net_rnn.hdf5")

In [None]:
# Training variant 1: fit directly on the (augmented) training sequences,
# validating on the validation split after every epoch.
net.fit(x_train, y_train, 
        validation_data=(x_val, y_val), 
        shuffle=True,
        callbacks=[checkpoint],
        batch_size=256, 
        nb_epoch=1000)

In [None]:
# Training variant 2: fit on batches produced on the fly by gen_seq.
batch_size = 2048
generator = gen_seq(x_train, y_train, batch_size)

# samples_per_epoch equals 50 batches of `batch_size` sequences.
net.fit_generator(generator, validation_data=(x_val, y_val), 
                  samples_per_epoch=50 * batch_size, 
                  nb_epoch=1000, 
                  callbacks=[checkpoint])

In [None]:
# Loss of the current sequence model on the held-out test sequences.
score = net.evaluate(x_test, y_test)
print("\nloss = {}" .format(score))

# ROC AUC of the predicted scores on the same sequences.
pred = net.predict(x_test)
current_auc = roc_auc_score(y_test, pred)
print("auc = {}" .format(current_auc))

In [None]:
# Distribution of predicted scores for the test sequences labelled 1 ...
plt.hist(pred[y_test == 1])
plt.show()
# ... and for those labelled 0.
plt.hist(pred[y_test == 0])
plt.show()

In [None]:
# Save the current weights of the sequence model, replacing any existing file.
# NOTE(review): this writes to the notebook's directory, whereas the other
# weight files go to ../net/ - confirm the path is intended.
weights_file = "./rnn.hdf5"
net.save_weights(weights_file, overwrite=True)

## Gradient boosting (XGBoost) and random forest

In [None]:
# Fractions of files held out for testing and, from the remainder, validation.
test_frac = 0.1
val_frac = 0.2

data_list = [data_1, data_2, data_3]

# NOTE: this cell overwrites the split, the feature matrices, cols and scaler
# left behind by the earlier sections with a freshly drawn random split.
train, test, val = train_test_validation_split(data_list, test_frac, val_frac)
x_train, y_train, info_train = feats_target_info_split(train)
x_test, y_test, info_test = feats_target_info_split(test)
x_val, y_val, info_val = feats_target_info_split(val)

# Columns that are one-hot encoded instead of being standardised.
categorical_cols = ['channel']
# The training split defines the dummy columns. The other splits are encoded
# without drop_first and then aligned to those columns, so a category that is
# missing from a split can neither shift nor remove a column.
x_train_categor = pd.get_dummies(x_train[categorical_cols], columns=categorical_cols, drop_first=True)
x_test_categor = pd.get_dummies(x_test[categorical_cols], columns=categorical_cols) \
                   .reindex(columns=x_train_categor.columns, fill_value=0)
x_val_categor = pd.get_dummies(x_val[categorical_cols], columns=categorical_cols) \
                  .reindex(columns=x_train_categor.columns, fill_value=0)

cols = list(filter(lambda x: x not in categorical_cols, x_train.columns))

# Standardise the numeric features with statistics from the training split only.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train[cols])
x_test = scaler.transform(x_test[cols])
x_val = scaler.transform(x_val[cols])

# .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
x_train = np.hstack([x_train, x_train_categor.values])
x_test = np.hstack([x_test, x_test_categor.values])
x_val = np.hstack([x_val, x_val_categor.values])

#x_train, y_train = repeat_target(x_train, y_train.reshape(-1, 1), 4)

print("train shape = {}" .format(x_train.shape))
print("test shape = {}" .format(x_test.shape))
print("val shape = {}" .format(x_val.shape))

In [None]:
# Gradient-boosted trees with a logistic objective; the seed is fixed to
# rnd_state, rows and columns are subsampled at 0.7 per tree.
xgb_gb = xgb.XGBClassifier(n_estimators=100,
                           max_depth=4,
                           learning_rate=0.3,
                           objective='binary:logistic',
                           max_delta_step=1,
                           silent=False,
                           seed=rnd_state,
                           subsample=0.7,
                           colsample_bytree=0.7,
                           min_child_weight=9,
                           scale_pos_weight=0.75,
                           reg_lambda=0,
                           nthread=12)

xgb_gb.fit(x_train, y_train.ravel())

# ROC AUC on the validation split.
pred = xgb_gb.predict_proba(x_val)[:, 1]
current_auc = roc_auc_score(y_val, pred)
print("auc = {}" .format(current_auc))

# ROC AUC on the test split; `pred` keeps the test-split scores afterwards.
pred = xgb_gb.predict_proba(x_test)[:, 1]
current_auc = roc_auc_score(y_test, pred)
print("auc = {}" .format(current_auc))

In [None]:
%matplotlib inline

plt.hist(pred.reshape(-1, 1)[y_test.reshape(-1, 1) == 1])
plt.show()
plt.hist(pred.reshape(-1, 1)[y_test.reshape(-1, 1) == 0])
plt.show()

In [None]:
# Switches matplotlib to the interactive notebook backend for this plot; later
# cells keep this backend until %matplotlib inline is run again.
%matplotlib notebook
# Feature importances of the fitted XGBoost model.
xgb.plot_importance(xgb_gb.booster())
plt.show()

In [None]:
# Random forest with entropy splits and the seed fixed to rnd_state.
rf = RandomForestClassifier(n_estimators=100,
                           criterion='entropy',
                           max_depth=8,
                           min_samples_split=10,
                           min_samples_leaf=5,
                           n_jobs=12,
                            oob_score=True,
                           random_state=rnd_state,
                           verbose=0)
    
rf.fit(x_train, y_train.ravel())

# ROC AUC on the validation split.
pred = rf.predict_proba(x_val)[:, 1]
current_auc = roc_auc_score(y_val, pred)
print("auc = {}" .format(current_auc))

# ROC AUC on the test split; `pred` keeps the test-split scores afterwards.
pred = rf.predict_proba(x_test)[:, 1]
current_auc = roc_auc_score(y_test, pred)
print("auc = {}" .format(current_auc))

# Fold definition for cross_validation; unused while the call below stays
# commented out.
kfold = KFold(n=x_train.shape[0],
              n_folds=5,
              shuffle=True,
              random_state=rnd_state)

#auc_score = cross_validation(x_train, y_train, kfold, rf)

In [None]:
# `data` was not defined by any earlier cell, so this cell failed on a fresh
# kernel. It is built here from all three training tables.
# NOTE(review): assumes the cross-validation was meant to run on the complete
# training data - confirm.
data = pd.concat([data_1, data_2, data_3], ignore_index=True)

# NOTE: overwrites x_train/y_train/info_train with unscaled DataFrame/Series
# versions covering every row of `data`.
x_train, y_train, info_train = feats_target_info_split(data)

kfold = KFold(n=x_train.shape[0],
              n_folds=5,
              shuffle=True,
              random_state=rnd_state)

rf = RandomForestClassifier(n_estimators=100,
                           criterion='entropy',
                           max_depth=18,
                           min_samples_split=2,
                           min_samples_leaf=6,
                           n_jobs=12,
                           random_state=rnd_state)

auc_score = cross_validation(x_train, y_train, kfold, rf)

# cross_validation refits `rf` on every fold, so these importances belong to
# the model fitted on the last fold.
plot_series(pd.Series(rf.feature_importances_, index=x_train.columns))

# Предсказание (Prediction)

In [None]:
# Locations of the three feature tables to predict on.
test_data_path_1 = r"../data/test_feats_1.csv"
test_data_path_2 = r"../data/test_feats_2.csv"
test_data_path_3 = r"../data/test_feats_3.csv"

# load_data applies the same gap filling and constant-column removal that was
# applied to the training tables.
test_data_1 = load_data(test_data_path_1)
test_data_2 = load_data(test_data_path_2)
test_data_3 = load_data(test_data_path_3)

# The labels now name the test tables; previously they repeated the training
# labels "data_1..3", which made the two printouts indistinguishable.
print("test_data_1 shape = {}" .format(test_data_1.shape))
print("test_data_2 shape = {}" .format(test_data_2.shape))
print("test_data_3 shape = {}" .format(test_data_3.shape))

In [None]:
# Feature preparation for the per-row models (dense network, XGBoost, random
# forest). Relies on categorical_cols, cols, scaler and x_train_categor from
# the data-preparation cell of the model that will be used for prediction.

# Remember which source table every row came from.
test_data_1['id'] = 1
test_data_2['id'] = 2
test_data_3['id'] = 3

# pd.concat replaces DataFrame.append, which newer pandas no longer provides;
# ignore_index gives the combined table a unique row index.
test_data = pd.concat([test_data_1, test_data_2, test_data_3], ignore_index=True)

# .values replaces .as_matrix(), which newer pandas no longer has.
uniq_files = np.unique(test_data['file'].values)
file = test_data['file'].values.reshape((-1, 1))

test_feats = feats_target_info_split(test_data)[0]
# Encode without drop_first and align to the training dummy columns so the
# feature layout matches what the model was fitted on even when a category is
# missing from the test data.
test_feats_categor = pd.get_dummies(test_feats[categorical_cols], columns=categorical_cols) \
                       .reindex(columns=x_train_categor.columns, fill_value=0)
test_feats = scaler.transform(test_feats[cols])
test_feats = np.hstack([test_feats, test_feats_categor.values])

print("feats shape = {}" .format(test_feats.shape))

In [None]:
# Alternative feature preparation for the sequence model: it overwrites
# test_data, test_feats, file and uniq_files produced by the previous cell.
# Relies on cols and scaler from the RNN data-preparation cell.
test_data_1['id'] = 1
test_data_2['id'] = 2
test_data_3['id'] = 3
test_data = pd.concat([test_data_1, test_data_2, test_data_3], ignore_index=True)

# test_target is None when the test tables have no 'class' column.
test_feats, test_target, test_info = feats_target_info_split(test_data)
#test_feats_categor = pd.get_dummies(test_feats[categorical_cols], columns=categorical_cols, drop_first=True)
test_feats = scaler.transform(test_feats[cols])
#test_feats = np.hstack([test_feats, test_feats_categor.as_matrix()])
# Cut the rows into windows; test_info keeps one row (the last) per window.
test_feats, _, test_info = seq_transform(test_feats, test_target, test_info)

# File name of every window, as a column vector, and the distinct file names.
file = test_info['file'].as_matrix().reshape((-1, 1))
uniq_files = np.unique(file)

print("feats shape = {}" .format(test_feats.shape))

In [None]:
# Score the prepared test features. Exactly one model is active at a time; the
# network variant is kept commented out as the alternative.
# `result` is a column vector with one score per row of test_feats.
#result = net.predict(test_feats)
result = xgb_gb.predict_proba(test_feats)[:, 1].reshape(-1, 1)

In [None]:
# Distribution of the per-row scores on the test data.
plt.hist(result)
plt.show()

In [None]:
# Aggregate the per-row scores into one score per file and write the result
# table: column 0 holds the file name, column 1 the aggregated score.
n_files = len(uniq_files)
# The builtin `object` replaces the np.object alias, which NumPy has removed.
result_mat = np.zeros((n_files, 2), dtype=object)

for i in tqdm(range(n_files)):
    result_mat[i, 0] = uniq_files[i]
    # `file` and `result` are column vectors of equal length, so the boolean
    # mask selects exactly the scores that belong to this file.
    frames = result[file == uniq_files[i]]
    if frames.shape[0] == 0:
        result_mat[i, 1] = 0.0
    else:
        # Geometric mean of the file's scores.
        result_mat[i, 1] = gmean(frames)

res_data = pd.DataFrame(result_mat, columns=['File', 'Class'])
res_data.to_csv('../result.csv', index=False)

In [None]:
# Distribution of the aggregated per-file scores that were written to the CSV.
res_data['Class'].hist()
plt.show()