____________________

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
%load_ext autoreload
%autoreload 2

In [27]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import  f1_score, accuracy_score
from utils import load_obj
import seaborn as sns

In [4]:
# Root directory of the raw dataset. Set the ITMO_DATA_ROOT environment
# variable to run the notebook on another machine; the default keeps the
# location that was originally hard-coded here.
DATA_ROOT = os.environ.get('ITMO_DATA_ROOT',
                           '/media/grigory/Диск/ITMO_DATA/data_v_7_stc')

# Path to the metadata file.
meta_pth = os.path.join(DATA_ROOT, 'meta', 'meta.txt')

# Paths to the directories with audio files.
train_audio_pth = os.path.join(DATA_ROOT, 'audio')
test_audio_pth = os.path.join(DATA_ROOT, 'test')

# Directory (relative path) holding the pre-extracted features.
extracted_data = 'data/extracted'

# File names of the extracted features for the dense network, linear models and trees.
extracted_train = 'features_labels_train'
extracted_test = 'features_labels_test'

загружаем извлечённые признаки

In [5]:
# Load the pre-extracted training payload: feature matrix, file names and class labels.
# NOTE(review): load_obj comes from the project's utils module; presumably it
# deserializes a previously saved object — confirm the storage format there.
X, names, labels = load_obj(os.path.join(extracted_data, extracted_train))
# Only the first two items of the test payload are kept (features and file names).
X_test, names_test = load_obj(os.path.join(extracted_data, extracted_test))[:2]

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Encode the target: integer class ids (used by the sklearn-style models) ...
le = LabelEncoder()
y = le.fit_transform(labels)
# ... and a one-hot encoding built from those integer ids (used by the neural networks).
ohe =  OneHotEncoder()
y_ohe = ohe.fit_transform(y.reshape(-1,1))

In [7]:
# Number of feature columns in the training matrix.
n_features = X.shape[1]
# y holds LabelEncoder ids, which start at 0, so the largest id + 1 is the class count.
n_classes = y.max()+1
# Prints the class count, feature count and sample count (message text is in Russian).
print('Число классов: {}\nЧисло признаков: {}\nЧисло сэмплов: {}'.format(n_classes, n_features, y.shape[0]))

Число классов: 8
Число признаков: 193
Число сэмплов: 11307


> для правильной валидации нужно учитывать, что некоторые сэмплы являются частями одной записи.
Если части одной записи окажутся одновременно и в train, и в val, то произойдёт утечка меток в валидацию.

Поэтому сперва найдём уникальные записи и сгруппируем все фрагменты по принадлежности к отдельной записи.

In [8]:
def _record_id(file_name):
    """Return the id of the source recording that a fragment file belongs to.

    Everything from the 'time_stretch' marker onwards is dropped, then a
    trailing '.wav' extension and surrounding underscores are removed.
    """
    base = file_name.split('time_stretch')[0]
    # str.strip('.wav') removes any of the characters '.', 'w', 'a', 'v' from
    # BOTH ends (it is not a suffix removal) and can eat real characters of
    # the recording name, so the extension is removed explicitly instead.
    if base.endswith('.wav'):
        base = base[:-len('.wav')]
    return base.strip('_')


# Integer group id per sample: fragments of the same recording share one id.
groups_le = LabelEncoder()
unique_samples = pd.Series(names).apply(_record_id)
groups = groups_le.fit_transform(unique_samples)

In [9]:
from sklearn.model_selection import GroupShuffleSplit
# A single 80/20 split driven by the group ids, with a fixed seed.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=7)

# Split sample positions rather than the data itself, so the resulting index
# arrays can be reused to slice every parallel array (features, names, labels).
idxs = np.arange(X.shape[0])
tr_idxs, val_idxs = next(iter(gss.split(idxs, groups=groups)))

In [10]:
# Apply the train/validation indices to every parallel array...
# file names
names_train, names_val = names[tr_idxs], names[val_idxs]
# extracted features
X_train, X_val = X[tr_idxs], X[val_idxs]
# labels for sklearn
y_train, y_val = y[tr_idxs], y[val_idxs]
# labels for the neural networks (one-hot)
y_ohe_train, y_ohe_val = y_ohe[tr_idxs], y_ohe[val_idxs]

In [11]:
# Group ids of the train / validation samples, sliced with the same tr_idxs / val_idxs.
groups_train, groups_val = groups[tr_idxs], groups[val_idxs]

## Нелинейные модели

#### бустинг и лес

In [12]:
%%time
# !pip install lightgbm
import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier

# Gradient-boosting model; the commented-out arguments below are disabled
# (presumably leftovers of earlier experiments).
lgbm = LGBMClassifier(learning_rate=1e-2,
#                        reg_alpha=1e-2,
#                        reg_beta=1e-1,
#                      valid_sets=[X_val, y_val],
                       random_state=7,
                     n_estimators=500)
lgbm.fit(X_train, y_train);

# Evaluate quality on the held-out set
print('Scoring...')
tr_score = lgbm.score(X_train, y_train)
val_score = lgbm.score(X_val, y_val)
print('Acc:: Train score: {}, val score: {}'.format(tr_score, val_score))

Scoring...


  if diff:


Acc:: Train score: 1.0, val score: 0.9471132657558395
CPU times: user 3min 53s, sys: 728 ms, total: 3min 54s
Wall time: 1min


  if diff:


лучшие параметры  для случайного леса были найдены с помощью RandomizedSearch. Поиск занимает около 40 мин.

Параметры были записаны в best_params, поэтому ячейка с перебором закомментирована.

In [13]:
# # ~ 40 min
# NOTE(review): no `cv` argument is passed to RandomizedSearchCV below, so the
# default (non-group) splitter is used and the `groups=` argument given to
# fit() has no effect on the folds. Pass a group-aware splitter (e.g.
# cv=GroupKFold(...)) if this search is ever re-enabled, otherwise fragments of
# one recording can land in both the train and the validation folds.
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# from sklearn.ensemble import RandomForestClassifier

# estimator = RandomForestClassifier(random_state=7)
# gs_params = dict(
#     max_depth=[3,10, 15, 50, 100, 150],
#     n_estimators=[5,10, 25, 50, 100, 150, 250],
#     max_features= list(np.linspace(0.1,1,20)) + ['auto', 'sqrt'],
#     bootstrap=[True, False]
# )
# gs = RandomizedSearchCV(estimator, gs_params, n_jobs=-1, random_state=7,
#                         n_iter=100, verbose=2)#GridSearchCV(trees, gs_params)
# gs.fit(X_train, y_train, groups=groups_train);

In [16]:
# trees = gs.best_estimator_
from sklearn.ensemble import RandomForestClassifier
# Hyper-parameters are hard-coded here so that the (commented-out) search
# cell does not have to be re-run.
best_params = {'n_estimators': 100, 
               'max_features': 0.19473684210526315, 
               'max_depth': 100, 'bootstrap': False,
               'random_state':12}
trees = RandomForestClassifier(**best_params)
trees.fit(X_train, y_train)
# Evaluate quality on the held-out set
print('Scoring...')
tr_score = trees.score(X_train, y_train)
val_score = trees.score(X_val, y_val)
print('Acc:: Train score: {}, val score: {}'.format(tr_score, val_score))

Scoring...
Acc:: Train score: 1.0, val score: 0.9519612163948876


## Линейные модели

In [17]:
# # 5 min
# from sklearn.linear_model import LogisticRegression
# logreg = LogisticRegression(random_state=7)
# gs_params = dict(
#     C = np.logspace(-3,3,10),
# )
# gs = RandomizedSearchCV(logreg, gs_params, n_jobs=-1, random_state=7,
#                         n_iter=10, verbose=1)#GridSearchCV(trees, gs_params)
# gs.fit(X_train, y_train, groups=groups_train);

# logreg = gs.best_estimator_
# logreg.fit(X_train, y_train)
# # Оцениваем качества на отложенной выборке
# print('Scoring...')
# tr_score = logreg.score(X_train, y_train)
# val_score = logreg.score(X_val, y_val)
# print('Acc:: Train score: {}, val score: {}'.format(tr_score, val_score))

In [20]:
from sklearn.linear_model import LogisticRegression
# Note: this rebinds `best_params`, replacing the random-forest parameters
# stored under the same name in an earlier cell.
best_params = {'C': 1000.0}
logreg = LogisticRegression(**best_params, random_state=7)
logreg.fit(X_train, y_train)
# Evaluate quality on the held-out set
print('Scoring...')
tr_score = logreg.score(X_train, y_train)
val_score = logreg.score(X_val, y_val)
print('Acc:: Train score: {}, val score: {}'.format(tr_score, val_score))

Scoring...
Acc:: Train score: 0.9470015490152689, val score: 0.8973115910092552


_________

#### онлайн-обучение нейронных сеток

Обзор методов на нейронных сетях:

- http://www.fim.uni-passau.de/fileadmin/files/lehrstuhl/schuller/Publications/Amiriparian17-SSC.pdf
- https://github.com/libphy/which_animal
- https://github.com/jaron/deep-listening
- https://musicinformationretrieval.com/mfcc.html

Тестировались:
    - полносвязанные сети
    - lstm
    - свёрточные

Самые лучшие результаты показали полносвязные сети.


*Коротко о подборе оптимальной архитектуры для Dense сети*:

<img width=350 src="https://habrastorage.org/webt/sg/7t/tu/sg7ttuirleaml3_j7dwo2tn0iqs.png">

In [21]:
import keras
from keras import backend as K
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [22]:
from models import (build_deep_dense, build_conv_seq, build_lstm_seq)
K.clear_session()

##### Dense сеть

In [23]:
# Build the fully connected network (project helper from models.py).
model = build_deep_dense(n_features, n_classes)


model.compile(loss='categorical_crossentropy', 
              optimizer=Adam(1e-4), 
              metrics=['accuracy'])
callbacks = [
    # Stop when val_loss has not improved for 25 epochs.
    EarlyStopping(patience=25, monitor='val_loss'),
    # Keep only the best weights; with period=4 the check runs every 4th epoch.
    ModelCheckpoint('models_weights/net_ff.h5', monitor='val_loss', 
                    verbose=1, save_best_only=True, period=4),
    # Divide the learning rate by 10 after 10 epochs without val_loss improvement.
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, 
                      patience=10, verbose=1, mode='auto')
]
# The one-hot targets are sparse matrices. .toarray() yields a plain ndarray,
# whereas .todense() yields np.matrix. validation_data is passed as the
# documented (x_val, y_val) tuple rather than a list.
model.fit(X_train, y_ohe_train.toarray(),
          epochs=300, batch_size=250,
          verbose=0,
          validation_data=(X_val, y_ohe_val.toarray()),
          callbacks=callbacks)

Epoch 00004: val_loss improved from inf to 1.19687, saving model to models_weights/net_ff.h5
Epoch 00008: val_loss improved from 1.19687 to 0.67102, saving model to models_weights/net_ff.h5
Epoch 00012: val_loss improved from 0.67102 to 0.49071, saving model to models_weights/net_ff.h5
Epoch 00016: val_loss improved from 0.49071 to 0.41781, saving model to models_weights/net_ff.h5
Epoch 00020: val_loss improved from 0.41781 to 0.40695, saving model to models_weights/net_ff.h5
Epoch 00024: val_loss improved from 0.40695 to 0.37263, saving model to models_weights/net_ff.h5
Epoch 00028: val_loss improved from 0.37263 to 0.31864, saving model to models_weights/net_ff.h5
Epoch 00032: val_loss did not improve
Epoch 00036: val_loss did not improve
Epoch 00040: val_loss did not improve
Epoch 00044: val_loss did not improve
Epoch 00048: val_loss improved from 0.31864 to 0.30467, saving model to models_weights/net_ff.h5
Epoch 00052: val_loss did not improve
Epoch 00056: val_loss did not improve


<keras.callbacks.History at 0x7fc5004ab9e8>

##### Рекуррентная сеть

In [24]:
# model = build_lstm_seq(timesteps=20, data_dim=41, n_classes=n_classes)
# model.compile(loss='categorical_crossentropy', 
#               optimizer=Adam(1e-4), 
#               metrics=['accuracy'])
# callbacks = [
#     EarlyStopping(patience=25, monitor='val_loss'),
#     ModelCheckpoint('models_weights/net_lstm.h5', monitor='val_loss', 
#                     verbose=1, save_best_only=True, period=4)
# ]

# model.fit(X_train, y_ohe_train.todense(), 
#           epochs=300, batch_size=250, 
#           validation_data=[X_val, y_ohe_val.todense()],
#          callbacks=callbacks)

###

In [25]:
from keras.models import load_model 
# Reload the network from the file written by ModelCheckpoint during training
# (saved with save_best_only=True), replacing the in-memory `model`.
model = load_model('models_weights/net_ff.h5')

In [28]:
from utils import inverse_ohe
# Accuracy of the reloaded network on the training part.
# NOTE(review): inverse_ohe is a project helper; presumably it maps the
# predicted probability rows back to integer class ids — confirm in utils.
preds = model.predict(X_train)
preds_labels = inverse_ohe(preds, ohe)
tr_score = accuracy_score(y_train, preds_labels)
# Same for the validation part (stored in `te_score`, but it is the validation score).
preds = model.predict(X_val)
preds_labels = inverse_ohe(preds, ohe)
te_score = accuracy_score(y_val, preds_labels)

print('Train score: {}, val score: {}'.format(tr_score, te_score))

Train score: 0.9831821199380394, val score: 0.9519612163948876


Выводы по моделям:

> Наилучшими моделями (без яростного тюнинга параметров) оказались полносвязная сеть, бустинг и случайный лес. В целом можно сказать, что они дают сопоставимые результаты, но если посмотреть на:
    - распределение плотности вероятности для максимальных классов по всей выборке
    - разницу точности предсказаний на трейне и валидации 
...можно легко заметить, что модели переобучились. Чтобы бороться с переобучением сделаем простой блендинг.

## Блендинг

In [35]:
from sklearn.metrics import accuracy_score


def blend(models, X, weights=None, proba=False):
    """Blend the class probabilities of several fitted models by weighted sum.

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``predict_proba(X)``; their outputs are
        stacked, so they must all have the same shape.
    X : array-like
        Samples, passed unchanged to every model.
    weights : sequence of float, optional
        One weight per model. ``None`` (or an empty sequence) means equal
        weights ``1 / len(models)``.
    proba : bool, default False
        If True, return the blended probability matrix; otherwise return,
        for every row, the column index of the largest blended probability.

    Raises
    ------
    ValueError
        If ``models`` is empty or ``weights`` does not have one entry per model.
    """
    models = list(models)
    if not models:
        raise ValueError('blend() needs at least one model')
    # `weights or default` raises for numpy arrays (ambiguous truth value),
    # so the "not provided" case is tested explicitly.
    if weights is None or len(weights) == 0:
        weights = [1 / len(models)] * len(models)
    elif len(weights) != len(models):
        # zip() would otherwise silently drop the surplus models or weights.
        raise ValueError('got {} weights for {} models'.format(len(weights), len(models)))
    blended = np.stack([
        model.predict_proba(X) * weight
        for model, weight in zip(models, weights)
    ]).sum(axis=0)
    if proba:
        return blended
    return blended.argmax(axis=1)

# Models taking part in the blend and their fixed weights, aligned by position:
# random forest 0.30, dense network 0.40, logistic regression 0.25, LightGBM 0.05.
models = [trees, model, logreg, lgbm]
weights = [0.30, 0.40, 0.25, 0.05]
# Accuracy of the blended class predictions on the train and validation parts.
tr_score = accuracy_score(y_train, blend(models, X_train, weights))
val_score = accuracy_score(y_val, blend(models, X_val, weights))
print('Acc:: Train score: {}, val score: {}'.format(tr_score, val_score))

Acc:: Train score: 0.9962381057756141, val score: 0.9638607315998237


### Предикт теста и подготовка сабмита

In [36]:
# preds = model.predict(X_test)
# preds = trees.predict(X_test)

# Blended class probabilities for the test set; proba=True returns the full
# probability matrix instead of class indices.
preds = blend(models, X_test, weights=weights, proba=True)

In [37]:
def prepare_submit(names, preds, ohe=ohe, le=le):
    """Assemble the submission table from per-class probabilities.

    Parameters
    ----------
    names : iterable
        File names, one per row of ``preds``.
    preds : array
        Per-class scores with one row per file; the row maximum is reported
        as the ``prob`` column.
    ohe, le : fitted encoders
        Default to the notebook-level encoders (bound when this function is
        defined). ``ohe`` is handed to the project helper ``inverse_ohe``
        (presumably to recover integer class ids — confirm in utils);
        ``le.inverse_transform`` turns those ids into label names.

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``prob``, ``label``.
    """
    class_ids = inverse_ohe(preds, ohe)
    class_names = le.inverse_transform(class_ids)
    confidences = preds.max(axis=1)
    rows = list(zip(names, confidences, class_names))
    return pd.DataFrame(rows, columns=['file', 'prob', 'label'])
                      
submit_df = prepare_submit(names_test, preds)

  if diff:


In [38]:
# File-name prefix (text before the first underscore) of every test file.
submit_df['file_label'] = submit_df['file'].str.split('_').str[0]
# Mean reported probability per file-name prefix.
submit_df.groupby('file_label')['prob'].mean()

file_label
background    0.641667
bags          0.854057
door          0.683569
keyboard      0.816699
knocking      0.809235
ring          0.899164
speech        0.778281
tool          0.781356
unknown       0.598940
Name: prob, dtype: float64

In [40]:
# Write only the three prediction columns. `file_label` is a diagnostic column
# added to submit_df in an earlier cell and would otherwise end up in the
# result file as a fourth field.
submit_df.to_csv('result.txt', sep='\t', header=False, index=False,
                 columns=['file', 'prob', 'label'])