# Turkey Sound Binary Classifier
Competition and dataset page: https://www.kaggle.com/c/dont-call-me-turkey/

In [83]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Bidirectional, LSTM, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [84]:
# Load the training JSON into a DataFrame and preview its first rows.
df = pd.read_json("train.json")
df.head()

Unnamed: 0,audio_embedding,end_time_seconds_youtube_clip,is_turkey,start_time_seconds_youtube_clip,vid_id
0,"[[172, 34, 216, 110, 208, 46, 95, 66, 161, 125...",70,0,60,kDCk3hLIVXo
1,"[[169, 20, 165, 102, 205, 62, 110, 103, 211, 1...",40,1,30,DPcGzqHoo7Y
2,"[[148, 8, 138, 60, 237, 48, 121, 108, 145, 177...",240,1,230,7yM63MTHh5k
3,"[[151, 0, 162, 88, 171, 71, 47, 90, 179, 190, ...",520,1,510,luG3RmUAxxM
4,"[[162, 17, 187, 111, 211, 105, 92, 67, 203, 15...",10,0,0,PIm3cjxTpOk


In [91]:
def load_dataset(path):
    """Read a JSON file of clips and split it into model inputs and labels.

    Args:
        path: Location handed straight to ``pd.read_json``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is whatever ``embedder`` builds from the
        loaded frame, ``y`` is the ``is_turkey`` column as a NumPy array.
    """
    frame = pd.read_json(path)
    labels = frame['is_turkey'].values
    features = embedder(frame)
    return features, labels

def embedder(df, n_frames=10):
    """Stack the ``audio_embedding`` column into one fixed-length array.

    Every entry of ``df['audio_embedding']`` is converted with ``np.asarray``
    and forced to exactly ``n_frames`` rows along its first axis:

    * shorter entries are zero-padded at the end;
    * longer entries are truncated to their first ``n_frames`` rows
      (previously this case crashed, because ``np.pad`` rejects a negative
      pad width).

    Args:
        df: DataFrame with an ``audio_embedding`` column. Each entry is
            assumed to be 2-D (frames x features) -- the two-axis pad spec
            below requires it.
        n_frames: Number of rows every embedding should end up with.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        ``np.ndarray`` built from the list of per-clip arrays; when all clips
        share the same feature width this has shape
        ``(len(df), n_frames, n_features)``.
    """
    embeddings = []
    for embedding in df['audio_embedding']:
        embedding = np.asarray(embedding)
        n_missing = n_frames - embedding.shape[0]
        if n_missing > 0:
            # Too short: append all-zero rows, leave the feature axis alone.
            embedding = np.pad(embedding, ((0, n_missing), (0, 0)), mode='constant')
        elif n_missing < 0:
            # Too long: keep only the leading n_frames rows.
            embedding = embedding[:n_frames]
        embeddings.append(embedding)
    return np.asarray(embeddings)

# Labelled clips: embedded features X and is_turkey targets y.
X,y = load_dataset('train.json')
# 'test.json' is embedded without reading any label column; the raw frame is
# kept in `val` because predictions are attached to it further down.
val = pd.read_json('test.json')
X_val = embedder(val)

# Hold out 20% of the labelled data with a fixed seed. Despite the names,
# X_test/y_test are what the training cell passes as validation_data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [92]:
# Create Keras model: batch-normalise and apply dropout to the (10, 128)
# input, run a bidirectional LSTM over it, then emit one sigmoid unit for
# the binary target.
model = Sequential([
    BatchNormalization(input_shape=(10, 128)),
    Dropout(.5),
    Bidirectional(LSTM(128, activation='relu')),
    Dense(1, activation='sigmoid'),
])

In [93]:
# Early stopping watches the validation loss (lower is better) with a
# minimum improvement of 1e-4 and a patience of 5 epochs.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=5,
    verbose=1,
)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# The held-out split from train_test_split is used as validation data.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    callbacks=[early_stop],
    epochs=100,
    batch_size=300,
)

Train on 956 samples, validate on 239 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping


<keras.callbacks.History at 0x12b2dfc18>

In [95]:
# `Sequential.predict_classes` is deprecated and absent from newer Keras
# releases. For a single sigmoid output the equivalent is thresholding the
# predicted probability at 0.5. `ravel()` flattens the (n, 1) result so the
# column assignment below receives a plain 1-D array.
preds = (model.predict(X_val) > 0.5).astype('int32').ravel()
val['preds'] = preds
val.head()

Unnamed: 0,audio_embedding,end_time_seconds_youtube_clip,start_time_seconds_youtube_clip,vid_id,preds
0,"[[177, 20, 226, 132, 198, 81, 111, 59, 132, 18...",10,0,pyKh38FXD3E,0
1,"[[169, 21, 204, 161, 195, 72, 60, 39, 152, 184...",40,30,THhP1idrWXA,0
2,"[[165, 13, 198, 141, 199, 81, 173, 54, 119, 11...",40,30,jsw3T6GY2Nw,0
3,"[[167, 18, 188, 159, 198, 63, 156, 36, 179, 22...",24,14,nFkXTMHcjMU,0
4,"[[178, 32, 181, 100, 198, 46, 82, 83, 136, 227...",40,30,Au8g9kAlrLQ,1


In [97]:
# Keep only the id and prediction columns, indexed by vid_id. Selecting with
# a list already yields a new frame, so `val` itself is left untouched.
sub = val[['vid_id', 'preds']].set_index('vid_id')

In [99]:
# Write the predictions to disk; the frame's index is written as the first CSV column.
# NOTE(review): the competition may expect the prediction column to be named
# `is_turkey` (possibly holding probabilities rather than 0/1 classes) --
# confirm against the sample submission before uploading.
sub.to_csv('TurkeySub.csv')

In [8]:
# URL template for a clip: fill the placeholder with a `vid_id` to get a
# watchable link. The last expression renders the link for the first
# training row so the cell's displayed output is reproducible on a fresh run.
youtube_string = 'https://www.youtube.com/watch?v={}'
youtube_string.format(df['vid_id'].iloc[0])

'https://www.youtube.com/watch?v=kDCk3hLIVXo'