In [1]:
import os
import sys
from itertools import product

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [2]:
# Load both CSV splits in one pass (training file first, then test file).
# Columns per the original notes:
#   train -> ID, UTTERANCE, CORE RELATIONS
#   test  -> ID, UTTERANCE
trainFile, testFile = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/hw1_train.csv', 'dataset/hw1_test.csv')
)

In [3]:
# Data preparation: turn the space-separated 'CORE RELATIONS' strings into a
# multi-hot label matrix with one column per distinct relation.

def _first_seen_unique(items):
    """Return the distinct items of an iterable, keeping first-seen order."""
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


# input features
X = list(trainFile['UTTERANCE'])

# One list of relation names per training row. split() without an argument
# ignores repeated/leading/trailing whitespace, so a stray space can never
# produce an empty-string "relation".
relations_per_row = [relation_str.split() for relation_str in trainFile['CORE RELATIONS']]

# Distinct relations in first-seen order. This order defines the label column
# order, and the prediction-decoding cell indexes into the same list.
unique_relations = _first_seen_unique(
    relation for relations in relations_per_row for relation in relations)

# Distinct dot-separated path components of the relation names.
unique_nodes = _first_seen_unique(
    node for path in unique_relations for node in path.split('.'))

# Build the multi-hot matrix positionally, so the result does not depend on
# trainFile having a default 0..n-1 index.
relation_column = {relation: col for col, relation in enumerate(unique_relations)}
label_matrix = np.zeros((len(relations_per_row), len(unique_relations)), dtype=np.int64)
for row, relations in enumerate(relations_per_row):
    for relation in relations:
        label_matrix[row, relation_column[relation]] = 1

# add label columns to dataframe (one 0/1 column per relation)
for col, relation in enumerate(unique_relations):
    trainFile[relation] = label_matrix[:, col]

# separate label columns
labels = trainFile[unique_relations]

# target values
y = labels.values

In [4]:
# Vocabulary cap for the bag-of-words features. Per the original note the
# training set has almost 2500 unique words, so 5000 leaves headroom.
# create_model() expects input rows of this width.
VOCAB_SIZE = 5000

# Read the raw utterances from the dataframe instead of consuming `X`:
# this cell rebinds `X` to the feature matrix, so fitting on `X` itself would
# fail if the cell were executed a second time.
train_texts = list(trainFile['UTTERANCE'])

tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_texts)

# One row per utterance, VOCAB_SIZE columns, using the tokenizer's default
# matrix mode.
X = tokenizer.texts_to_matrix(train_texts)

In [5]:
def recall_m(y_true, y_pred):
    """Recall over the given tensors, computed on rounded (0/1) values.

    Counts rounded true positives and divides by the rounded count of actual
    positives; K.epsilon() in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors, computed on rounded (0/1) values.

    Counts rounded true positives and divides by the rounded count of
    predicted positives; K.epsilon() in the denominator avoids division by
    zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    predicted_count = K.sum(predicted_mask)
    return hit_count / (predicted_count + K.epsilon())

def f1_score(y_true, y_pred):
    """F1 metric: harmonic mean of precision_m and recall_m.

    K.epsilon() is added to the denominator so the result is defined when
    both precision and recall are zero.
    """
    prec = precision_m(y_true, y_pred)
    rec = recall_m(y_true, y_pred)
    harmonic_ratio = (prec * rec) / (prec + rec + K.epsilon())
    return 2 * harmonic_ratio

def f1_loss(y_true, y_pred):
    """Soft F1 loss: 1 minus the mean per-column F1 of the un-rounded scores.

    Unlike the f1_score metric, no rounding is applied, so the result stays
    differentiable with respect to ``y_pred``.

    Args:
        y_true: Target tensor of 0/1 values (any numeric dtype).
        y_pred: Predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor ``1 - mean(F1)``, where F1 is computed separately for
        each column by summing over axis 0.
    """
    # Integer targets would make the products below fail or truncate, so
    # bring y_true to the prediction dtype first.
    y_true = K.cast(y_true, y_pred.dtype)

    # Soft confusion counts per column (summed over axis 0).
    tp = K.sum(y_true * y_pred, axis=0)
    fp = K.sum((1 - y_true) * y_pred, axis=0)
    fn = K.sum(y_true * (1 - y_pred), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2 * p * r / (p + r + K.epsilon())
    # Defensive: replace any NaN (e.g. propagated from NaN predictions) by 0.
    # tf.math.is_nan is used because the bare tf.is_nan alias is not
    # available in TensorFlow 2.
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [14]:
def create_model(hyperParams):
    """Build and compile a fully connected multi-label classifier.

    Args:
        hyperParams: dict with the keys
            'hidden_layers'     -- list of layer widths, one Dense layer each
                                   (must contain at least one entry),
            'activation'        -- activation for every hidden layer,
            'dropout'           -- dropout rate applied after every hidden layer,
            'output_activation' -- activation of the output layer,
            'loss'              -- loss passed to ``model.compile``.
            Optional keys: 'input_dim' (default 5000) and 'n_outputs'
            (default 46) override the input width and number of output units.

    Returns:
        A compiled ``Sequential`` model using the 'adam' optimizer and
        reporting 'accuracy' and ``f1_score`` (in that order) as metrics.
    """
    hidden_layers = hyperParams['hidden_layers']
    activation = hyperParams['activation']
    dropout = hyperParams['dropout']
    output_activation = hyperParams['output_activation']
    loss = hyperParams['loss']
    # Defaults reproduce the previously hard-coded sizes.
    input_dim = hyperParams.get('input_dim', 5000)
    n_outputs = hyperParams.get('n_outputs', 46)

    model = Sequential()
    model.add(Dense(hidden_layers[0], input_shape=(input_dim,), activation=activation))
    model.add(Dropout(dropout))
    # Remaining hidden layers. Iterating over hidden_layers[1:] gives every
    # configured width exactly once; indexing with the loop counter from 0
    # would repeat the first width and never use the last one.
    for units in hidden_layers[1:]:
        model.add(Dense(units, activation=activation))
        model.add(Dropout(dropout))
    model.add(Dense(n_outputs, activation=output_activation))
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy', f1_score])
    # categorical_crossentropy, binary_crossentropy f1_loss->(for tensorflow 1.14)

    return model

In [15]:
def cv_model_fit(X, y, hyperParams, n_splits=10, random_state=None):
    """Run shuffled k-fold cross validation for one hyper-parameter setting.

    A fresh model is built with ``create_model(hyperParams)`` for every fold,
    trained on the fold's training rows and evaluated on its held-out rows.

    Args:
        X: Feature array, indexable by an array of row indices.
        y: Target array aligned with ``X``.
        hyperParams: dict forwarded to ``create_model``; must also contain
            'batch_size' and 'epochs' for ``model.fit``.
        n_splits: Number of folds (default 10).
        random_state: Optional seed for the fold shuffling; ``None`` keeps
            the splits non-deterministic.

    Returns:
        List with one entry per fold: the third value returned by
        ``model.evaluate`` (the f1_score metric, given the metric order used
        in ``create_model``) multiplied by 100.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    for train_idx, test_idx in kfold.split(X):
        model = create_model(hyperParams)
        model.fit(X[train_idx], y[train_idx], batch_size=hyperParams['batch_size'],
                  epochs=hyperParams['epochs'], verbose=0)
        score = model.evaluate(X[test_idx], y[test_idx], verbose=0)
        scores.append(score[2]*100) # f_score
        print('fold ', len(scores), '  score: ', scores[-1])
        # Dropping the Python reference alone does not release the state
        # Keras keeps globally; clear it so repeated model builds during a
        # long search do not accumulate memory.
        del model
        K.clear_session()

    return scores

In [18]:
# hyper parameter optimization: exhaustive search over epochs, hidden-layer
# activation and hidden-layer layout, scored by mean cross-validated F1.

# Baseline configuration; the searched keys are overwritten on each iteration.
hyperParams = {'batch_size': 32, 'epochs': 25, 'hidden_layers': [512, 512], 'activation': 'relu', 'dropout': 0.3,
              'output_activation': 'sigmoid', 'loss': 'binary_crossentropy'}

epochs_choices = [15, 25, 40, 100]
activation_choices = ['relu', 'sigmoid', 'tanh']
hidden_layers_choices = [[512, 512], [128, 128, 128], [256, 256, 256], [512, 512, 512],
                         [256, 512, 256], [128, 256, 128], [128, 512, 128], [512, 512, 128]]

s = [epochs_choices, activation_choices, hidden_layers_choices]
perms = list(product(*s))  # every combination (Cartesian product) of the choices

best_acc = 0
# Start from a snapshot of the baseline so best_params is always defined,
# even if the search is interrupted before the first configuration finishes.
best_params = dict(hyperParams)
for epochs_choice, activation_choice, hidden_layers_choice in perms:
    hyperParams['epochs'] = epochs_choice
    hyperParams['activation'] = activation_choice
    hyperParams['hidden_layers'] = hidden_layers_choice
    print('10-fold cross validation on these hyperparameters: ', hyperParams, '\n')
    cvscores = cv_model_fit(X, y, hyperParams)
    cv_mean = np.mean(cvscores)
    print('\n-------------------------------------------')
    print('CV mean: {0:0.4f},  CV std: {1:0.4f}'.format(cv_mean, np.std(cvscores)))
    # Rank configurations by the mean over all folds, not by a single fold.
    # TODO: incorporate the std in best model selection.
    if cv_mean > best_acc:
        best_acc = cv_mean
        print('****** Best model so far ******')
        # Store a copy: hyperParams is mutated on the next iteration, so
        # keeping a reference would silently track the last configuration
        # tried instead of the best one.
        best_params = dict(hyperParams, hidden_layers=list(hidden_layers_choice))
    print('-------------------------------------------\n')

10-fold cross validation on these hyperparameters:  {'batch_size': 32, 'epochs': 15, 'hidden_layers': [512, 512], 'activation': 'relu', 'dropout': 0.3, 'output_activation': 'sigmoid', 'loss': 'binary_crossentropy'} 

fold  1   score:  82.40101337432861
fold  2   score:  83.97678732872009
fold  3   score:  81.99262022972107
fold  4   score:  80.53823709487915
fold  5   score:  84.28609371185303
fold  6   score:  80.63655495643616
fold  7   score:  85.75929403305054
fold  8   score:  84.63653326034546
fold  9   score:  83.90231728553772
fold  10   score:  82.26687908172607

-------------------------------------------
CV mean: 83.0396,  CV std: 1.6509
****** Best model so far ******
-------------------------------------------

10-fold cross validation on these hyperparameters:  {'batch_size': 32, 'epochs': 15, 'hidden_layers': [128, 128, 128], 'activation': 'relu', 'dropout': 0.3, 'output_activation': 'sigmoid', 'loss': 'binary_crossentropy'} 

fold  1   score:  82.28258490562439
fold  2 

KeyboardInterrupt: 

In [19]:
# Train the final model on the complete training set using the
# configuration stored in best_params by the search cell.
final_fit_options = {
    'batch_size': best_params['batch_size'],
    'epochs': best_params['epochs'],
    'verbose': 1,
}
model = create_model(best_params)
model.fit(X, y, **final_fit_options)

Train on 3338 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f9274ae8748>

In [13]:
# Encode the test utterances with the tokenizer that was fitted on the
# training utterances, so the feature columns line up with the training matrix.
X_t = testFile['UTTERANCE'].tolist()
X_pred = tokenizer.texts_to_matrix(X_t)

In [14]:
# Score every encoded test utterance with whichever `model` is currently bound
# in the kernel. Each output column is later mapped to unique_relations[i]
# and thresholded in the decoding cell.
y_pred = model.predict(X_pred)

In [15]:
# Decode each row of scores into a space-separated relation string.
# The loop variable is deliberately not called `y`: that name holds the
# training targets, and rebinding it here would break any later re-run of the
# training cells.
predictions = []
for relation_scores in y_pred:
    # Keep every relation whose score exceeds 0.5 ...
    chosen = [unique_relations[i]
              for i, score in enumerate(relation_scores) if score > 0.5]
    # ... and fall back to the single highest-scoring relation when none does,
    # so every utterance receives at least one label.
    if not chosen:
        chosen = [unique_relations[np.argmax(relation_scores)]]
    predictions.append(' '.join(chosen))
# predictions

In [16]:
# Assemble the submission table and write it to <cwd>/predictions/.
# NOTE(review): IDs are positional (0..n-1); confirm they match the 'ID'
# column of the test file.
submissionFile = pd.DataFrame({
    'ID': list(range(len(predictions))),
    'CORE RELATIONS': predictions})
path_to_save = os.path.abspath(os.getcwd()) + '/predictions/'
# Create the output directory on first use so to_csv does not fail on a
# fresh checkout.
os.makedirs(path_to_save, exist_ok=True)
submissionFile.to_csv(os.path.join(path_to_save, 'prediction12.csv'), index=False)
submissionFile.head()

Unnamed: 0,ID,CORE RELATIONS
0,0,movie.starring.actor
1,1,movie.starring.actor
2,2,movie.starring.actor
3,3,movie.starring.actor
4,4,movie.starring.actor
