In [1]:
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from tensorflow.keras.layers import MaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

  from ._conv import register_converters as _register_converters


In [2]:
# Load both CSVs in one pass.
#   train columns: ID, UTTERANCE, CORE RELATIONS
#   test columns:  ID, UTTERANCE
trainFile, testFile = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/hw1_train.csv', 'dataset/hw1_test.csv')
)

In [3]:
# Data preparation
#
# Turns the space-separated 'CORE RELATIONS' strings into a multi-hot label
# matrix `y` (one row per utterance, one column per distinct relation).
# The column order of `y` is the first-seen order stored in `unique_relations`;
# the prediction-decoding cell indexes into that list, so the order matters.


def _unique_in_order(items):
    """Return the distinct values of `items`, keeping first-seen order."""
    seen = set()
    ordered = []
    for item in items:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered


# input features
X = list(trainFile['UTTERANCE'])

# split() with no argument ignores repeated / leading / trailing whitespace;
# split(' ') would produce '' entries and therefore a bogus '' label column.
relations_per_row = [relation_str.split() for relation_str in trainFile['CORE RELATIONS']]

unique_relations = _unique_in_order(
    relation for row in relations_per_row for relation in row)

# Distinct dot-separated path components of the relations.
# Not used by any later cell in this notebook; kept for exploration.
unique_nodes = _unique_in_order(
    node for path in unique_relations for node in path.split('.'))

# Build the multi-hot matrix positionally (independent of trainFile's index).
relation_index = {relation: col for col, relation in enumerate(unique_relations)}
label_matrix = np.zeros((len(relations_per_row), len(unique_relations)), dtype=np.int64)
for row_pos, row in enumerate(relations_per_row):
    for relation in row:
        label_matrix[row_pos, relation_index[relation]] = 1

# add (or overwrite, on re-run) one 0/1 label column per relation
for col, relation in enumerate(unique_relations):
    trainFile[relation] = label_matrix[:, col]

#separate label columns
labels = trainFile[unique_relations]

# target values
y = labels.values

In [4]:
# 5000 words because the training set has almost 2500 unique words
tokenizer = Tokenizer(num_words=5000)

# Read the utterances from trainFile instead of from X: X is replaced by the
# count matrix below, so fitting on X would break if this cell is run twice.
train_texts = list(trainFile['UTTERANCE'])
tokenizer.fit_on_texts(train_texts)

# One row per utterance; the column count follows num_words, which is why
# create_model declares input_shape=(5000,).
X = tokenizer.texts_to_matrix(train_texts)

In [5]:
def recall_m(y_true, y_pred):
    """Recall over every element of the given tensors.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so the result is
    (rounded true positives) / (rounded actual positives + epsilon).
    The sums have no ``axis`` argument, i.e. they run over all elements.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    # epsilon keeps the division finite when there are no positives at all
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over every element of the given tensors.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so the result is
    (rounded true positives) / (rounded predicted positives + epsilon).
    The sums have no ``axis`` argument, i.e. they run over all elements.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    # epsilon keeps the division finite when nothing is predicted positive
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_score(y_true, y_pred):
    """F1 metric: harmonic mean of ``precision_m`` and ``recall_m``.

    Passed to ``model.compile(metrics=[...])`` in ``create_model``.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # epsilon guards the 0/0 case where precision and recall are both zero
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

def f1_loss(y_true, y_pred):
    """Loss equal to ``1 - mean(F1)``, computed from un-rounded predictions.

    True/false positive and false negative totals are summed along axis 0
    (the sample axis for the ``y`` matrix built in this notebook), giving one
    F1 value per label column; the loss is one minus their mean. Because the
    raw ``y_pred`` values are used instead of thresholded ones, the counts are
    fractional.

    Args:
        y_true: tensor of 0/1 targets.
        y_pred: tensor of predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor ``1 - K.mean(f1)``.
    """
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1 = 2*p*r / (p+r+K.epsilon())
    # Replace any NaN with 0 so one bad column cannot poison the mean.
    # tf.math.is_nan is used because the top-level tf.is_nan alias is gone in TF 2.x.
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return 1 - K.mean(f1)

In [6]:
def create_model(hyperParams, input_dim=5000, num_classes=46):
    """Build and compile a fully connected multi-label classifier.

    Args:
        hyperParams: dict with the keys
            'hidden_layers'     - list of layer widths, one Dense layer each
                                  (must contain at least one entry),
            'activation'        - activation for the hidden layers,
            'dropout'           - dropout rate applied after every hidden layer,
            'output_activation' - activation of the output layer,
            'loss'              - loss passed to ``compile``; e.g.
                                  'categorical_crossentropy',
                                  'binary_crossentropy' or the ``f1_loss``
                                  function defined in this notebook.
        input_dim: number of input features. Defaults to 5000, matching the
            tokenizer's ``num_words``.
        num_classes: number of output units. Defaults to 46, the value that
            was hard-coded before; it should equal ``len(unique_relations)``.

    Returns:
        A compiled ``Sequential`` model using the 'adam' optimizer and
        reporting 'accuracy' and ``f1_score``.
    """
    hidden_layers = hyperParams['hidden_layers']
    activation = hyperParams['activation']
    dropout = hyperParams['dropout']
    output_activation = hyperParams['output_activation']
    loss = hyperParams['loss']

    model = Sequential()
    model.add(Dense(hidden_layers[0], input_shape=(input_dim,), activation=activation))
    model.add(Dropout(dropout))
    # Remaining layers take their own width. The previous index-based loop
    # re-used hidden_layers[i], so the last configured width was never built.
    for units in hidden_layers[1:]:
        model.add(Dense(units, activation=activation))
        model.add(Dropout(dropout))
    model.add(Dense(num_classes, activation=output_activation))
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy', f1_score])

    return model

In [7]:
def cv_model_fit(X, y, hyperParams, n_splits=10, random_state=None):
    """Score one hyper-parameter set with shuffled k-fold cross-validation.

    A fresh model is created with ``create_model(hyperParams)`` for every
    fold, trained on the training indices and evaluated on the held-out ones.

    Args:
        X: feature matrix, indexable by integer index arrays.
        y: label matrix aligned row-for-row with ``X``.
        hyperParams: dict forwarded to ``create_model``; 'batch_size' and
            'epochs' are also read here for ``fit``.
        n_splits: number of folds (default 10, the previous fixed value).
        random_state: seed for the fold shuffling. ``None`` (default) keeps
            the previous unseeded behaviour; pass an int so that different
            hyper-parameter sets are compared on identical folds.

    Returns:
        List with one F1 score per fold, in percent.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    scores = []
    for train_idx, test_idx in kfold.split(X):
        model = create_model(hyperParams)
        model.fit(X[train_idx], y[train_idx], batch_size=hyperParams['batch_size'],
                  epochs=hyperParams['epochs'], verbose=0)
        score = model.evaluate(X[test_idx], y[test_idx], verbose=0)
        # evaluate() returns [loss, accuracy, f1_score] in compile order
        scores.append(score[2]*100) # f_score
        print('fold ', len(scores), '  score: ', scores[-1])
        del model
        # Release Keras' global state; otherwise memory grows with every model
        # built in this loop. NOTE: in TF 1.x graph mode this also invalidates
        # any other Keras model that is still alive in the kernel.
        K.clear_session()

    return scores

In [8]:
# hyper parameter optimization
#
# Exhaustive grid search: every combination of the choices below is scored
# with 10-fold cross-validation (4 * 3 * 8 = 96 configurations, so this cell
# is very slow to run in full).
from itertools import product

hyperParams = {'batch_size': 32, 'epochs': 25, 'hidden_layers': [512, 512], 'activation': 'relu', 'dropout': 0.3,
              'output_activation': 'sigmoid', 'loss': 'binary_crossentropy'}

epochs_choices = [15, 25, 40, 100]
activation_choices = ['relu', 'sigmoid', 'tanh']
hidden_layers_choices = [[512, 512], [128, 128, 128], [256, 256, 256], [512, 512, 512],
                         [256, 512, 256], [128, 256, 128], [128, 512, 128], [512, 512, 128]]

s = [epochs_choices, activation_choices, hidden_layers_choices]
perms = list(product(*s))  # Cartesian product: all (epochs, activation, layers) combinations

best_score = 0
best_params = None  # stays None if no configuration scores above 0
for epochs, activation, hidden_layers in perms:
    hyperParams['epochs'] = epochs
    hyperParams['activation'] = activation
    hyperParams['hidden_layers'] = hidden_layers
    print('10-fold cross validation on these hyperparameters: ', hyperParams, '\n')
    cvscores = cv_model_fit(X, y, hyperParams)
    print('\n-------------------------------------------')
    mean_score = np.mean(cvscores)
    std_score = np.std(cvscores)
    print('CV mean: {0:0.4f},  CV std: {1:0.4f}'.format(mean_score, std_score))
    if  mean_score > best_score:    # later I should incorporate std in best model selection
        best_score = mean_score
        print('****** Best model so far ******')
        # Store a copy: hyperParams is mutated on every iteration, so keeping a
        # reference would leave best_params equal to the last configuration tried.
        best_params = dict(hyperParams)
    print('-------------------------------------------\n')

10-fold cross validation on these hyperparameters:  {'batch_size': 32, 'epochs': 100, 'hidden_layers': [512, 512], 'activation': 'relu', 'dropout': 0.3, 'output_activation': 'sigmoid', 'loss': 'binary_crossentropy'} 

fold  1   score:  83.59819650650024
fold  2   score:  83.23379158973694
fold  3   score:  83.97870659828186
fold  4   score:  79.5464277267456
fold  5   score:  89.36113715171814
fold  6   score:  84.44141745567322
fold  7   score:  84.6139907836914
fold  8   score:  86.14341616630554
fold  9   score:  85.03707647323608
fold  10   score:  82.50978589057922

-------------------------------------------
CV mean: 84.2464,  CV std: 2.3912
****** Best model so far ******
-------------------------------------------

10-fold cross validation on these hyperparameters:  {'batch_size': 32, 'epochs': 100, 'hidden_layers': [128, 128, 128], 'activation': 'relu', 'dropout': 0.3, 'output_activation': 'sigmoid', 'loss': 'binary_crossentropy'} 

fold  1   score:  84.35253500938416
fold  2 

In [8]:
# Final configuration, written out by hand; this assignment replaces whatever
# value best_params held before this cell ran.
best_params = {
    'batch_size': 32,
    'epochs': 40,
    'hidden_layers': [512, 512, 128],
    'activation': 'relu',
    'dropout': 0.5,
    'output_activation': 'sigmoid',
    'loss': 'binary_crossentropy',
}

# build last model from full data
model = create_model(best_params)
model.fit(
    X,
    y,
    batch_size=best_params['batch_size'],
    epochs=best_params['epochs'],
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<tensorflow.python.keras.callbacks.History at 0x10b99a278>

In [9]:
# Vectorise the test utterances with the tokenizer fitted on the training
# text, so the columns line up with the matrix the model was trained on.
X_t = testFile['UTTERANCE'].tolist()
X_pred = tokenizer.texts_to_matrix(X_t)

In [10]:
# One row of per-relation scores for each test utterance.
y_pred = model.predict(x=X_pred)

In [11]:
# Labels that are kept only when they are the sole prediction for a row.
# They are checked in this order, one at a time.
_SOLO_ONLY_LABELS = ('other', 'NO_REL', 'movie_other')


def _decode_relations(scores, label_names, threshold=0.5):
    """Turn one row of per-label scores into a space-separated label string.

    Args:
        scores: 1-D sequence of scores, aligned with ``label_names``.
        label_names: label for each score position.
        threshold: a label is selected when its score is >= this value.

    Returns:
        The selected labels joined by single spaces. If no score reaches the
        threshold, the label with the highest score is used. Each label in
        ``_SOLO_ONLY_LABELS`` is dropped while more than one label remains.
    """
    chosen = [label_names[pos] for pos, score in enumerate(scores) if score >= threshold]
    if not chosen:   # select max if no prob >= threshold
        chosen.append(label_names[int(np.argmax(scores))])
    for solo_label in _SOLO_ONLY_LABELS:
        if len(chosen) > 1 and solo_label in chosen:
            chosen.remove(solo_label)
    return ' '.join(chosen)


# The comprehension variable stays local, so the global training labels `y`
# are no longer overwritten (the previous `for y in y_pred` loop clobbered them).
predictions = [_decode_relations(row_scores, unique_relations) for row_scores in y_pred]

In [12]:
# Assemble the submission table. `columns=` pins the column order, which
# otherwise depends on the pandas / Python version (older versions sort the
# dict keys alphabetically and would put 'CORE RELATIONS' first).
# NOTE(review): IDs are generated as 0..n-1 rather than taken from
# testFile['ID'] - confirm that this matches the expected submission IDs.
submissionFile = pd.DataFrame(
    {
        'ID': list(range(len(predictions))),
        'CORE RELATIONS': predictions,
    },
    columns=['ID', 'CORE RELATIONS'],
)

# Write into <cwd>/predictions/, creating the directory if it is missing
# (to_csv does not create parent directories itself).
path_to_save = os.path.abspath(os.getcwd()) + '/predictions/'
os.makedirs(path_to_save, exist_ok=True)
submissionFile.to_csv(os.path.join(path_to_save, 'prediction17.csv'), index=False)
submissionFile.head()

Unnamed: 0,CORE RELATIONS,ID
0,movie.starring.actor,0
1,movie.starring.actor,1
2,movie.starring.actor,2
3,movie.starring.actor,3
4,movie.starring.actor,4
