In [3]:
%load_ext tensorboard

In [4]:
import numpy as np
import pandas as pd 
from ast import literal_eval
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Dense, LSTM, MaxPooling1D, Input, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras.metrics import AUC
import datetime, os

In [68]:
MAX_VOCAB_SIZE = 200000 # there are 563693 words in the vocabulary
MAX_LEN_SEQ = 300
TRAIN_TEXT_COL = 'comment_text_clean2'
TEST_TEXT_COL = 'comment_text_clean2'
TRAIN_TARGET_COL = 'target'
TEST_TARGET_COL = 'target'
EMBED_DIM = 300
EMBEDDING_FILE = 'embeds/glove.840B.300d.txt'

DROPOUT_RATE = 0.2
LSTM_UNITS = 128
BATCH_SIZE = 128
NUM_EPOCHS = 4
CHECKPOINT_PATH = "NN_models/cp.ckpt"
CHECKPOINT_DIR = os.path.dirname(CHECKPOINT_PATH)


In [19]:
#Breaking down how to access our S3 Bucket files. 
#Put in your own bucket name
bucket = 'gs-capstone' 

#the path to the file you want to load in your S3 Bucket
dataset_file_path_train = 'train_for_nn.csv'
dataset_file_path_test = 'test_for_nn.csv'

#Creating the path, and combining the above
path_train = 's3://{}/{}'.format(bucket, dataset_file_path_train)
path_test = 's3://{}/{}'.format(bucket, dataset_file_path_test) 

In [20]:
import boto3
s3 = boto3.client('s3')
s3.download_file(bucket, dataset_file_path_train, 'train_for_nn.csv')
s3.download_file(bucket, dataset_file_path_test, 'test_for_nn.csv')

train_data = pd.read_csv('train_for_nn.csv', converters={"comment_text_clean2": literal_eval})
test_data = pd.read_csv('test_for_nn.csv', converters={"comment_text_clean2": literal_eval})

In [106]:
test_data2 = pd.read_csv('test_for_nn.csv', converters={"comment_text_clean2": literal_eval})

In [107]:
test_data2.shape

(194640, 15)

In [108]:
test_data2['id'].shape

(194640,)

In [104]:
test_data.shape

(194640, 15)

In [105]:
test_data['id'].shape

(194642,)

In [21]:
# Create train val split, stratify on target
train_df, val_df = train_test_split(train_data, test_size=0.2, stratify=train_data['target'], random_state=1)

In [47]:
# Create and fix tokenizer
def train_tokenizer(train_data, vocab_size):
    # Use Keras tokenizer to create vocabulary dictionary 
    # default arguments will filter punctuation and convert to lower, we do not want this given our use 
    # of pre-trained word embeddings
    tokenizer = text.Tokenizer(num_words = vocab_size, filters='', lower=False)
    tokenizer.fit_on_texts(train_data)
    return tokenizer

# pad tokenized sequences
def text_padder(text, tokenizer):
    return sequence.pad_sequences(tokenizer.texts_to_sequences(text), maxlen=MAX_LEN_SEQ)

# Build embedding matrix
def build_embedding_matrix(word_indexes, EMBEDDING_FILE):
  
    # Used to store words as key and vectors as value
    embedding_dict = {}
    with open(EMBEDDING_FILE) as file:
        # file is formatted word {whitespace} vector
        for line in file:
            pairs = line.split(' ')
           # word is 0 index of pairs
            word = pairs[0]
            vec = pairs[1:]
           #convert vec into a numpy array
            vec = np.asarray(vec, dtype=np.float32)
            embedding_dict[word] = vec
    
    #create the embedding matrix which has dimensions:
    # MAX_VOCAB_SIZE +1 for rows, this means there will be as many rows as words we allow to be part of the feature set.
    # EMBED_DIM is the number of columns, this reflects the dimensions of the word embedding vectors we are using.
    embedding_matrix = np.zeros((len(word_indexes)+1, EMBED_DIM))


    word_count = 0
    for word, i in word_indexes.items():
        # gets the vector to the corresponding word from the previous dictionary and sets it to the variable
        embedding_vector = embedding_dict.get(word)
        # We check whether the embedding_vector is not none (i.e the word is in the embedding index)
        if embedding_vector is not None:
            word_count += 1
            # Append the embedding vector to index i in the embedding matrix 
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix
            
def build_embedding_matrix_restricted(word_indexes, EMBEDDING_FILE):
  
    # Used to store words as key and vectors as value
    embedding_dict = {}
    with open(EMBEDDING_FILE) as file:
        # file is formatted word {whitespace} vector
        for line in file:
            pairs = line.split(' ')
           # word is 0 index of pairs
            word = pairs[0]
            vec = pairs[1:]
           #convert vec into a numpy array
            vec = np.asarray(vec, dtype=np.float32)
            embedding_dict[word] = vec
    
    #create the embedding matrix which has dimensions:
    # MAX_VOCAB_SIZE +1 for rows, this means there will be as many rows as words we allow to be part of the feature set.
    # EMBED_DIM is the number of columns, this reflects the dimensions of the word embedding vectors we are using.
    embedding_matrix = np.zeros((MAX_VOCAB_SIZE+1, EMBED_DIM))

    
    word_count = 0
  
    for word, i in word_indexes.items():
        if word_count <= MAX_VOCAB_SIZE:
            # gets the vector to the corresponding word from the previous dictionary and sets it to the variable
            embedding_vector = embedding_dict.get(word)
            # We check whether the embedding_vector is not none (i.e the word is in the embedding index)
            if embedding_vector is not None:
                word_count += 1
                # Append the embedding vector to index i in the embedding matrix 
                embedding_matrix[i] = embedding_vector
        else:
            break
    return embedding_matrix

In [36]:
tokenizer = train_tokenizer(train_df[TRAIN_TEXT_COL], MAX_VOCAB_SIZE)

In [38]:
len(tokenizer.word_index)

494877

In [59]:
# build model

# NOTE: WITH TF2.0 CUDNNLSTM is active by default when there is a GPU available but you must use the default settings.
# SEE https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM for more details

def build_model(embedding_matrix):
    # change to max word length 
    input_words = Input(shape=(MAX_LEN_SEQ,), dtype='int32')
    embedding = Embedding(len(tokenizer.word_index)+1, EMBED_DIM,
                          weights=[embedding_matrix],
                          input_length = MAX_LEN_SEQ,
                          #mask_zero = True
                          trainable = False) (input_words)
    x = Dropout(DROPOUT_RATE)(embedding)
    x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x) #set return_sequence to false when passing to dense
    #x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x)
    
    # Use GlobalMaxPooling
    x = GlobalMaxPooling1D()(x)
    
    # Pass into DENSE layers 
    # Dense nodes total has been calculated as per 
    # https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm
    # (300,000)/5*(128+2) = 462
    x = Dense(462, activation='relu')(x)
    prediction = Dense(2, activation='sigmoid')(x)
    
    model = Model(inputs=input_words, outputs=prediction, name='baseline-LSTM')
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', AUC()])
    
    return model
                           
def train_model(train_df, val_df, tokenizer):
    # Create processed and padded train and targets
    print('padding_text')
    X_train = text_padder(train_df[TRAIN_TEXT_COL], tokenizer)
    X_val = text_padder(val_df[TRAIN_TEXT_COL], tokenizer)
    y_train = to_categorical(train_df[TRAIN_TARGET_COL])
    y_val = to_categorical(val_df[TRAIN_TARGET_COL])
    
    print('building embedding matrix')
    # build embedding matrix
    embed_matrix = build_embedding_matrix(tokenizer.word_index, EMBEDDING_FILE)
    
    # build model
    print('building model')
    model = build_model(embed_matrix)
    
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_PATH,
                                                 save_weights_only=True,
                                                 verbose=1)
    
    # Connect to tensorboard
    logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1, write_images=True, 
                                                          write_graph=False
                                                          )
    # train model
    print('training model')
    fitted_model = model.fit(X_train, y_train,
                             batch_size = BATCH_SIZE,
                             epochs = NUM_EPOCHS,
                             validation_data=(X_val, y_val),
                             callbacks=[cp_callback, tensorboard_callback],
                             verbose = 1)
    #save full model 
    model.save('saved_model/baseline-LSTM') 
    #saves to h5
    model.save('saved_model/baseline-LSTM.h5')
    
    #save weights
    model.save_weights('saved_weights/baseline-LSTM')
    model.save_weights('saved_weights/baseline-LSTM.h5')
  
    return model, fitted_model
    
    
    

In [60]:
def build_model_vocab_restricted(embedding_matrix):
    # change to max word length 
    input_words = Input(shape=(MAX_LEN_SEQ,), dtype='int32')
    embedding = Embedding(MAX_VOCAB_SIZE+1, EMBED_DIM,
                          weights=[embedding_matrix],
                          input_length = MAX_LEN_SEQ,
                          #mask_zero = True
                          trainable = False) (input_words)
    x = Dropout(DROPOUT_RATE)(embedding)
    x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x) #set return_sequence to false when passing to dense
    #x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x)
    
    # Use GlobalMaxPooling
    x = GlobalMaxPooling1D()(x)
    
    # Pass into DENSE layers 
    # Dense nodes total has been calculated as per 
    # https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm
    # (300,000)/5*(128+2) = 462
    x = Dense(462, activation='relu')(x)
    prediction = Dense(2, activation='sigmoid')(x)
    
    model = Model(inputs=input_words, outputs=prediction, name='baseline-LSTM')
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', AUC()])
    
    return model
                           
def train_model_restricted(train_df, val_df, tokenizer):
    # Create processed and padded train and targets
    print('padding_text')
    X_train = text_padder(train_df[TRAIN_TEXT_COL], tokenizer)
    X_val = text_padder(val_df[TRAIN_TEXT_COL], tokenizer)
    y_train = to_categorical(train_df[TRAIN_TARGET_COL])
    y_val = to_categorical(val_df[TRAIN_TARGET_COL])
    
    print('building embedding matrix')
    # build embedding matrix
    embed_matrix = build_embedding_matrix_restricted(tokenizer.word_index, EMBEDDING_FILE)
    
    # build model
    print('building model')
    model = build_model_vocab_restricted(embed_matrix)
    
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_PATH,
                                                 save_weights_only=True,
                                                 verbose=1)
    
    # Connect to tensorboard
    logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1, write_images=True, write_graph=False
                                                          )
    # train model
    print('training model')
    fitted_model = model.fit(X_train, y_train,
                             batch_size = BATCH_SIZE,
                             epochs = NUM_EPOCHS,
                             validation_data=(X_val, y_val),
                             callbacks=[cp_callback, tensorboard_callback],
                             verbose = 1)
    
    #save full model 
    model.save('saved_nn_model/baseline-LSTM') 
    #saves to h5
    model.save('saved_nn_model/baseline-LSTM.h5')
    
    #save weights
    model.save_weights('saved_weights/baseline-LSTM')
    model.save_weights('saved_weights/baseline-LSTM.h5')
    
    return model, fitted_model
    
    
    

In [28]:
%%time
tokenizer = train_tokenizer(train_df[TRAIN_TEXT_COL], MAX_VOCAB_SIZE)

CPU times: user 56.2 s, sys: 32 ms, total: 56.3 s
Wall time: 56.3 s


In [29]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [61]:
model, fitted_model = train_model(train_df, val_df, tokenizer)

padding_text
building embedding matrix
building model
training model
Train on 1443899 samples, validate on 360975 samples
Epoch 1/4
Epoch 00001: saving model to NN_models/cp.ckpt
Epoch 2/4
Epoch 00002: saving model to NN_models/cp.ckpt
Epoch 3/4
Epoch 00003: saving model to NN_models/cp.ckpt
Epoch 4/4
Epoch 00004: saving model to NN_models/cp.ckpt
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: saved_model/baseline-LSTM/assets


In [71]:
# Pass trained tokenizer to convert test results to sequences
X_test = text_padder(test_data[TEST_TEXT_COL], tokenizer)

#convert target col to categorical 
y_test = to_categorical(test_data[TEST_TARGET_COL])

In [None]:
x_

In [98]:
test_data['comment_text'].count()

194640

In [96]:
179192+15448

194640

In [102]:
test_data.duplicated(columns=['id'])

TypeError: duplicated() got an unexpected keyword argument 'columns'

In [89]:
test_data.isnull().sum()

Unnamed: 0                       0
id                               0
target                           0
comment_text                     0
comment_text_clean               1
comment_text_clean2              0
male                             0
female                           0
homosexual_gay_or_lesbian        0
christian                        0
jewish                           0
muslim                           0
black                            0
white                            0
psychiatric_or_mental_illness    0
dtype: int64

In [85]:
test_data['id'].shape

(194642,)

In [83]:
X_test.shape

(194640, 300)

In [72]:
# evaluate on test set
test_evaluate = model.evaluate(X_test, y_test, batch_size = BATCH_SIZE)



In [73]:
test_preds = model.predict(X_test)

In [74]:
# we want all rows and second column
test_preds

array([[9.7055274e-01, 3.0022413e-02],
       [9.9992037e-01, 7.6860189e-05],
       [9.9748015e-01, 2.4578571e-03],
       ...,
       [5.6042463e-01, 4.3444389e-01],
       [6.1085480e-01, 3.9308065e-01],
       [9.9860001e-01, 1.3416409e-03]], dtype=float32)

In [109]:
test_pred_results = pd.DataFrame(test_data2['id'])

In [110]:
test_pred_results.shape

(194640, 1)

In [82]:
test_preds.shape

(194640, 2)

In [111]:
test_pred_results['prediction_prob_0'] = test_preds[:,0]
test_pred_results['prediction_prob_1'] = test_preds[:,1]

In [112]:
test_pred_results


Unnamed: 0,id,prediction_prob_0,prediction_prob_1
0,7000000,0.970553,0.030022
1,7000001,0.999920,0.000077
2,7000002,0.997480,0.002458
3,7000003,0.998343,0.001602
4,7000004,0.051896,0.947942
5,7000005,0.999934,0.000066
6,7000006,0.998190,0.001771
7,7000007,0.996850,0.003145
8,7000008,0.958121,0.041180
9,7000009,0.996766,0.003053


In [114]:
test_pred_results.to_csv('test_pred_results.csv')

In [119]:
# also save to s3
s3 = boto3.client('s3')
s3.upload_file('test_pred_results.csv',bucket,'test_pred_results.csv')

In [65]:
model.save('saved_nn_model/baseline-LSTM')

INFO:tensorflow:Assets written to: saved_nn_model/baseline-LSTM/assets
