# Neural Network Models

In this notebook we have specified the build for our LSTM model.
We have used tensorflow 2.0. If you are training the model, it is highly recommended to use a GPU and have at least ~32gb of RAM. 

In [3]:
%load_ext tensorboard

In [1]:
# Import general modules
import numpy as np
import pandas as pd 
from ast import literal_eval
import datetime, os

In [2]:
# Import scikit learn modules
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [3]:
# Import tensorflow and keras modules. NOTE: WE ARE USING TENSORFLOW 2.0
import tensorflow as tf
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Embedding, Dense, LSTM, MaxPooling1D, Input, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import AUC


In [23]:
MAX_VOCAB_SIZE = 200000 # there are 563693 words in the vocabulary
MAX_LEN_SEQ = 300
TRAIN_TEXT_COL = 'comment_text_clean2'
TEST_TEXT_COL = 'comment_text_clean2'
TRAIN_TARGET_COL = 'target'
TEST_TARGET_COL = 'target'
EMBED_DIM = 300
EMBEDDING_FILE = 'glove.840B.300d.txt'

DROPOUT_RATE = 0.2
LSTM_UNITS = 128
BATCH_SIZE = 128
NUM_EPOCHS = 4
CHECKPOINT_PATH = "NN_models/cp.ckpt"
CHECKPOINT_DIR = os.path.dirname(CHECKPOINT_PATH)


#### Loading data from S3 for cloud computing

In [19]:
#Breaking down how to access our S3 Bucket files. 
#Put in your own bucket name
bucket = 'gs-capstone' 

#the path to the file you want to load in your S3 Bucket
dataset_file_path_train = 'train_for_nn.csv'
dataset_file_path_test = 'test_for_nn.csv'

#Creating the path, and combining the above
path_train = 's3://{}/{}'.format(bucket, dataset_file_path_train)
path_test = 's3://{}/{}'.format(bucket, dataset_file_path_test) 

In [20]:
import boto3
s3 = boto3.client('s3')
s3.download_file(bucket, dataset_file_path_train, 'train_for_nn.csv')
s3.download_file(bucket, dataset_file_path_test, 'test_for_nn.csv')

# When the data set was saved as a CSV, tokenized column, which was a list was coverted to a string, 
# The converters option changes this back into its list form 
train_data = pd.read_csv('train_for_nn.csv', converters={"comment_text_clean2": literal_eval})
test_data = pd.read_csv('test_for_nn.csv', converters={"comment_text_clean2": literal_eval})

#### Loading data from local machine

In [6]:
#train_data = pd.read_csv('train_for_nn.csv', converters={"comment_text_clean2": literal_eval})
test_data = pd.read_csv('test_for_nn.csv', converters={"comment_text_clean2": literal_eval})

In [11]:
# Drop unnamed col 
train_data.drop(['Unnamed: 0'], axis=1, inplace=True)
test_data.drop(['Unnamed: 0'], axis=1, inplace=True)

In [14]:
# Create train val split, stratify on target
train_df, val_df = train_test_split(train_data, test_size=0.2, stratify=train_data['target'], random_state=1)

In [15]:
# Create and fix tokenizer
def train_tokenizer(train_data, vocab_size):
    # Use Keras tokenizer to create vocabulary dictionary 
    # default arguments will filter punctuation and convert to lower, we do not want this given our use 
    # of pre-trained word embeddings
    tokenizer = text.Tokenizer(num_words = vocab_size, filters='', lower=False)
    tokenizer.fit_on_texts(train_data)
    return tokenizer

# pad tokenized sequences
def text_padder(text, tokenizer):
    return sequence.pad_sequences(tokenizer.texts_to_sequences(text), maxlen=MAX_LEN_SEQ)

# Build embedding matrix
def build_embedding_matrix(word_indexes, EMBEDDING_FILE):
  
    # Used to store words as key and vectors as value
    embedding_dict = {}
    with open(EMBEDDING_FILE) as file:
        # file is formatted word {whitespace} vector
        for line in file:
            pairs = line.split(' ')
           # word is 0 index of pairs
            word = pairs[0]
            vec = pairs[1:]
           #convert vec into a numpy array
            vec = np.asarray(vec, dtype=np.float32)
            embedding_dict[word] = vec
    
    #create the embedding matrix which has dimensions:
    # MAX_VOCAB_SIZE +1 for rows, this means there will be as many rows as words we allow to be part of the feature set.
    # EMBED_DIM is the number of columns, this reflects the dimensions of the word embedding vectors we are using.
    embedding_matrix = np.zeros((len(word_indexes)+1, EMBED_DIM))


    word_count = 0
    for word, i in word_indexes.items():
        # gets the vector to the corresponding word from the previous dictionary and sets it to the variable
        embedding_vector = embedding_dict.get(word)
        # We check whether the embedding_vector is not none (i.e the word is in the embedding index)
        if embedding_vector is not None:
            word_count += 1
            # Append the embedding vector to index i in the embedding matrix 
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix
            
def build_embedding_matrix_restricted(word_indexes, EMBEDDING_FILE):
  
    # Used to store words as key and vectors as value
    embedding_dict = {}
    with open(EMBEDDING_FILE) as file:
        # file is formatted word {whitespace} vector
        for line in file:
            pairs = line.split(' ')
           # word is 0 index of pairs
            word = pairs[0]
            vec = pairs[1:]
           #convert vec into a numpy array
            vec = np.asarray(vec, dtype=np.float32)
            embedding_dict[word] = vec
    
    #create the embedding matrix which has dimensions:
    # MAX_VOCAB_SIZE +1 for rows, this means there will be as many rows as words we allow to be part of the feature set.
    # EMBED_DIM is the number of columns, this reflects the dimensions of the word embedding vectors we are using.
    embedding_matrix = np.zeros((MAX_VOCAB_SIZE+1, EMBED_DIM))

    
    word_count = 0
  
    for word, i in word_indexes.items():
        if word_count <= MAX_VOCAB_SIZE:
            # gets the vector to the corresponding word from the previous dictionary and sets it to the variable
            embedding_vector = embedding_dict.get(word)
            # We check whether the embedding_vector is not none (i.e the word is in the embedding index)
            if embedding_vector is not None:
                word_count += 1
                # Append the embedding vector to index i in the embedding matrix 
                embedding_matrix[i] = embedding_vector
        else:
            break
    return embedding_matrix

In [19]:
# build model

# NOTE: WITH TF2.0 CUDNNLSTM is active by default when there is a GPU available but you must use the default settings.
# SEE https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM for more details

def build_model(embedding_matrix):
    # change to max word length 
    input_words = Input(shape=(MAX_LEN_SEQ,), dtype='int32')
    embedding = Embedding(len(tokenizer.word_index)+1, EMBED_DIM,
                          weights=[embedding_matrix],
                          input_length = MAX_LEN_SEQ,
                          #mask_zero = True
                          trainable = False) (input_words)
    x = Dropout(DROPOUT_RATE)(embedding)
    x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x) #set return_sequence to false when passing to dense
    #x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x)
    
    # Use GlobalMaxPooling
    x = GlobalMaxPooling1D()(x)
    
    # Pass into DENSE layers 
    # Dense nodes total has been calculated as per 
    # https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm
    # (300,000)/5*(128+2) = 462
    x = Dense(462, activation='relu')(x)
    prediction = Dense(2, activation='sigmoid')(x)
    
    model = Model(inputs=input_words, outputs=prediction, name='baseline-LSTM')
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', AUC()])
    
    return model
                           
def train_model(train_df, val_df, tokenizer):
    # Create processed and padded train and targets
    print('padding_text')
    X_train = text_padder(train_df[TRAIN_TEXT_COL], tokenizer)
    X_val = text_padder(val_df[TRAIN_TEXT_COL], tokenizer)
    y_train = to_categorical(train_df[TRAIN_TARGET_COL])
    y_val = to_categorical(val_df[TRAIN_TARGET_COL])
    
    print('building embedding matrix')
    # build embedding matrix
    embed_matrix = build_embedding_matrix(tokenizer.word_index, EMBEDDING_FILE)
    
    # build model
    print('building model')
    model = build_model(embed_matrix)
    
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_PATH,
                                                 save_weights_only=True,
                                                 verbose=1)
    
    # Connect to tensorboard
    #logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    #tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1, write_images=True, 
                                                          #write_graph=False
                                                          #)
    # train model
    print('training model')
    fitted_model = model.fit(X_train, y_train,
                             batch_size = BATCH_SIZE,
                             epochs = NUM_EPOCHS,
                             validation_data=(X_val, y_val),
                             callbacks=[cp_callback],
                             verbose = 1)
    
    #save full model 
    #model.save('saved_model/baseline-LSTM') 
    #saves to h5
    #model.save('saved_model/baseline-LSTM.h5')
    
    #save weights
    #model.save_weights('saved_weights/baseline-LSTM')
    #model.save_weights('saved_weights/baseline-LSTM.h5')
  
    return model, fitted_model

    

In [17]:
def build_model_vocab_restricted(embedding_matrix):
    # change to max word length 
    input_words = Input(shape=(MAX_LEN_SEQ,), dtype='int32')
    embedding = Embedding(MAX_VOCAB_SIZE+1, EMBED_DIM,
                          weights=[embedding_matrix],
                          input_length = MAX_LEN_SEQ,
                          #mask_zero = True
                          trainable = False) (input_words)
    x = Dropout(DROPOUT_RATE)(embedding)
    x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x) #set return_sequence to false when passing to dense
    #x = Bidirectional(LSTM(128, activation='tanh', return_sequences=True))(x)
    
    # Use GlobalMaxPooling
    x = GlobalMaxPooling1D()(x)
    
    # Pass into DENSE layers 
    # Dense nodes total has been calculated as per 
    # https://ai.stackexchange.com/questions/3156/how-to-select-number-of-hidden-layers-and-number-of-memory-cells-in-an-lstm
    # (300,000)/5*(128+2) = 462
    x = Dense(462, activation='relu')(x)
    prediction = Dense(2, activation='sigmoid')(x)
    
    model = Model(inputs=input_words, outputs=prediction, name='baseline-LSTM')
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', AUC()])
    
    return model
                           
def train_model_restricted(train_df, val_df, tokenizer):
    # Create processed and padded train and targets
    print('padding_text')
    X_train = text_padder(train_df[TRAIN_TEXT_COL], tokenizer)
    X_val = text_padder(val_df[TRAIN_TEXT_COL], tokenizer)
    y_train = to_categorical(train_df[TRAIN_TARGET_COL])
    y_val = to_categorical(val_df[TRAIN_TARGET_COL])
    
    print('building embedding matrix')
    # build embedding matrix
    embed_matrix = build_embedding_matrix_restricted(tokenizer.word_index, EMBEDDING_FILE)
    
    # build model
    print('building model')
    model = build_model_vocab_restricted(embed_matrix)
    
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT_PATH,
                                                 save_weights_only=True,
                                                 verbose=1)
    
    # Connect to tensorboard
    logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1, write_images=True, write_graph=False
                                                          )
    # train model
    print('training model')
    fitted_model = model.fit(X_train, y_train,
                             batch_size = BATCH_SIZE,
                             epochs = NUM_EPOCHS,
                             validation_data=(X_val, y_val),
                             callbacks=[cp_callback, tensorboard_callback],
                             verbose = 1)
    
    #save full model 
    #model.save('saved_nn_model/baseline-LSTM') 
    #saves to h5
    #model.save('saved_nn_model/baseline-LSTM.h5')
    
    #save weights
    #model.save_weights('saved_weights/baseline-LSTM')
    #model.save_weights('saved_weights/baseline-LSTM.h5')
    
    return model, fitted_model
    
    
    

In [20]:
%%time
tokenizer = train_tokenizer(train_df[TRAIN_TEXT_COL], MAX_VOCAB_SIZE)

CPU times: user 1min 5s, sys: 160 ms, total: 1min 5s
Wall time: 1min 5s


In [21]:
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [24]:
model, fitted_model = train_model(train_df, val_df, tokenizer)

padding_text
building embedding matrix
building model
training model
Train on 1443899 samples, validate on 360975 samples
Epoch 1/4
Epoch 00001: saving model to NN_models/cp.ckpt
Epoch 2/4
Epoch 00002: saving model to NN_models/cp.ckpt
Epoch 3/4
Epoch 00003: saving model to NN_models/cp.ckpt
Epoch 4/4
Epoch 00004: saving model to NN_models/cp.ckpt


In [26]:
#save full model 
#model.save('saved_model/baseline-LSTM') 
#saves to h5
model.save('saved_model_h5/baseline-LSTM.h5')
    
#save weights
#model.save_weights('saved_weights/baseline-LSTM')
#model.save_weights('saved_weights_h5/baseline-LSTM.h5')

In [27]:
# Pass trained tokenizer to convert test results to sequences
X_test = text_padder(test_data[TEST_TEXT_COL], tokenizer)

#convert target col to categorical 
y_test = to_categorical(test_data[TEST_TARGET_COL])

In [28]:
# evaluate on test set
test_evaluate = model.evaluate(X_test, y_test, batch_size = BATCH_SIZE)



In [29]:
test_preds = model.predict(X_test)

In [74]:
# we want all rows and second column
test_preds

array([[9.7055274e-01, 3.0022413e-02],
       [9.9992037e-01, 7.6860189e-05],
       [9.9748015e-01, 2.4578571e-03],
       ...,
       [5.6042463e-01, 4.3444389e-01],
       [6.1085480e-01, 3.9308065e-01],
       [9.9860001e-01, 1.3416409e-03]], dtype=float32)

In [31]:
test_pred_results = pd.DataFrame(test_data['id'])

In [32]:
test_pred_results['prediction_prob_0'] = test_preds[:,0]
test_pred_results['prediction_prob_1'] = test_preds[:,1]

In [33]:
#save results to csv
test_pred_results.to_csv('test_pred_results.csv')

In [36]:
model2 = load_model('saved_model_h5/baseline-LSTM.h5')

In [51]:
string_test = [['hi', 'my', 'fellow', 'stranger']]
# input needs to be passed as a nested list 
model2.predict(text_padder(string_test, tokenizer))

array([[0.9975815 , 0.00222306]], dtype=float32)

In [53]:
string_test = [['hi my fellow stranger']]
# input needs to be passed as a nested list 
model2.predict(text_padder(string_test, tokenizer))

array([[0.99402195, 0.00534694]], dtype=float32)

In [39]:
model3 = load_model('saved_model/baseline-LSTM')

In [46]:
test_data['comment_text_clean2'].head()

0    [Jeff, Sessions, is, another, one, of, Trump, ...
1    [I, actually, inspected, the, infrastructure, ...
2    [No, it, won, t, ., That, just, wishful, think...
3    [Instead, of, wringing, our, hands, and, nibbl...
4    [how, many, of, you, commenters, have, garbage...
Name: comment_text_clean2, dtype: object

In [119]:
# also save to s3
s3 = boto3.client('s3')
s3.upload_file('test_pred_results.csv',bucket,'test_pred_results.csv')

In [65]:
model.save('saved_nn_model/baseline-LSTM')

INFO:tensorflow:Assets written to: saved_nn_model/baseline-LSTM/assets


#### Assessing Model Performance

In [11]:
# load in test predictions
test_preds = pd.read_csv('test_pred_results.csv')

In [20]:
#drop unnamed column
test_preds.drop('Unnamed: 0', axis=1, inplace=True)
test_data.drop('Unnamed: 0', axis=1, inplace=True)

In [21]:
#merge the predictions onto the test dataframe on id
test_results = test_data.merge(test_preds, how='inner', on='id')

In [41]:
# define identity columns
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
# convert identity and target columns to boolean
for col in identity_columns + ['target']:
    #train_df[col] = np.where(train_df[col] >= 0.5, True, False)
    test_results[col] = np.where(test_results[col] >= 0.5, True, False)
    
# create a binary col for prediction of class 1 (toxic)
test_results['prediction_binary'] = np.where(test_results['prediction_prob_1'] >= 0.5, True, False)

In [46]:
# store the precision, recall, and f1 score for later and print the classification report
nn_precision = precision_score(test_results['target'], test_results['prediction_binary'])
nn_recall = recall_score(test_results['target'], test_results['prediction_binary'])
nn_f1 = f1_score(test_results['target'], test_results['prediction_binary'])

print(classification_report(test_results['target'], test_results['prediction_binary']))

              precision    recall  f1-score   support

       False       0.97      0.98      0.97    179192
        True       0.74      0.63      0.68     15448

    accuracy                           0.95    194640
   macro avg       0.86      0.81      0.83    194640
weighted avg       0.95      0.95      0.95    194640



We can see that the model is very strong at predicting the false class, however not as adept at the cases where toxicity is the case. A recall score of 0.63 suggests we are letting through a number of cases of toxic commentary. This is most likely due to the very large class imbalance we noted during our EDA. The model only has a few cases of toxic comments to train on compared to non-toxic which impairs its ability to learn about what constitutes a toxic comment. 

We will run all models once again with up-sampling and down-sampling applied and see whether this leads to a better preicsion and recall for the positive class. 

In [24]:
# Define subgroup metrics

SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'  # stands for background negative, subgroup positive


# These calculations have been provided by Jigsaw AI for scoring based on the metrics of the kaggle competition
# https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/overview/evaluation

# They work by filtering the relevant dataframe into specific subgroups and using the roc_auc_score metric from sklearn.

def compute_auc(y_true, y_pred):
    try:
        return metrics.roc_auc_score(y_true, y_pred)
    except ValueError:
        return np.nan

def compute_subgroup_auc(df, subgroup, label, model_name):
    subgroup_examples = df[df[subgroup]]
    return compute_auc(subgroup_examples[label], subgroup_examples[model_name])

def compute_bpsn_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup negative examples and the background positive examples."""
    subgroup_negative_examples = df.loc[df[subgroup] & ~df[label]]
    non_subgroup_positive_examples = df.loc[~df[subgroup] & df[label]]
    examples = subgroup_negative_examples.append(non_subgroup_positive_examples)
    return compute_auc(examples[label], examples[model_name])

def compute_bnsp_auc(df, subgroup, label, model_name):
    """Computes the AUC of the within-subgroup positive examples and the background negative examples."""
    subgroup_positive_examples = df.loc[df[subgroup] & df[label]]
    non_subgroup_negative_examples = df.loc[~df[subgroup] & ~df[label]]
    examples = subgroup_positive_examples.append(non_subgroup_negative_examples)
    return compute_auc(examples[label], examples[model_name])

def compute_bias_metrics_for_model(dataset,
                                   subgroups,
                                   model,
                                   label_col,
                                   include_asegs=False):
    """Computes per-subgroup metrics for all subgroups and one model."""
    records = []
    for subgroup in subgroups:
        record = {
            'subgroup': subgroup,
            'subgroup_size': len(dataset.loc[dataset[subgroup]])
        }
        record[SUBGROUP_AUC] = compute_subgroup_auc(dataset, subgroup, label_col, model)
        record[BPSN_AUC] = compute_bpsn_auc(dataset, subgroup, label_col, model)
        record[BNSP_AUC] = compute_bnsp_auc(dataset, subgroup, label_col, model)
        records.append(record)
    return pd.DataFrame(records).sort_values('subgroup_auc', ascending=True)



In [25]:
def calculate_overall_auc(df, model_name):
    true_labels = df[TOXICITY_COLUMN]
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def power_mean(series, p):
    total = sum(np.power(series, p))
    return np.power(total / len(series), 1 / p)

def get_final_metric(bias_df, overall_auc, POWER=-5, OVERALL_MODEL_WEIGHT=0.25):
    bias_score = np.average([
        power_mean(bias_df[SUBGROUP_AUC], POWER),
        power_mean(bias_df[BPSN_AUC], POWER),
        power_mean(bias_df[BNSP_AUC], POWER)
    ])
    return (OVERALL_MODEL_WEIGHT * overall_auc) + ((1 - OVERALL_MODEL_WEIGHT) * bias_score)
    


In [34]:
SUBGROUP_AUC = 'subgroup_auc'
BPSN_AUC = 'bpsn_auc'  # stands for background positive, subgroup negative
BNSP_AUC = 'bnsp_auc'

MODEL_NAME = 'prediction_prob_1'
TOXICITY_COLUMN = 'target'

#log_bias_metrics_df_train = compute_bias_metrics_for_model(train_df, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
#log_final_metric_train = get_final_metric(log_bias_metrics_df_train, calculate_overall_auc(train_df, MODEL_NAME))

nn_bias_metrics_df_test = compute_bias_metrics_for_model(test_results, identity_columns, MODEL_NAME, TOXICITY_COLUMN)
nn_final_metric_test = get_final_metric(nn_bias_metrics_df_test, calculate_overall_auc(test_results, MODEL_NAME))

In [36]:
nn_bias_metrics_df_test

Unnamed: 0,subgroup,subgroup_size,subgroup_auc,bpsn_auc,bnsp_auc
2,homosexual_gay_or_lesbian,1065,0.821304,0.837734,0.964198
6,black,1519,0.834304,0.81044,0.975257
5,muslim,2040,0.847776,0.87006,0.96273
7,white,2452,0.851929,0.827171,0.976014
4,jewish,835,0.898686,0.912396,0.961414
0,male,4386,0.909858,0.918311,0.961982
8,psychiatric_or_mental_illness,511,0.915572,0.908674,0.966953
1,female,5155,0.92329,0.930168,0.962232
3,christian,4226,0.92823,0.949325,0.951141


In [35]:
nn_final_metric_test

0.9204220165385741

In terms of the final bias metric and overall accuracy the results of our LSTM model are very encouraging. In terms of the final weighted AUC, we can see that the model performed significantly better than our classical ML models. Looking at the specific bias subgroups we can see that the model did not particularly struggle with any particular identity group. 

While this is a good result in terms of our stated aim of reducing bias, we are interested to see the impact of adjusting for the existing class imbalance on our model performances. Especially in terms of precision and recall for toxic comments. 

------

### Inference:

Below we have defined a method to use for inference once we have a trained model. Before this we need to have loaded a trained model.

In [89]:
model = tf.keras.models.load_model('saved_baseline_LSTM/')

In [91]:
model.build()

TypeError: build() missing 1 required positional argument: 'input_shape'

In [88]:
model2 = load_model('saved_baseline_LSTM/baseline-LSTM.h5')

ValueError: No model found in config file.

In [83]:
def inference(model, text, tokenizer):
    text = text_padder(text, tokenizer)
    prediction = model.predict(text)[:,1]
    #prediction = np.where(prediction>0.5,1,0)
    
    return prediction
    
    

In [66]:
tokenizer = train_tokenizer(train_data[TRAIN_TEXT_COL], MAX_VOCAB_SIZE)

In [73]:
tokenizer.texts_to_sequences(string_1)

[[8488],
 [949],
 [7305],
 [7305],
 [3482],
 [],
 [2595],
 [3482],
 [8187],
 [7305],
 [284]]

In [84]:
%%time
string_1 = 'Fuck you, you slimy white ballbag'
prediction_1 = inference(model, string_1, tokenizer)

IndexError: tuple index out of range

In [82]:
prediction_1

array([0.00014937, 0.00352895, 0.00458008, 0.00082791, 0.00518829,
       0.00042009, 0.00028408, 0.00352895, 0.00419235, 0.00518829,
       0.00042009, 0.00028408, 0.00352895, 0.00518829, 0.00135115,
       0.00730738, 0.01587632, 0.00224975, 0.00042009, 0.00518829,
       0.00064892, 0.00150058, 0.01587632, 0.00092003, 0.00041714,
       0.00518829, 0.00077131, 0.01062131, 0.00730738, 0.00730738,
       0.00077131, 0.01062128, 0.00085398], dtype=float32)