In [19]:
import pandas as pd 
import numpy as np
import json 
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout, Conv1D, MaxPool1D, GlobalMaxPool1D, Embedding, Activation
from tensorflow import keras
import tensorflow as tf 
from keras.callbacks import EarlyStopping
import tensorflow_hub as hub 
import tensorflow_text as text
from tensorflow.keras import optimizers

In [5]:
df = pd.read_csv('IBC.csv', index_col=False)
df.SENTENCE = df.SENTENCE.apply(gensim.utils.simple_preprocess)

In [4]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=6 ,
)

In [6]:
model.build_vocab(df.SENTENCE, progress_per=1000)


In [7]:
model.train(df.SENTENCE, total_examples=model.corpus_count, epochs=model.epochs)

(591406, 790965)

In [8]:
model.save("./IBC_Vectors.model")

In [23]:
token = Tokenizer(7229)
token.fit_on_texts(df['SENTENCE'])
text = token.texts_to_sequences(df['SENTENCE'])
text = pad_sequences(text, 200)
print(text[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 1566  180  114  107    3  875  460  233    4    1  260    4
    25 3643   15  122    7  661  796    2 1567  395   61   24 13

In [24]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['LABEL'])
y = keras.utils.to_categorical(y)
y[:2]

array([[0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [25]:
x_train, x_test, y_train, y_test = train_test_split(text, y, test_size=0.2, stratify=y)

In [27]:
def gensim_to_keras_embedding(model, train_embeddings=False):
    """Get a Keras 'Embedding' layer with weights set from Word2Vec model's learned word embeddings.

    Parameters
    ----------
    train_embeddings : bool
        If False, the returned weights are frozen and stopped from being updated.
        If True, the weights can / will be further updated in Keras.

    Returns
    -------
    `keras.layers.Embedding`
        Embedding layer, to be used as input to deeper network layers.

    """
    keyed_vectors = model.wv  # structure holding the result of training
    weights = keyed_vectors.vectors  # vectors themselves, a 2D numpy array    
    index_to_key = keyed_vectors.index_to_key  # which row in `weights` corresponds to which word?

    layer = Embedding(
        input_dim=weights.shape[0],
        output_dim=weights.shape[1],
        weights=[weights],
        trainable=train_embeddings,
    )
    return layer

In [33]:
es = EarlyStopping(
    monitor='accuracy',
    patience=5,
    min_delta = 0.1,
    mode='max'
)

In [35]:
keras_model = tf.keras.Sequential()
keras_model.add(gensim_to_keras_embedding(model, train_embeddings=True))
keras_model.add(tf.keras.layers.SpatialDropout1D(0.2))
keras_model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
keras_model.add(tf.keras.layers.Dense(3, activation='softmax'))

METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

learning_rate = 0.0001
optimizer = optimizers.Adam(learning_rate)
    
keras_model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=METRICS)

keras_model.fit(x_train, y_train, batch_size=64, epochs=30, verbose=1, callbacks=[es])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30


<keras.callbacks.History at 0x2dbdcb77070>