In [None]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow import keras

from sklearn.model_selection import train_test_split
from sklearn import model_selection, datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import joblib
import pickle

import numpy as np
import pandas as pd

In [None]:
# Choose which variant of the caption dataset to load. Judging by the file
# names, the variants differ in how the classes were (re)sampled.
# premade datasets:
# "dataset/data_500_label_SMOTEsampled.csv"
# "dataset/data_500_label_Oversampled.csv"
# "dataset/data_500_label_Undersampled.csv"
# "dataset/data_500_label_Unbalanced.csv"
dataset_path = "dataset/data_500_label_Oversampled.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows (head() returns 5 rows by default).
df.head()

In [None]:
# Summary statistics of every numeric column, computed separately per 'label' value.
# NOTE(review): this cell groups on 'label' while the train/test split below
# uses df['score'] as the target -- confirm both columns exist in the CSV.
df.groupby(by='label').describe()

In [None]:
# Split captions (inputs) and scores (targets) into training and test sets.
# stratify=df['score'] keeps the class proportions of the target the same in
# both partitions.
# random_state is fixed so the split is reproducible across kernel restarts;
# without it every run trains and evaluates on a different partition, which
# makes reported metrics impossible to compare between runs.
# No test_size is passed, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    df['caption'],
    df['score'],
    stratify=df['score'],
    random_state=42,
)

In [None]:
# TF Hub handles for the BERT text preprocessing model and its matching encoder.
preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both hub models as Keras layers so they can be called directly and
# reused when the classifier is assembled.
bert_preprocess = hub.KerasLayer(preprocess_url)
bert_encoder = hub.KerasLayer(encoder_url)

def get_sentence_embedding(sentences):
    """
    Embed a batch of sentences with the BERT preprocess and encoder layers.

    `sentences` is passed through `bert_preprocess`, the result is fed to
    `bert_encoder`, and the entry stored under the 'pooled_output' key of the
    encoder's output is returned.

    NOTE(review): 'pooled_output' is presumably one vector per input sentence
    rather than per-word embeddings -- confirm against the TF Hub model docs.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

In [None]:
# code used from:
# https://github.com/codebasics/deep-learning-keras-tf-tutorial/blob/master/47_BERT_text_classification/BERT_email_classification-handle-imbalance.ipynb

# --- BERT part: raw strings -> preprocessing -> encoder outputs ---
# shape=() with dtype string: each example is a single scalar string.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Classification head on top of the encoder's pooled output ---
# Dropout (rate 0.1) followed by a single sigmoid unit, i.e. the model emits
# one value in [0, 1] per input text.
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
probability = output_layer(dropout_layer(outputs['pooled_output']))

# Tie the string input to the sigmoid output to form the end-to-end model.
model = tf.keras.Model(inputs=[text_input], outputs=[probability])

# Metrics reported during fit/evaluate in addition to the loss.
METRICS = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=METRICS,
)

model.summary()

In [None]:
# Train the classifier on the training captions/scores for 10 passes over the data.
model.fit(x=X_train, y=y_train, epochs=10)

In [None]:
# Score the trained model on the held-out test split; the cell output shows the
# loss and the metrics configured at compile time.
model.evaluate(x=X_test, y=y_test)

In [None]:
# Persist only the trained weights (not the architecture) to a checkpoint file.
# 'file-name' is a placeholder; pick a name that identifies the dataset variant.
model.save_weights(filepath='models/file-name.ckpt')

In [None]:
# Restore previously saved weights.
# NOTE: this is an alias, not a copy -- `loaded_model` and `model` refer to the
# same object, so loading weights here also replaces the weights of `model`.
loaded_model = model

# The path must match the one written by the save cell ('models/file-name.ckpt').
# The earlier 'models/filename.ckpt' spelling pointed at a checkpoint that the
# notebook never creates, so a fresh Restart & Run All failed at this cell.
loaded_model.load_weights('models/file-name.ckpt')

#trained models:
#   model_SMOTE
#   model_Oversampled
#   model_Undersampled
#   model_Unbalanced
#   unfortunately the .ckpt files are too large for github

In [None]:
# Predict on the test captions and collapse the model output to a 1-D array.
probabilities = loaded_model.predict(X_test).flatten()

# Threshold at 0.5: strictly greater becomes class 1, everything else class 0.
y_predicted = (probabilities > 0.5).astype(int)

# Per-class precision / recall / F1 against the true test targets.
print(classification_report(y_test, y_predicted))