# Install Libraries

In [None]:
# Mount Google Drive at /gdrive (Colab only). The domain-specific fastText
# model loaded later in this notebook is read from a path under this mount.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
!pip install fasttext
!pip install -q -U "tensorflow-text==2.11.*"
!pip install stanza
!pip install contractions

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import keras
tf.get_logger().setLevel('ERROR')
import nltk
from  nltk.tokenize  import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import stanza
import contractions
import fasttext
import fasttext.util

In [None]:
# Download the NLTK resources and the English fastText vectors.
for nltk_resource in ('wordnet', 'stopwords'):
  nltk.download(nltk_resource)
# NOTE(review): if_exists='ignore' presumably reuses an already downloaded file - confirm.
fasttext.util.download_model('en', if_exists='ignore')

# Data Preparation and Preprocessing

In [None]:
def read_data(path, columns_list):
  """Load an Excel file and keep only the requested columns.

  Args:
    path: Location of the Excel file; passed unchanged to `pd.read_excel`.
    columns_list: Column labels to keep.

  Returns:
    A DataFrame containing only the columns named in `columns_list`.
  """
  # No `.head()` call here: inside a function its result is discarded. Preview
  # the returned frame from the calling cell instead.
  return pd.read_excel(path)[columns_list]

In [None]:
def prepare_data(dataset, features_list, target, target_classes):
  """Extract the text column and integer-encode the target column.

  Args:
    dataset: DataFrame holding the feature and target columns.
    features_list: Feature column names; only the first one is used.
    target: Name of the label column.
    target_classes: Class values; a label is encoded as the position of its
      value in this sequence.

  Returns:
    A tuple `(X, y)` of two lists: the values of the first feature column and
    the encoded labels. Labels not found in `target_classes` are returned
    unchanged. `dataset` itself is not modified.
  """
  # Encode through a lookup table instead of assigning to rows yielded by
  # `iterrows()`: those rows may be copies, so writes to them are not
  # guaranteed to reach the DataFrame. A single lookup per label also avoids
  # re-mapping a label that was already replaced by an index.
  class_to_index = {class_value: index for index, class_value in enumerate(target_classes)}
  X = dataset[features_list[0]].tolist()
  y = [class_to_index.get(label, label) for label in dataset[target].tolist()]
  return X, y

In [None]:
def get_one_hot_vectors(labels, num_classes = None):
  """Convert integer class ids into one-hot lists.

  Args:
    labels: Iterable of integer class ids; each id is used as a list index.
    num_classes: Length of every one-hot vector. When omitted it is the number
      of distinct values found in `labels`. Pass it explicitly when a split
      may not contain every class.

  Returns:
    A list with one list per label, holding 1 at the label's index and 0
    elsewhere.

  Raises:
    IndexError: If a label is not smaller than the vector length.
  """
  labels = list(labels)
  if num_classes is None:
    # Computed once, not once per label.
    num_classes = len(set(labels))
  one_hot_labels = []
  for label in labels:
    row = [0] * num_classes
    row[label] = 1
    one_hot_labels.append(row)
  return one_hot_labels

In [None]:
def whitespace_tokenizer(sent):
  """Return the whitespace-separated tokens of `sent` as a list."""
  tokens = sent.split()
  return tokens

In [None]:
def tokenize_data(data):
  """Tokenize every document with stanza and re-join the tokens with spaces.

  Args:
    data: Indexable sequence of text documents.

  Returns:
    A list of strings, one per document, where the stanza tokens of all
    sentences are separated by single spaces.
  """
  tokenizer = stanza.Pipeline(lang='en', processors='tokenize')

  def _tokenize(text):
    # Flatten the tokens of every sentence into one space-separated string.
    parsed = tokenizer(text)
    pieces = []
    for sentence in parsed.sentences:
      for token in sentence.tokens:
        pieces.append(str(token.text))
    return ' '.join(pieces)

  return [_tokenize(data[position]) for position in range(len(data))]

In [None]:
def expand_data(data):
  """Expand contractions in every sentence, one whitespace token at a time.

  Args:
    data: Iterable of sentences (strings).

  Returns:
    A list of sentences in which every whitespace-separated token has been
    passed through `contractions.fix` and the results joined with spaces.
  """
  return [
    ' '.join([contractions.fix(token) for token in sentence.split()])
    for sentence in data
  ]

In [None]:
def get_pos_tags(data):
  """Tag every document with stanza and collect the `pos` value of each word.

  Args:
    data: Indexable sequence of text documents.

  Returns:
    A list with one entry per document; each entry is the flat list of `pos`
    strings for the words of all sentences in that document.
  """
  tagger = stanza.Pipeline(lang='en', processors='pos, tokenize')
  all_tags = []
  for position in range(len(data)):
    parsed = tagger(data[position])
    document_tags = []
    for sentence in parsed.sentences:
      document_tags.extend([str(word.pos) for word in sentence.words])
    all_tags.append(document_tags)
  return all_tags

In [None]:
def slang_transformer(reviews):
  """Replace slang tokens and abbreviations with their expanded form.

  Every whitespace-delimited token is lower-cased and looked up in the table
  below. A token that matches is replaced by its expansion; every other token
  and all whitespace between tokens are kept exactly as they were. Only whole
  tokens are replaced, so "u" becomes "you" without touching the "u" in "you".

  Args:
    reviews: Iterable of review strings.

  Returns:
    A new list with the expanded reviews; the input is not modified.
  """
  # Keys must be lower case because tokens are lower-cased before the lookup.
  abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
  }

  def _expand(match):
    # Swap one whole token for its expansion, or keep it when it is unknown.
    token = match.group(0)
    return abbreviations.get(token.lower(), token)

  # Strings are immutable, so the expanded text has to be collected into a new
  # list. Substituting on `\S+` touches whole tokens only and keeps the
  # original whitespace.
  return [re.sub(r'\S+', _expand, review) for review in reviews]

In [None]:
def preprocess_data(data):
  """Expand slang and lemmatize every document.

  The documents first go through `slang_transformer`; then each one is run
  through a stanza tokenize+lemma pipeline and rebuilt as the lower-cased
  lemmas of its words joined by single spaces.

  Args:
    data: Indexable sequence of text documents.

  Returns:
    A list of processed strings, one per document.
  """
  # Contraction expansion, spelling correction, stop-word handling and the
  # letters-only regex clean-up are currently disabled in this pipeline.
  data = slang_transformer(data)
  lemmatizer = stanza.Pipeline(lang='en', processors='tokenize,lemma')
  processed_data = []
  for position in range(len(data)):
    parsed = lemmatizer(data[position])
    lemmas = []
    for sentence in parsed.sentences:
      for word in sentence.words:
        lemmas.append(str(word.lemma).lower())
    processed_data.append(' '.join(lemmas))
  return processed_data

In [None]:
def split_train_val_test(X, y, train_ratio, test_ratio, random_state = None):
  """Split data into stratified train, validation and test parts.

  Args:
    X: Samples.
    y: Labels aligned with `X`; also used for stratification.
    train_ratio: Fraction of the whole data set to keep for training.
    test_ratio: Fraction of the whole data set to hold out for testing.
    random_state: Seed for both splits. When omitted, 42 is used so the split
      stays reproducible.

  Returns:
    `(train_ds, train_labels, val_ds, val_labels, test_ds, test_labels)`. The
    validation part receives the remaining `1 - (train_ratio + test_ratio)`
    fraction of the whole data set.

  Raises:
    ValueError: If the ratios leave no room for a validation part.
  """
  seed = 42 if random_state is None else random_state
  val_ratio = 1 - (train_ratio + test_ratio)
  train_ds, test_ds, train_labels, test_labels = train_test_split(
      X, y, test_size=test_ratio, random_state=seed, shuffle = True, stratify = y)
  # The second split only sees the (1 - test_ratio) remainder, so the
  # validation fraction has to be rescaled to stay val_ratio of the WHOLE set.
  val_share = val_ratio / (1 - test_ratio)
  if not 0 < val_share < 1:
    raise ValueError(
        "train_ratio and test_ratio must leave a positive share for validation; "
        f"got train_ratio={train_ratio}, test_ratio={test_ratio}")
  train_ds, val_ds, train_labels, val_labels = train_test_split(
      train_ds, train_labels, test_size=val_share, random_state=seed, stratify = train_labels)
  return train_ds, train_labels, val_ds, val_labels, test_ds, test_labels

# Helper Functions

In [None]:
def calculate_idf(processed_data):
    """Compute log(N / document frequency) for every whitespace token.

    Args:
        processed_data: Sequence of documents (strings).

    Returns:
        A tuple `(idf, vectorizer)`: the IDF values as a list, in the column
        order of the fitted `CountVectorizer`, and that vectorizer.
    """
    vectorizer = CountVectorizer(tokenizer=whitespace_tokenizer, token_pattern = None, lowercase=False)
    term_counts = vectorizer.fit(processed_data).transform(processed_data)
    # Turn counts into presence flags so that summing counts documents, not occurrences.
    term_presence = term_counts.astype(bool)
    docs_per_term = np.array(term_presence.sum(axis=0)).flatten()
    total_docs = len(processed_data)
    idf = np.log(total_docs / (docs_per_term))
    return idf.tolist(), vectorizer

In [None]:
def unique(list1):
  """Return the distinct items of `list1` in order of first appearance."""
  seen = []
  for item in list1:
    if item in seen:
      continue
    seen.append(item)
  return seen

In [None]:
def get_pos_weights(POS_tags, weights = [1, 1, 0.5]):
  """Map every POS tag to a numeric weight.

  Args:
    POS_tags: List of tag lists, one per sentence.
    weights: Three values: the weight for "VERB", the weight for "NOUN" and
      the weight for every other tag, in that order. It is only read.

  Returns:
    A list of weight lists with the same nesting as `POS_tags`.
  """
  # The result needs its own name: reusing `weights` for it would hide the
  # parameter and make every `weights[...]` lookup hit the empty result list.
  tag_to_weight = {"VERB": weights[0], "NOUN": weights[1]}
  default_weight = weights[2]
  all_weights = []
  for sentence_tags in POS_tags:
    all_weights.append([tag_to_weight.get(tag, default_weight) for tag in sentence_tags])
  return all_weights

In [None]:
def get_idf_weights(preprocessed_reviews):
  """Build a token -> IDF lookup table for the given reviews.

  Args:
    preprocessed_reviews: Sequence of review strings, handed to `calculate_idf`.

  Returns:
    A dict mapping each feature name of the fitted vectorizer to its IDF value.
  """
  idf_values, fitted_vectorizer = calculate_idf(preprocessed_reviews)
  return dict(zip(fitted_vectorizer.get_feature_names_out(), idf_values))

# Deep Learning Models Training and Evaluation Settings

In [None]:
def compile_model(classifier_model, loss = None, metrics = None):
  """Compile the given model with the Adam optimizer.

  Args:
    classifier_model: Model to compile; it is compiled in place.
    loss: Loss to use. When omitted, a new
      `tf.keras.losses.CategoricalCrossentropy()` is created.
    metrics: Metrics to track. When omitted, new `tf.metrics.Recall()` and
      `tf.metrics.Precision()` objects are created.

  Returns:
    The same `classifier_model`, now compiled.
  """
  # Defaults are created per call so that two compiled models never share the
  # same loss or metric objects.
  if loss is None:
    loss = tf.keras.losses.CategoricalCrossentropy()
  if metrics is None:
    metrics = [tf.metrics.Recall(), tf.metrics.Precision()]
  # Compile the model that was passed in; it must not be replaced by a newly
  # built one.
  classifier_model.compile(optimizer="adam",
                           loss=loss,
                           metrics=metrics)
  return classifier_model

In [None]:
def prepare_callbacks(names = ['early_stop', 'reduceLR']):
  """Create the Keras callbacks selected by `names`.

  Args:
    names: Container of callback keys. 'early_stop' adds an `EarlyStopping`
      callback and 'reduceLR' adds a `ReduceLROnPlateau` callback; both
      monitor "val_loss".

  Returns:
    A list with the requested callbacks, early stopping first.
  """
  # (key, factory) pairs, listed in the order the callbacks appear in the result.
  factories = (
    ('early_stop', lambda: tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta = 0.0001,
        patience = 20,
        verbose=1,
        mode="min",
        restore_best_weights=True)),
    ('reduceLR', lambda: tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=10,
        verbose=1,
        mode="min",
        min_delta=0.0001,
        cooldown=0,
        min_lr=0)),
  )
  return [make_callback() for key, make_callback in factories if key in names]

In [None]:
def train_model(model, train_ds, tr_labels, val_ds, val_labels, epochs = 50, batch_size = 32, callbacks = []):
  """Fit `model` and return the training history.

  Args:
    model: Compiled model exposing a Keras-style `fit`.
    train_ds: Training inputs; converted with `tf.constant`.
    tr_labels: Training targets; converted with `tf.constant`.
    val_ds: Validation inputs; converted with `tf.constant`.
    val_labels: Validation targets; converted with `tf.constant`.
    epochs: Maximum number of epochs.
    batch_size: Batch size.
    callbacks: Callbacks forwarded to `fit`.

  Returns:
    Whatever `model.fit` returns (the training history).
  """
  # Validation inputs get the same tensor conversion as the training inputs,
  # so both are handed to `fit` in the same form.
  history = model.fit(x = tf.constant(train_ds),
                      y = tf.constant(tr_labels),
                      validation_data = (tf.constant(val_ds), tf.constant(val_labels)),
                      epochs = epochs, batch_size = batch_size, callbacks = callbacks)
  return history

In [None]:
def evaluate_model(classifier_model, test_data, test_labels):
  """Evaluate the model on the test data and return the result.

  Args:
    classifier_model: Compiled model exposing a Keras-style `evaluate`.
    test_data: Test inputs; converted with `tf.constant`, as `train_model`
      does for its inputs.
    test_labels: Test targets; converted with `tf.constant`.

  Returns:
    Whatever `classifier_model.evaluate` returns, so the caller can keep the
    scores.
  """
  return classifier_model.evaluate(tf.constant(test_data), tf.constant(test_labels))

# Deep Learning Models Architecture

In [None]:
def build_NN_model(shape = None, num_classes = None):
  """Build a dropout + softmax classifier over precomputed feature vectors.

  Args:
    shape: Shape of one input sample, e.g. `(300,)`. A bare integer is treated
      as a 1-D shape. When omitted, a module-level variable named `shape` is
      used if one exists.
    num_classes: Number of output units. When omitted, `len(classes)` of the
      module-level `classes` is used.

  Returns:
    An uncompiled `tf.keras.Model`.

  Raises:
    ValueError: If `shape` is omitted and no module-level `shape` exists.
  """
  # The fallback is resolved at call time. A default of `shape = shape` would
  # be evaluated when the function is defined and fail on a fresh kernel.
  if shape is None:
    if 'shape' not in globals():
      raise ValueError("build_NN_model needs the input shape, e.g. build_NN_model(shape=(300,))")
    shape = globals()['shape']
  if isinstance(shape, (int, np.integer)):
    shape = (int(shape),)
  if num_classes is None:
    num_classes = len(classes)
  inputs = tf.keras.layers.Input(shape=shape)
  net = tf.keras.layers.Dropout(0.1)(inputs)
  net = tf.keras.layers.Dense(num_classes, activation='softmax', name='classifier')(net)
  return tf.keras.Model(inputs, net)

In [None]:
def build_finetuned_model(trainable = False):
  """Build a classifier that runs raw sentences through a TF-Hub encoder.

  Reads the module-level `tfhub_handle_preprocess`, `tfhub_handle_encoder`
  and `classes`, so those must be defined before this is called.

  Args:
    trainable: Passed to the encoder layer's `trainable` argument.

  Returns:
    An uncompiled `tf.keras.Model` taking a batch of strings and producing a
    softmax over `len(classes)` outputs.
  """
  sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name='sentences')
  to_encoder_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = to_encoder_inputs(sentences)
  bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable = trainable, name='BERT_encoder')
  pooled = bert_encoder(encoder_inputs)['pooled_output']
  regularized = tf.keras.layers.Dropout(0.1)(pooled)
  probabilities = tf.keras.layers.Dense(len(classes), activation='softmax', name='classifier')(regularized)
  return tf.keras.Model(sentences, probabilities)

# Get Text Representations

In [None]:
def get_fastText_model(model_type = "domain"):
  """Load one of the two supported fastText models.

  Args:
    model_type: "domain" loads the model stored on the mounted drive;
      "300_dim" loads 'cc.en.300.bin' from the working directory.

  Returns:
    The loaded fastText model.

  Raises:
    ValueError: If `model_type` is neither "domain" nor "300_dim".
  """
  if model_type == "domain":
    return fasttext.load_model("/gdrive/MyDrive/Project/train_100_single_epoch50.bin")
  if model_type == "300_dim":
    return fasttext.load_model('cc.en.300.bin')
  # Fail loudly so an unknown type never reaches an unbound return value.
  raise ValueError(f'Unknown model_type {model_type!r}; expected "domain" or "300_dim"')

In [None]:
def get_fastText_embedding(data):
  """Embed every review as the mean of its fastText word vectors.

  Uses the module-level `ft_model`. Reviews are lower-cased and split on
  whitespace before the lookup.

  Args:
    data: Sequence of review strings.

  Returns:
    A float32 array with one row per review and `ft_model.get_dimension()`
    columns. A review without any word keeps an all-zero row.
  """
  dimension = ft_model.get_dimension()
  embeddings = np.zeros(shape=(len(data), dimension), dtype = 'float32')
  for i, review in enumerate(data):
    words = review.lower().split()
    if not words:
      # Nothing to average; dividing by a zero word count would fill the row
      # with NaN, so the row stays zero.
      continue
    review_embedding = np.zeros(shape=(dimension,), dtype = 'float32')
    for word in words:
      review_embedding = review_embedding + ft_model.get_word_vector(word).astype('float32')
    embeddings[i] = review_embedding / len(words)
  return embeddings

In [None]:
def get_idf_weighted_fastText_embedding(data, weights):
  """Embed every review as the mean of its weighted fastText word vectors.

  Uses the module-level `ft_model` and `whitespace_tokenizer`. Every word
  vector is multiplied by `weights[word]`; the sum is divided by the number of
  tokens in the review.

  Args:
    data: Sequence of review strings.
    weights: Mapping from token to weight; must contain every token.

  Returns:
    A float32 array with one row per review. Reviews without tokens keep an
    all-zero row.
  """
  dimension = ft_model.get_dimension()
  embeddings = np.zeros(shape=(len(data), dimension), dtype = 'float32')
  for row, review in enumerate(data):
    tokens = whitespace_tokenizer(review)
    if len(tokens) == 0:
      continue
    weighted_sum = np.zeros(shape=(dimension,), dtype = 'float32')
    for token in tokens:
      weighted_sum = weighted_sum + ft_model.get_word_vector(token).astype('float32') * weights[token]
    embeddings[row] = weighted_sum / len(tokens)
  return embeddings

In [None]:
def get_pos_weighted_fastText_embedding(data, weights):
  """Embed every review as the mean of its position-weighted word vectors.

  Uses the module-level `ft_model`. Token `j` of review `i` is scaled by
  `weights[i][j]`; tokens whose weight is 0 are skipped and do not count
  towards the mean.

  Args:
    data: Sequence of review strings, split on whitespace.
    weights: One weight list per review, aligned with the review's tokens.

  Returns:
    A float32 array with one row per review. A review with no counted token
    keeps an all-zero row.
  """
  dimension = ft_model.get_dimension()
  embeddings = np.zeros(shape=(len(data), dimension), dtype = 'float32')
  for row, review in enumerate(data):
    weighted_sum = np.zeros(shape=(dimension,), dtype = 'float32')
    counted = 0
    for col, token in enumerate(review.split()):
      token_weight = weights[row][col]
      if token_weight == 0:
        continue
      counted += 1
      weighted_sum = weighted_sum + ft_model.get_word_vector(token).astype('float32') * token_weight
    embeddings[row] = weighted_sum / counted if counted != 0 else weighted_sum
  return embeddings

In [None]:
def get_pos_concatenated_fastText_embedding(data, pos_tags):
  """Embed every review as [mean noun vector, mean verb vector].

  Uses the module-level `ft_model`. Tokens tagged "NOUN" and "VERB" are
  averaged separately; tokens with any other tag are ignored.

  Args:
    data: Sequence of review strings, split on whitespace.
    pos_tags: One tag list per review, aligned with the review's tokens.

  Returns:
    A float32 array with one row per review and twice
    `ft_model.get_dimension()` columns: the noun mean first, then the verb
    mean. A half without matching tokens stays zero.
  """
  dimension = ft_model.get_dimension()
  tracked_tags = ("NOUN", "VERB")
  embeddings = np.zeros(shape=(len(data), dimension * 2), dtype = 'float32')
  for row, review in enumerate(data):
    sums = {tag: np.zeros(shape=(dimension), dtype = 'float32') for tag in tracked_tags}
    counts = {tag: 0 for tag in tracked_tags}
    for col, token in enumerate(review.split()):
      tag = pos_tags[row][col]
      if tag not in sums:
        continue
      counts[tag] += 1
      sums[tag] = sums[tag] + ft_model.get_word_vector(token).astype('float32')
    halves = [sums[tag] / counts[tag] if counts[tag] != 0 else sums[tag] for tag in tracked_tags]
    embeddings[row] = np.concatenate(halves)
  return embeddings

In [None]:
def get_BoW_vectors(processed_data):
  """Build binary bag-of-words vectors (1 if a term occurs, else 0).

  Args:
    processed_data: Sequence of documents (strings).

  Returns:
    A dense array with one row per document and one column per unigram.
  """
  # binary=True makes the vectorizer clip every non-zero count to 1, which
  # replaces an element-by-element Python loop over the dense matrix.
  CountVec = CountVectorizer(ngram_range=(1,1), binary=True)
  return CountVec.fit_transform(processed_data).toarray()

In [None]:
def get_TF_vectors(processed_data):
  """Build unigram term-count vectors.

  Args:
    processed_data: Sequence of documents (strings).

  Returns:
    A dense array with one row per document and one column per unigram.
  """
  unigram_counter = CountVectorizer(ngram_range=(1,1))
  return unigram_counter.fit_transform(processed_data).toarray()

In [None]:
def get_idf_vectors(processed_data):
  """Build TF-IDF vectors with a default `TfidfVectorizer`.

  Args:
    processed_data: Sequence of documents (strings).

  Returns:
    A dense array with one row per document.
  """
  tfidf = TfidfVectorizer()
  return tfidf.fit_transform(processed_data).toarray()

In [None]:
def prepare_BERT_embedding_model(bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'):
  """Look up the TF-Hub encoder and preprocessing handles for a model name.

  Args:
    bert_model_name: Key of the model to use. Defaults to
      'bert_en_uncased_L-12_H-768_A-12'.

  Returns:
    A tuple `(tfhub_handle_encoder, tfhub_handle_preprocess)` of URL strings.
    Both are also printed.

  Raises:
    ValueError: If `bert_model_name` is not one of the known models.
  """
  uncased_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
  map_name_to_handle = {
      'bert_en_uncased_L-12_H-768_A-12':
          'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
      'bert_en_cased_L-12_H-768_A-12':
          'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
      'bert_multi_cased_L-12_H-768_A-12':
          'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
      'albert_en_base':
          'https://tfhub.dev/tensorflow/albert_en_base/2',
      'electra_small':
          'https://tfhub.dev/google/electra_small/2',
      'electra_base':
          'https://tfhub.dev/google/electra_base/2',
      'experts_pubmed':
          'https://tfhub.dev/google/experts/bert/pubmed/2',
      'experts_wiki_books':
          'https://tfhub.dev/google/experts/bert/wiki_books/2',
      'talking-heads_base':
          'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
  }
  map_model_to_preprocess = {
      'bert_en_uncased_L-12_H-768_A-12': uncased_preprocess,
      'bert_en_cased_L-12_H-768_A-12':
          'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
      'bert_multi_cased_L-12_H-768_A-12':
          'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
      'albert_en_base':
          'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
      'electra_small': uncased_preprocess,
      'electra_base': uncased_preprocess,
      'experts_pubmed': uncased_preprocess,
      'experts_wiki_books': uncased_preprocess,
      'talking-heads_base': uncased_preprocess,
  }
  # The 24 small_bert variants follow one naming scheme, with the L, H and A
  # numbers taken from the grids below. They all use version /1 of the encoder
  # and the uncased preprocessing model.
  for l_value in (2, 4, 6, 8, 10, 12):
    for h_value, a_value in ((128, 2), (256, 4), (512, 8), (768, 12)):
      name = f'small_bert/bert_en_uncased_L-{l_value}_H-{h_value}_A-{a_value}'
      map_name_to_handle[name] = f'https://tfhub.dev/tensorflow/{name}/1'
      map_model_to_preprocess[name] = uncased_preprocess

  if bert_model_name not in map_name_to_handle:
    raise ValueError(
        f'Unknown BERT model {bert_model_name!r}; choose one of {sorted(map_name_to_handle)}')

  tfhub_handle_encoder = map_name_to_handle[bert_model_name]
  tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
  print(f'BERT model selected           : {tfhub_handle_encoder}')
  print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')
  return tfhub_handle_encoder, tfhub_handle_preprocess

In [None]:
def get_BERT_embeddings(data):
  """Return the encoder's 'pooled_output' for the given texts.

  Reads the module-level `tfhub_handle_preprocess` and `tfhub_handle_encoder`,
  so those must be defined before this is called. The encoder layer is created
  with `trainable=False`.

  Args:
    data: Texts handed directly to the preprocessing layer.

  Returns:
    The 'pooled_output' entry of the encoder outputs.
  """
  to_encoder_inputs = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = to_encoder_inputs(data)
  bert_encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=False, name='BERT_encoder')
  return bert_encoder(encoder_inputs)['pooled_output']

# Train and Test

## Setup

In [None]:
# change dataset_name, features_list, target according to the dataset
dataset_name = "P1-Golden.xlsx"
projected_columns_list = ["Reviews", "Judgment"]
target = "Judgment"
features_list = ["Reviews"]

# Pick exactly ONE text representation. The options are alternatives: each one
# consumes the raw review strings, so they must not be run one after another.
#   "fasttext_avg"        - plain average of fastText word vectors
#   "fasttext_idf"        - IDF-weighted fastText average
#   "fasttext_pos"        - POS-weighted fastText average
#   "fasttext_pos_concat" - noun average concatenated with verb average
#   "bow"                 - binary bag of words
#   "tf"                  - term counts
#   "tfidf"               - TF-IDF
#   "bert_cls"            - pooled output of a frozen BERT encoder
#   "bert_finetune"       - keep the raw text; BERT is fine-tuned end to end
representation = "fasttext_avg"
# fastText vectors used by the "fasttext_*" options: "domain" or "300_dim"
fasttext_model_type = "domain"

dataset = read_data(dataset_name, projected_columns_list)
classes = unique(dataset[target].tolist())
reviews, labels = prepare_data(dataset, features_list, target, classes)

if representation.startswith("fasttext"):
  # The embedding helpers read this module-level model.
  ft_model = get_fastText_model(model_type = fasttext_model_type)

if representation == "fasttext_avg":
  reviews = get_fastText_embedding(reviews)
elif representation == "fasttext_idf":
  weights = get_idf_weights(reviews)
  reviews = get_idf_weighted_fastText_embedding(reviews, weights)
elif representation == "fasttext_pos":
  pos_tags = get_pos_tags(reviews)
  reviews = tokenize_data(reviews)
  weights = get_pos_weights(pos_tags)
  reviews = get_pos_weighted_fastText_embedding(reviews, weights)
elif representation == "fasttext_pos_concat":
  pos_tags = get_pos_tags(reviews)
  reviews = tokenize_data(reviews)
  reviews = get_pos_concatenated_fastText_embedding(reviews, pos_tags)
elif representation == "bow":
  processed_data = preprocess_data(reviews)
  reviews = get_BoW_vectors(processed_data)
elif representation == "tf":
  processed_data = preprocess_data(reviews)
  reviews = get_TF_vectors(processed_data)
elif representation == "tfidf":
  processed_data = preprocess_data(reviews)
  reviews = get_idf_vectors(processed_data)
elif representation == "bert_cls":
  tfhub_handle_encoder, tfhub_handle_preprocess = prepare_BERT_embedding_model()
  reviews = get_BERT_embeddings(reviews)
elif representation == "bert_finetune":
  # The fine-tuned model takes the raw strings, so `reviews` stays untouched.
  tfhub_handle_encoder, tfhub_handle_preprocess = prepare_BERT_embedding_model()
  model = build_finetuned_model(trainable = True)
else:
  raise ValueError(f"Unknown representation: {representation!r}")

train_ds, train_labels, val_ds, val_labels, test_ds, test_labels = split_train_val_test(reviews, labels, train_ratio = 0.7, test_ratio = 0.15, random_state = 42)
# test_labels keeps the integer ids for the classification report; the one-hot
# copy is stored separately.
train_labels = get_one_hot_vectors(train_labels)
one_test_labels = get_one_hot_vectors(test_labels)
val_labels = get_one_hot_vectors(val_labels)

## Train

In [None]:
# Pick exactly ONE model to train:
#   "nn"        - dropout + softmax network on precomputed feature vectors
#   "finetuned" - BERT fine-tuned end to end on the raw review strings
#   "svm"       - SVM with fixed hyperparameters
#   "svm_grid"  - SVM with a grid search over hyperparameters
model_kind = "nn"

if model_kind in ("nn", "finetuned"):
  if model_kind == "nn":
    # NN Model: the input shape is everything after the sample axis.
    model = build_NN_model(shape = np.asarray(train_ds).shape[1:])
  else:
    # Fine-tuned Model: the builder reads the module-level TF-Hub handles.
    tfhub_handle_encoder, tfhub_handle_preprocess = prepare_BERT_embedding_model()
    model = build_finetuned_model(trainable = True)
  model = compile_model(model)
  callbacks = prepare_callbacks()
  train_model(model, train_ds, train_labels, val_ds, val_labels, epochs = 1000, batch_size = 32, callbacks = callbacks)
elif model_kind in ("svm", "svm_grid"):
  # train_labels are one-hot at this point, but SVC needs one class id per
  # sample, so collapse each row back to the index of its 1.
  svm_train_labels = np.argmax(train_labels, axis = 1)
  if model_kind == "svm":
    # SVM without hyperparameters tuning
    model = SVC(C = 0.1, coef0 = 10,  degree = 2, gamma = 0.1, kernel = 'poly', random_state = 42) # change these parameters as you want
  else:
    # SVM with hyperparameters tuning
    # NOTE(review): the fold count was never specified in this notebook; 5 is an assumption - confirm.
    skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    svm = SVC(random_state = 42)
    model = GridSearchCV(svm, param_grid={ 'gamma' : [0.001, 0.01, 0.1, 1, 10], 'C' : [0.01, 0.1, 1, 10, 100],
                                       'kernel':['poly', 'rbf', 'linear'], 'degree': [2, 3, 4], 'coef0' : [0, 0.1, 1.0, 10]},
                         cv = skf, scoring = 'f1_weighted', verbose = 2)
  model.fit(np.array(train_ds), svm_train_labels)
else:
  raise ValueError(f"Unknown model_kind: {model_kind!r}")

## Test

In [None]:
# Evaluate whichever model the training cell stored in `model`.
if isinstance(model, (SVC, GridSearchCV)):
  # SVM / SVM with Grid Search: predict() already returns one class id per sample.
  prediction_labels = model.predict(np.array(test_ds))
else:
  # NN Model / Fine-tuned Model: predict() returns one score per class, so the
  # predicted class is the index of the highest score.
  evaluate_model(model, test_ds, get_one_hot_vectors(test_labels))
  predictions = model.predict(test_ds)
  prediction_labels = [np.argmax(prediction) for prediction in predictions]

print(classification_report(test_labels, prediction_labels))

if isinstance(model, GridSearchCV):
  print(model.best_params_)