In [1]:
import requests
import re
import pandas as pd
import random
from string import ascii_lowercase
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from urllib.request import urlopen
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.utils import shuffle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Preprocessing

In [2]:
# Download NLTK stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text: remove punctuation, stop words, and non-textual elements
# def clean_text(text):
#     # Remove punctuation
#     text = re.sub(r'[^\w\s]', '', text)
#     # Split into words
#     words = text.split()
#     # Remove stop words and non-textual elements
#     cleaned_words = [word for word in words if word.lower() not in stop_words]
#     # strip underscores
#     cleaned_words = [word.strip('_') for word in cleaned_words]
#     # remove numbers
#     cleaned_words = [word for word in cleaned_words if not word.isnumeric()]
#     # remove words that start with numbers
#     cleaned_words = [word for word in cleaned_words if not word[0].isnumeric()]
#     return ' '.join(cleaned_words)

# Function to create the partitions
def split_into_fixed_partitions(text, partitions=200, words_per_partition=100):
    words = text.split()
    partitions = [words[i:i + 100] for i in range(0, len(words), 100)]
    random_partitions = random.sample(partitions, min(200, len(partitions)))

    return random_partitions

def find_start_end(text):
    # Find the start and end of the main text
    start_pattern = r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"
    end_pattern = r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .+ \*\*\*"

    start_match = re.search(start_pattern, text)
    end_match = re.search(end_pattern, text)

    start_idx = start_match.end() if start_match else 0
    end_idx = end_match.start() if end_match else len(text)

    return text[start_idx:end_idx]

# def process_book(url, label):
#     # Download the book text from the URL
#     response = requests.get(url)
#     response.encoding = 'utf-8'
#     text = response.text

#     # Extract the main text between start and end markers
#     main_text = find_start_end(text)

#     # Clean the main text
#     cleaned_text = clean_text(main_text)

#     # Extract words from the cleaned text
#     words = cleaned_text.split()

#     # Split words into partitions of 100 and take 200 random partitions
#     partitions = [words[i:i + 100] for i in range(0, len(words), 100)]
#     random_partitions = random.sample(partitions, min(200, len(partitions)))

#     return [(label, ' '.join(partition)) for partition in random_partitions]

def process_book(url):
  raw = urlopen(url).read()

  # convert raw format to string and fetch author name of the book
  string_txt = str(raw)
  author_name_match = re.search(r'Author:?[\w\s;:,\'\"]+',string_txt)
  tokens = word_tokenize(string_txt)
  author_name = author_name_match.group(0)[7:len(author_name_match.group(0))]

  #Remove stop words from the text
  stwrd = stopwords.words('english')
  cleaned_string = [word for word in tokens if word.lower() not in stwrd and word.isalpha()]
  cleaned_string = ' '.join(cleaned_string)

  #removing special characters and escape sequences
  cleaned_string = re.sub(r'\\[^,:;]+|[^\x20-\x7E]+','',cleaned_string)
  new_string = find_start_end(cleaned_string)

  #Split the text into partitions
  partitions = split_into_fixed_partitions(cleaned_string, partitions=200, words_per_partition=100)
  partitions = [{"Words" : ' '.join(part), "Label" : author_name} for part in partitions]
  return partitions

# Updated list of Gutenberg book URLs (Same as before, no change needed here)
# book_urls = [
#     'https://www.gutenberg.org/files/1342/1342-0.txt',  # Pride and Prejudice by Jane Austen
#     'https://www.gutenberg.org/files/768/768-0.txt',    # Wuthering Heights by Emily Brontë
#     'https://www.gutenberg.org/files/1260/1260-0.txt',  # Jane Eyre by Charlotte Brontë
#     'https://www.gutenberg.org/files/1400/1400-0.txt',  # Great Expectations by Charles Dickens
#     'https://www.gutenberg.org/files/145/145-0.txt',    # Middlemarch by George Eliot
#     'https://www.gutenberg.org/files/541/541.txt'       # The Age of Innocence by Edith Wharton
# ]

# book_authors = ["Jane Austen", "Emily Bronte", "Charlotte Bronte", "Charles Dickens",  "George Eliot", "Edith Wharton"]
book_urls = [
    'https://www.gutenberg.org/cache/epub/1342/pg1342.txt',  # Pride and Prejudice by Jane Austen
    'https://www.gutenberg.org/cache/epub/768/pg768.txt',    # Wuthering Heights by Emily Brontë
    'https://www.gutenberg.org/cache/epub/1260/pg1260.txt',    # Jane Eyre: An Autobiography by Charlotte Brontë
    'https://www.gutenberg.org/cache/epub/1400/pg1400.txt',    # Great Expectations Charles Dickens
    'https://www.gutenberg.org/cache/epub/145/pg145.txt',      # Middlemarch by George Eliot
    'https://www.gutenberg.org/cache/epub/541/pg541.txt'       # The Age of Innocence by Edith Wharton
]


# Process all books
all_partitions = []

# for url, label in zip(book_urls, book_authors):
for url in book_urls:
    book_partitions = process_book(url)
    all_partitions.extend(book_partitions)

# Convert to DataFrame
partition_df = pd.DataFrame(all_partitions)

# Serialize DataFrame to CSV
partition_df.to_csv('book_partitions_cleaned.csv', index=False)

partition_df = shuffle(partition_df, random_state=69)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
partition_df.head()

Unnamed: 0,Words,Label
543,seated proud grace piano snowy robes queenly a...,Charlotte Bront
1149,wife mean asked indistinct still looked transp...,Edith Wharton
653,would something days formed plan outline besto...,Charles Dickens
934,said wished marry man loved marry Lydgate seve...,George Eliot
668,one called said voice darkness floor want top ...,Charles Dickens


# Bag of Words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer()

In [5]:
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from sklearn.multiclass import OneVsRestClassifier

In [6]:
model_objs = [
    OneVsRestClassifier(SVC(kernel="rbf")),
    RandomForestClassifier(),
    OneVsRestClassifier(GaussianNB()),
    KNeighborsClassifier(),
    OneVsRestClassifier(SGDClassifier()),
    DecisionTreeClassifier(),
    OneVsRestClassifier(AdaBoostClassifier()),
    OneVsRestClassifier(XGBClassifier(random_state=69))
    ]


model_names = [
    "Gaussian SVC",
    "RandomForestClassifier",
    "Naive Bayes",
    "KNeighborsClassifier",
    "SGDClassifier",
    "DecisionTreeClassifier",
    "AdaBoostClassifier",
    "XGBClassifier",
]

In [8]:
partition_df.shape

(1200, 2)

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

kf = KFold(n_splits=10, shuffle=True, random_state=69)
onehot_encoder = OneHotEncoder(sparse=False)

# Lists to store metric values for each model
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

for m_obj, m_name in zip(model_objs, model_names):
  acc_sum = 0
  precision_sum = 0
  recall_sum = 0
  f1_sum = 0
  roc_auc_sum = 0
  for train_index,test_index in kf.split(partition_df):
    train_data = partition_df.iloc[train_index]
    test_data = partition_df.iloc[test_index]
    X_train = train_data.drop(['Label'], axis=1)
    y_train = train_data['Label']
    X_test = test_data.drop(['Label'], axis=1)
    y_test = test_data['Label']
    # One-hot encoding for the label
    y_train_onehot = onehot_encoder.fit_transform(train_data[['Label']])
    y_test_onehot = onehot_encoder.transform(test_data[['Label']])

    y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
    y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

    X_train_bow = bow_vectorizer.fit_transform(X_train['Words'])
    X_test_bow = bow_vectorizer.transform(X_test['Words'])
    X_train_bow = pd.DataFrame(X_train_bow.toarray(), columns=bow_vectorizer.get_feature_names_out())
    X_test_bow = pd.DataFrame(X_test_bow.toarray(), columns=bow_vectorizer.get_feature_names_out())

    m_obj.fit(X_train_bow, y_train_df),
    y_pred = m_obj.predict(X_test_bow)
    acc = accuracy_score(y_pred, y_test_df)
    precision = precision_score(y_pred, y_test_df, average='macro')
    recall = recall_score(y_pred, y_test_df, average='macro')
    f1 = f1_score(y_pred, y_test_df, average='macro')
    roc_auc = roc_auc_score(y_test_df, y_pred,  average='macro')

    acc_sum+=acc
    precision_sum += precision
    recall_sum += recall
    f1_sum += f1
    roc_auc_sum += roc_auc

  # Calculate average metric values for the model
  avg_acc = acc_sum / 10
  avg_precision = precision_sum / 10
  avg_recall = recall_sum / 10
  avg_f1 = f1_sum / 10
  avg_roc_auc = roc_auc_sum / 10

  # Append the average metric values to the respective lists
  accuracy_list.append(avg_acc)
  precision_list.append(avg_precision)
  recall_list.append(avg_recall)
  f1_list.append(avg_f1)
  roc_auc_list.append(avg_roc_auc)


  print("MODEL: {}".format(m_name))
  print("Average Accuracy: {}".format(acc_sum/10))
  print("Average Precision: {}".format(precision_sum/10))
  print("Average Recall: {}".format(recall_sum/10))
  print("Average F1: {}".format(f1_sum/10))
  print("Average ROC AUC: {}".format(roc_auc_sum/10))
  print()


# Plotting the graph
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']

plt.figure(figsize=(10, 8))
plt.subplots_adjust(bottom=0.3)

for i, metric_list in enumerate([accuracy_list, precision_list, recall_list, f1_list, roc_auc_list]):
    plt.plot(model_names, metric_list, label=metrics[i])


plt.title('Model Performance Comparison (Bag of Words)')
plt.xlabel('Models')

# Rotate model names vertically
plt.xticks(rotation=90, ha='center')

plt.ylabel('Metric Value')
plt.legend()
plt.show()

# TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

In [None]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

kf = KFold(n_splits=10, shuffle=True, random_state=69)
onehot_encoder = OneHotEncoder(sparse=False)

# Lists to store metric values for each model
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []


for m_obj, m_name in zip(model_objs, model_names):
  acc_sum = 0
  precision_sum = 0
  recall_sum = 0
  f1_sum = 0
  roc_auc_sum = 0
  for train_index,test_index in kf.split(partition_df):
    train_data = partition_df.iloc[train_index]
    test_data = partition_df.iloc[test_index]
    X_train = train_data.drop(['Label'], axis=1)
    y_train = train_data['Label']
    X_test = test_data.drop(['Label'], axis=1)
    y_test = test_data['Label']
    # One-hot encoding for the label
    y_train_onehot = onehot_encoder.fit_transform(train_data[['Label']])
    y_test_onehot = onehot_encoder.transform(test_data[['Label']])

    y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
    y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train['Words'])
    X_test_tfidf = tfidf_vectorizer.transform(X_test['Words'])
    X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    m_obj.fit(X_train_tfidf, y_train_df),
    y_pred = m_obj.predict(X_test_tfidf)
    acc = accuracy_score(y_pred, y_test_df)
    precision = precision_score(y_pred, y_test_df, average='macro')
    recall = recall_score(y_pred, y_test_df, average='macro')
    f1 = f1_score(y_pred, y_test_df, average='macro')
    roc_auc = roc_auc_score(y_test_df, y_pred, average='macro')

    acc_sum+=acc
    precision_sum += precision
    recall_sum += recall
    f1_sum += f1
    roc_auc_sum += roc_auc

  # Calculate average metric values for the model
  avg_acc = acc_sum / 10
  avg_precision = precision_sum / 10
  avg_recall = recall_sum / 10
  avg_f1 = f1_sum / 10
  avg_roc_auc = roc_auc_sum / 10

  # Append the average metric values to the respective lists
  accuracy_list.append(avg_acc)
  precision_list.append(avg_precision)
  recall_list.append(avg_recall)
  f1_list.append(avg_f1)
  roc_auc_list.append(avg_roc_auc)

  print("MODEL: {}".format(m_name))
  print("Average Accuracy: {}".format(acc_sum/10))
  print("Average Precision: {}".format(precision_sum/10))
  print("Average Recall: {}".format(recall_sum/10))
  print("Average F1: {}".format(f1_sum/10))
  print("Average ROC AUC: {}".format(roc_auc_sum/10))
  print()

  # Plotting the graph
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']

plt.figure(figsize=(10, 8))
plt.subplots_adjust(bottom=0.3)

for i, metric_list in enumerate([accuracy_list, precision_list, recall_list, f1_list, roc_auc_list]):
    plt.plot(model_names, metric_list, label=metrics[i])


plt.title('Model Performance Comparison (TF IDF)')
plt.xlabel('Models')

# Rotate model names vertically
plt.xticks(rotation=90, ha='center')

plt.ylabel('Metric Value')
plt.legend()
plt.show()

# Word embedding

In [None]:
import numpy as np

glove_file_path= 'glove.6B.50d.txt'

# Function to load GloVe embeddings into a dictionary
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Load GloVe embeddings into the dictionary
glove_embeddings = load_glove_embeddings(glove_file_path)

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

# Assuming you have defined 'model' and 'names' somewhere in your code

kf = KFold(n_splits=10, shuffle=True, random_state=69)
onehot_encoder = OneHotEncoder(sparse=False)

# Lists to store metric values for each model
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

for m_obj, m_name in zip(model_objs, model_names):
    acc_sum = 0
    precision_sum = 0
    recall_sum = 0
    f1_sum = 0
    roc_auc_sum = 0

    for train_index, test_index in kf.split(partition_df):
        train_data = partition_df.iloc[train_index]
        test_data = partition_df.iloc[test_index]
        X_train = train_data['Words']
        y_train = train_data['Label']
        X_test = test_data['Words']
        y_test = train_data['Label']

        # Create document embeddings using GloVe
        X_train_glove = np.array([np.mean([glove_embeddings.get(word, np.zeros(50)) for word in sentence.split()], axis=0) for sentence in X_train])
        X_test_glove = np.array([np.mean([glove_embeddings.get(word, np.zeros(50)) for word in sentence.split()], axis=0) for sentence in X_test])

        # One-hot encoding for the label
        y_train_onehot = onehot_encoder.fit_transform(train_data[['Label']])
        y_test_onehot = onehot_encoder.transform(test_data[['Label']])
        y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
        y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

        #Model Training and Prediction
        m_obj.fit(X_train_glove, y_train_df)
        y_pred = m_obj.predict(X_test_glove)

        # Compute Evaluation Metrics
        acc = accuracy_score(y_pred, y_test_df)
        precision = precision_score(y_pred, y_test_df, average='macro')
        recall = recall_score(y_pred, y_test_df, average='macro')
        f1 = f1_score(y_pred, y_test_df, average='macro')
        roc_auc = roc_auc_score(y_test_df, y_pred, average='macro')

        acc_sum += acc
        precision_sum += precision
        recall_sum += recall
        f1_sum += f1
        roc_auc_sum += roc_auc

    # Calculate average metric values for the model
    avg_acc = acc_sum / 10
    avg_precision = precision_sum / 10
    avg_recall = recall_sum / 10
    avg_f1 = f1_sum / 10
    avg_roc_auc = roc_auc_sum / 10

    # Append the average metric values to the respective lists
    accuracy_list.append(avg_acc)
    precision_list.append(avg_precision)
    recall_list.append(avg_recall)
    f1_list.append(avg_f1)
    roc_auc_list.append(avg_roc_auc)

    print("MODEL: {}".format(m_name))
    print("Average Accuracy: {}".format(avg_acc))
    print("Average Precision: {}".format(avg_precision))
    print("Average Recall: {}".format(avg_recall))
    print("Average F1: {}".format(avg_f1))
    print("Average ROC AUC: {}".format(avg_roc_auc))
    print()

# Plotting the graph
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']

plt.figure(figsize=(10, 8))
plt.subplots_adjust(bottom=0.3)

for i, metric_list in enumerate([accuracy_list, precision_list, recall_list, f1_list, roc_auc_list]):
    plt.plot(model_names, metric_list, label=metrics[i])


plt.title('Model Performance Comparison')
plt.xlabel('Models')

# Rotate model names vertically
plt.xticks(rotation=90, ha='center')

plt.ylabel('Metric Value')
plt.legend()
plt.show()

# N-Gram


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

kf = KFold(n_splits=10, shuffle=True, random_state=69)
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')

# Lists to store metric values for each model
accuracy_list = []
precision_list = []
recall_list = []
f1_list = []
roc_auc_list = []

for m_obj, m_name in zip(model_objs, model_names):
  acc_sum = 0
  precision_sum = 0
  recall_sum = 0
  f1_sum = 0
  roc_auc_sum = 0
  for train_index,test_index in kf.split(partition_df):
    train_set, test_set = partition_df.iloc[train_index], partition_df.iloc[test_index]

    X_train = train_set.drop(['Label'], axis=1)
    y_train = train_set['Label']
    X_test = test_set.drop(['Label'], axis=1)
    y_test = test_set['Label']

    min_n, max_n = 1, 2  # Example: Trying n-grams from 1 to 2
    ngram_range_values = [(i, j) for i in range(min_n, max_n + 1) for j in range(i, max_n + 1)]

    # One-hot encoding for the label
    y_train_onehot = onehot_encoder.fit_transform(train_set[['Label']])
    y_test_onehot = onehot_encoder.transform(test_set[['Label']])


    y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
    y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

    max_acc,max_precision,max_recall,max_f1,max_roc_auc = [-1,-1,-1,-1,-1]

    for ngrams in ngram_range_values:
      vec = CountVectorizer(ngram_range = ngrams)
      X_train_vec = vec.fit_transform(X_train['Words'])
      X_train_vec = pd.DataFrame(X_train_vec.toarray(), columns=vec.get_feature_names_out())
      X_test_vec = vec.transform(X_test['Words'])
      X_test_vec = pd.DataFrame(X_test_vec.toarray(), columns=vec.get_feature_names_out())
      m_obj.fit(X_train_vec,y_train_df)
      y_pred = m_obj.predict(X_test_vec)
      acc = accuracy_score(y_pred, y_test_df)
      precision = precision_score(y_pred, y_test_df, average='macro')
      recall = recall_score(y_pred, y_test_df, average='macro')
      f1 = f1_score(y_test_df, y_pred, average='macro')
      roc_auc = roc_auc_score(y_test_df, y_pred, average='macro')

      max_acc = max(max_acc,acc)
      max_precision = max(max_precision,precision)
      max_recall = max(max_recall,recall)
      max_f1 = max(max_f1,f1)
      max_roc_auc = max(max_roc_auc,roc_auc)

    acc_sum+=max_acc
    precision_sum += max_precision
    recall_sum += max_recall
    f1_sum += max_f1
    roc_auc_sum += max_roc_auc

  # Append the average metric values to the respective lists
  accuracy_list.append(acc_sum/10)
  precision_list.append(precision_sum/10)
  recall_list.append(recall_sum/10)
  f1_list.append(f1_sum/10)
  roc_auc_list.append(roc_auc_sum/10)

  print("MODEL: {}".format(m_name))
  print("Average Accuracy: {}".format(acc_sum/10))
  print("Average Precision: {}".format(precision_sum/10))
  print("Average Recall: {}".format(recall_sum/10))
  print("Average F1: {}".format(f1_sum/10))
  print("Average ROC AUC: {}".format(roc_auc_sum/10))
  print()

# Plotting the graph
metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'ROC AUC']

plt.figure(figsize=(10, 8))
plt.subplots_adjust(bottom=0.3)

for i, metric_list in enumerate([accuracy_list, precision_list, recall_list, f1_list, roc_auc_list]):
    plt.plot(model_names, metric_list, label=metrics[i])


plt.title('Model Performance Comparison')
plt.xlabel('Models')

# Rotate model names vertically
plt.xticks(rotation=90, ha='center')

plt.ylabel('Metric Value')
plt.legend()
plt.show()

# BERT

In [None]:
!pip install tensorflow_text

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

In [None]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [None]:
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

In [None]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

In [None]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dense(128, activation='relu', name='hidden_1')(net)
  net = tf.keras.layers.Dropout(0.3)(net)
  net = tf.keras.layers.Dense(6, activation='softmax', name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
print(bert_raw_result)

In [None]:
tf.keras.utils.plot_model(classifier_model)

In [None]:
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.metrics.CategoricalAccuracy()

In [None]:
X = partition_df.drop(['Label'], axis=1)
y = partition_df['Label']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import warnings
import numpy as np
warnings.filterwarnings('ignore')

onehot_encoder = OneHotEncoder(sparse=False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train_onehot = onehot_encoder.fit_transform(np.array(y_train).reshape(-1,1))
y_test_onehot = onehot_encoder.transform(np.array(y_test).reshape(-1,1))

y_train_df = pd.DataFrame(y_train_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))
y_test_df = pd.DataFrame(y_test_onehot, columns=onehot_encoder.get_feature_names_out(['Label']))

In [None]:
y_train_df

In [None]:
epochs = 10
optimizer = tf.keras.optimizers.Adam(0.0001)

In [None]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
classifier_model.summary()

In [None]:
history = classifier_model.fit(X_train,y_train_df, batch_size=128, epochs=epochs, validation_data=(X_test, y_test_df))

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
y_pred = to_categorical(np.argmax(classifier_model.predict(X_test), axis=-1))

In [None]:
y_pred

In [None]:
acc = accuracy_score(y_pred, y_test_df)
precision = precision_score(y_pred, y_test_onehot, average='macro')
recall = recall_score(y_pred, y_test_onehot, average='macro')
f1 = f1_score(y_pred, y_test_onehot, average='macro')
roc_auc = roc_auc_score(y_test_onehot, y_pred, average='macro')

In [None]:
print("Average Accuracy: {}".format(acc))
print("Average Precision: {}".format(precision))
print("Average Recall: {}".format(recall))
print("Average F1: {}".format(f1))
print("Average ROC AUC: {}".format(roc_auc))

In [None]:
classifier_model.save('BERT_classifier.h5')

# Error analysis: BERT