In [None]:
# Sentence transformers for sentence embeddings :: https://github.com/UKPLab/sentence-transformers
% pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/cc/75/df441011cd1726822b70fbff50042adb4860e9327b99b346154ead704c44/sentence-transformers-1.2.0.tar.gz (81kB)
[K     |████████████████████████████████| 81kB 6.9MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 27.3MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 26.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)

In [None]:
import pandas as pd
import numpy as np
import string
import spacy
import nltk
import re
from nltk.corpus import stopwords
from tqdm import tqdm as tq

## Modelling
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report , confusion_matrix , accuracy_score
from sklearn.model_selection import train_test_split

## Pre-trained models for sentence embeddings
from sentence_transformers import SentenceTransformer

# nltk.download('stopwords')
# nltk.download('gutenberg')
# nltk.download('punkt')
# stop_words = stopwords.words('english')

In [None]:
## Read the book partitions CSV; later cells use its `partition` and `book_name` columns
books_df = pd.read_csv(filepath_or_buffer='gutenberg_books_partitions.csv')

In [None]:
## Text partitions that will be embedded
sentences = books_df['partition'].values

## Label Encoder turns each book name into an integer target value
book_names = books_df['book_name'].values
label_enc = LabelEncoder()
label_enc.fit(book_names)
labels = label_enc.transform(book_names)

In [None]:
def load_embeddings(embeddings_path, sentences):
  """
     Encode sentences with a pre-trained SentenceTransformer model.

     Args:
       embeddings_path: model name or path handed to `SentenceTransformer`.
       sentences: the sentences passed to the model's `encode` method.

     Returns:
       Whatever `encode` returns for `sentences`; its `.shape` is printed
       before returning.
  """
  encoder = SentenceTransformer(embeddings_path)
  vectors = encoder.encode(sentences)

  ## Report the dimensions of the embedded data
  print("Shape of sentences after embeddings ::", vectors.shape, sep="\n")

  return vectors

In [None]:
def train_clf(sentence_embeddings, labels, estimator='svm'):
  """
     Train a classifier on sentence embeddings and print train/test metrics.

     The data is split 80/20 with `train_test_split(..., test_size=0.2,
     random_state=0)`, the classifier is fit on the training part, and
     accuracy, the confusion matrix and the classification report are printed.

     Args:
       sentence_embeddings: feature matrix, one row per sentence.
       labels: target values aligned with `sentence_embeddings`.
       estimator: name of the classifier to train. Only 'svm' (a default
         `SVC()`) is supported.

     Returns:
       None. All results are printed.

     Raises:
       ValueError: if `estimator` is not a supported name.
  """
  ## Reject unknown estimators before doing any work. Previously an unknown
  ## name left `model` unbound and failed later with an UnboundLocalError.
  supported_estimators = ('svm',)
  if estimator not in supported_estimators:
    raise ValueError(
        "Unsupported estimator {!r}; expected one of {}".format(
            estimator, supported_estimators))

  X_train, X_test, y_train, y_test = train_test_split(sentence_embeddings, labels,
                                                      test_size=0.2, random_state=0)

  ## 'svm' is the only supported name, so reaching here means SVC
  model = SVC()
  model.fit(X_train, y_train)

  train_prediction = model.predict(X_train)
  prediction = model.predict(X_test)
  print("=========================")
  print("\n\nEmbeddings")
  print("Train Accuracy : ", accuracy_score(y_train,train_prediction)*100)
  print("Test Accuracy : ", accuracy_score(y_test,prediction)*100)

  print("\n\t\t TEST DATA METRICS")
  print(confusion_matrix(y_test, prediction))
  print(classification_report(y_test, prediction))

## GloVe Pre-trained
- Use SentenceTransformer to load the pre-trained GloVe model and compute sentence embeddings

In [None]:
## Embed the sentences with the pre-trained averaged GloVe word-embedding model
embeddings_path = 'average_word_embeddings_glove.6B.300d'

sentence_embeddings = load_embeddings(embeddings_path=embeddings_path,
                                      sentences=sentences)

train_clf(sentence_embeddings=sentence_embeddings, labels=labels, estimator='svm')

HBox(children=(FloatProgress(value=0.0, max=441493586.0), HTML(value='')))


Shape of sentences after embeddings ::
(995, 300)


Embeddings
Train Accuracy :  96.98492462311557
Test Accuracy :  83.41708542713567

		 TEST DATA METRICS
[[40  5  1  0  1]
 [ 4 33  2  0  4]
 [ 0  1 23  0  1]
 [ 0  0  3 34  3]
 [ 1  5  2  0 36]]
              precision    recall  f1-score   support

           0       0.89      0.85      0.87        47
           1       0.75      0.77      0.76        43
           2       0.74      0.92      0.82        25
           3       1.00      0.85      0.92        40
           4       0.80      0.82      0.81        44

    accuracy                           0.83       199
   macro avg       0.84      0.84      0.84       199
weighted avg       0.84      0.83      0.84       199



## RoBERTa Pre-trained
- Use SentenceTransformer to load the pre-trained RoBERTa model and compute sentence embeddings. RoBERTa is a transformer-based language model built on the BERT architecture.

In [None]:
## Embed the sentences with the pre-trained stsb-roberta-base-v2 model
embeddings_path = 'stsb-roberta-base-v2'

sentence_embeddings = load_embeddings(embeddings_path=embeddings_path,
                                      sentences=sentences)

train_clf(sentence_embeddings=sentence_embeddings, labels=labels, estimator='svm')

HBox(children=(FloatProgress(value=0.0, max=459724146.0), HTML(value='')))


Shape of sentences after embeddings ::
(995, 768)


Embeddings
Train Accuracy :  99.2462311557789
Test Accuracy :  84.92462311557789

		 TEST DATA METRICS
[[45  1  0  0  1]
 [ 4 31  4  1  3]
 [ 0  0 24  0  1]
 [ 1  0  4 33  2]
 [ 2  2  2  2 36]]
              precision    recall  f1-score   support

           0       0.87      0.96      0.91        47
           1       0.91      0.72      0.81        43
           2       0.71      0.96      0.81        25
           3       0.92      0.82      0.87        40
           4       0.84      0.82      0.83        44

    accuracy                           0.85       199
   macro avg       0.85      0.86      0.84       199
weighted avg       0.86      0.85      0.85       199

