# Pipeline to retrieve books from the predicted date of a query text, re-ranked by similarity to the query

In [86]:
#imports

import pandas as pd
pd.options.mode.chained_assignment = None 
import torch
import seaborn as sns
import numpy as np
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import gensim
from gensim.models.doc2vec import TaggedDocument

#### Read the dataset from our preprocessed JSON file

In [3]:
# Location of the preprocessed dataset (JSON) that the whole notebook works on.
DATASET_PATH = 'gutenberg-dataset-v2.json'
df = pd.read_json(DATASET_PATH)

#### Select our samples from the dataset and split them into train and test subsets

In [4]:
# Restrict the dataset to rows dated 1820..1920 (both bounds inclusive).
df = df[df['date'].between(1820, 1920)]

earliest_date, latest_date = df['date'].min(), df['date'].max()

# Per year: draw a seeded sample of 100 rows when more than 100 exist,
# otherwise keep every row of that year.
samples_list = []
for year in range(earliest_date, latest_date + 1):
    df_year = df[df['date'] == year]
    samples_list.append(df_year.sample(100, random_state=42) if len(df_year) > 100 else df_year)

df_samples = pd.concat(samples_list)
# Regression target: the year mapped linearly from [earliest_date, latest_date] onto [-1, 1].
df_samples['labels'] = np.interp(df_samples['date'], [earliest_date, latest_date], [-1, 1])

# 80/20 train/test split with a fixed seed; only the first 10 test rows are used as queries.
df_train, df_test = train_test_split(df_samples, test_size=0.2, random_state=42, shuffle=True)
df_test = df_test.iloc[:10]
df_test

Unnamed: 0,title,author,date,text_ratio,text,text_len_characters,weights,labels
18261,The Piazza Tales,Herman Melville,1856,0.962312,huskily continued don benito painfully turning...,11497,0.025641,-0.28
38208,The Spell of Egypt,Robert Hichens,1911,0.970044,in the statue she is presented to us as a lime...,10691,0.071429,0.82
148107,Asbeïn: From the Life of a Virtuoso,Ossip Schubin,1890,0.960506,an upper story the maid goes happy to be relea...,11165,0.045455,0.4
33405,"Sir Thomas More, or, Colloquies on the Progres...",Robert Southey,1824,0.974706,colloquy ivfeudal slaverygrowth of pauperism t...,11266,0.076923,-0.92
161091,Recollections of the Civil War / With the Lead...,Charles A. Dana,1863,0.96844,drawn the enemys attention to that quarter she...,10991,0.027027,-0.14
58607,The Ridin' Kid from Powder River,Henry Herbert Knibbs,1919,0.953091,that if the posse could see to shoot with such...,10520,0.018519,0.98
211406,"La Ronge Journal, 1823",George Nelson,1823,0.956444,have mentioned first because as you may see i ...,10466,0.035714,-0.94
281414,Sketches in Holland and Scandinavia,Augustus J. C. Hare,1885,0.964674,sculptures and many most grand originals espec...,11528,0.076923,0.3
170847,"Rossa's Recollections, 1838 to 1898 / Childhoo...",Jeremiah O'Donovan Rossa,1838,0.963252,cut so deep a chasm he fell and bit the bloody...,10519,0.020833,-0.64
56695,The Papers and Writings of Abraham Lincoln — V...,Abraham Lincoln,1862,0.957157,divide his force sending part against each of ...,10920,0.020833,-0.16


#### Set up and load the pretrained classification model

In [5]:
# Model options: regression mode, with input reprocessing and output-dir overwriting enabled.
model_args = ClassificationArgs(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    regression=True,
)

# Run on the GPU only when torch reports that CUDA is available.
cuda_available = torch.cuda.is_available()

# Create a ClassificationModel with a single output label (regression).
bert_model = ClassificationModel(
    'bert',
    'bert-transformer/bert-base-historic-english-cased/outputs',  # Load our own pre-trained model
    num_labels=1,
    args=model_args,
    use_cuda=cuda_available,
)

#### Set up and train the Doc2Vec model for the re-ranking

In [69]:
def train_and_vectorize_docs(train_df, vector_size=40, min_count=4, epochs=30):
    """
    Train a Doc2Vec model on the books in a DataFrame and infer one vector per book.

    All text snippets that share a title are merged into a single document, so each
    book shows up only once among the candidates that are re-ranked later.
    The input DataFrame is not modified.

    Parameters:
    - train_df (pandas.DataFrame): Input DataFrame with 'title' and 'text' columns.
    - vector_size (int): Dimensionality of the document vectors in the Doc2Vec model.
    - min_count (int): Ignores all words with a total frequency lower than this.
    - epochs (int): Number of iterations over the entire dataset during training.

    Returns:
    - tuple: ``(new_df, model)``. ``new_df`` is a pandas.DataFrame with one row per
      title and the columns 'title', 'tokenized_text' and 'vectors'. ``model`` is the
      trained Doc2Vec model; each document is tagged with the string form of its row
      position in ``new_df``, so ``new_df.iloc[int(tag)]`` recovers the book.
    """
    # Tokenize each snippet on a separate frame so the caller's DataFrame
    # (typically a slice of the full dataset) is never written to.
    work_df = train_df[['title']].assign(
        tokenized_text=train_df['text'].apply(word_tokenize)
    )

    # Group the snippets per book, otherwise the same book would appear several
    # times in the re-ranking. The token lists are flattened in a single pass;
    # sum(x, []) would copy the growing list once per snippet.
    new_df = (
        work_df.groupby('title')['tokenized_text']
        .agg(lambda token_lists: [token for tokens in token_lists for token in tokens])
        .reset_index()
    )

    # Prepare the training corpus: one TaggedDocument per book, tagged by row position.
    train_corpus = [
        TaggedDocument(words=words, tags=[str(position)])
        for position, words in enumerate(new_df['tokenized_text'])
    ]

    # Train the Doc2Vec model
    model = gensim.models.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Infer a vector per book and keep it in a column (useful for further processing).
    new_df['vectors'] = [model.infer_vector(words) for words in new_df['tokenized_text']]

    return new_df, model

def get_most_similar_books(query, model, top_results = 10):
    """
    Find the documents of a trained Doc2Vec model that are most similar to a query text.

    Parameters:
    - query (str): Raw query text; it is tokenized with ``word_tokenize``.
    - model: Trained Doc2Vec model providing ``infer_vector`` and ``dv.most_similar``.
    - top_results (int): Maximum number of similar documents to return.

    Returns:
    - The result of ``model.dv.most_similar`` for the inferred query vector; the
      caller reads each entry as a ``(document tag, similarity)`` pair.
    """
    # Tokenize the string directly; wrapping it in a one-row DataFrame adds nothing.
    query_tokens = word_tokenize(query)
    query_vector = model.infer_vector(query_tokens)
    return model.dv.most_similar(query_vector, topn=top_results)

#### Helper methods

In [70]:
def get_date_from_prediction(prediction, date_range=None):
    """
    Convert a model output on the [-1, 1] label scale back into a calendar year.

    Parameters:
    - prediction (float): Regression output; values outside [-1, 1] are clamped
      to the ends of the date range by ``np.interp``.
    - date_range (tuple | None): ``(first_year, last_year)`` that -1 and 1 map to.
      Defaults to the notebook-level ``(earliest_date, latest_date)``.

    Returns:
    - int: The year nearest to the interpolated value.
    """
    if date_range is None:
        date_range = (earliest_date, latest_date)
    year = np.interp(prediction, [-1, 1], date_range)
    # Round to the nearest year. Plain int() truncates, which biases every
    # prediction downward and can turn e.g. 1910.9999999 into 1910.
    return int(np.rint(year))

#### Create predictions, retrieve the books of the predicted year, then re-rank them by similarity to the query

In [None]:
# Queries are the test snippets; titles and real dates come from the same rows.
query_texts = df_test['text'].tolist()
query_titles = df_test['title'].tolist()
query_real_dates = df_test['date'].tolist()

# Predict a label per query and map it back to a year.
predictions, raw_outputs = bert_model.predict(query_texts)
prediction_dates = [get_date_from_prediction(pred) for pred in predictions]

# One trained Doc2Vec model per predicted year, so that queries sharing a
# predicted year do not trigger a full retraining each time.
trained_by_year = {}

for query, query_title, pred_date in zip(query_texts, query_titles, prediction_dates):
    # Candidate books: every row of the dataset dated with the predicted year.
    df_books_from_pred_date = df[df['date'] == pred_date]
    if df_books_from_pred_date.empty:
        print("No books found for predicted year", pred_date, "- skipping", query_title)
        continue

    if pred_date not in trained_by_year:
        # Hand over a copy so that df itself can never be modified by the helper.
        trained_by_year[pred_date] = train_and_vectorize_docs(df_books_from_pred_date.copy())
    df_trained, doc2vec_model = trained_by_year[pred_date]

    similar_docs = get_most_similar_books(query, doc2vec_model, top_results=10)

    # The title is taken from the query's own row instead of searching df by text.
    print("Most similar to", query_title, ":")
    for tag, similarity in similar_docs:
        # Each tag is the row position of the book in df_trained.
        print("Similarity", round(similarity, 3), "|   Title: ", df_trained.iloc[int(tag)].title)
    

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Most similar to The Piazza Tales :
Similarity 0.584 |   Title:  The Atlantic Monthly, Volume 11, No. 68, June, 1863 / A Magazine of Literature, Art, and Politics
Similarity 0.55 |   Title:  The Continental Monthly, Vol. III, No. V,  May, 1863 / Devoted to Literature and National Policy
Similarity 0.538 |   Title:  Canterbury Pieces
Similarity 0.489 |   Title:  The Little London Directory of 1677 / The oldest printed list of the merchants and bankers of London
Similarity 0.445 |   Title:  The Atlantic Monthly, Volume 12, No. 70, August, 1863 / A Magazine of Literature, Art, and Politics
Similarity 0.442 |   Title:  Handbook to the Severn Valley Railway / Illustrative and Descriptive of Places along the Line from Worcester to Shrewsbury
Similarity 0.441 |   Title:  John Marchmont's Legacy, Volumes 1-3
Similarity 0.433 |   Title:  Three Months in the Southern States, April-June 1863
Similarity 0.425 |   Title:  Evidence as to Man's Place in Nature
Similarity 0.406 |   Title:  War Experien

In [67]:
# Inspection: number of distinct titles among the candidate books left in
# df_books_from_pred_date by the most recent loop iteration.
len(df_books_from_pred_date.title.unique())

37

In [50]:
# Inspection: predicted year left over from the most recent loop iteration.
pred_date

1863

In [93]:
# Inspection: title of the last row in df whose text equals the current query
# (query is whatever the loop variable held when this cell was run).
df[df.text==query].title.iloc[-1]

'The Spell of Egypt'