# Pipeline to retrieve books from predicted date of a query text reranked by similarity

In [1]:
#imports

import pandas as pd
pd.options.mode.chained_assignment = None 
import torch
import numpy as np
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
from gensim.models.doc2vec import TaggedDocument

  from .autonotebook import tqdm as notebook_tqdm


#### Read the dataset from our preprocessed json file

In [2]:
df = pd.read_json('gutenberg-dataset-v2.json')

#### Selecting our samples from the dataset and split it into train and test subsets

In [9]:
df = df[df['date'] >= 1820]
df = df[df['date'] <= 1920]

earliest_date = df['date'].min()
latest_date = df['date'].max()

samples_list = []
for year in range(earliest_date, latest_date + 1):
    df_year = df[df['date'] == year]
    if len(df_year) > 100:
        samples = df_year.sample(100, random_state=42)
        samples_list.append(samples)
    else:
        samples_list.append(df_year)

df_samples = pd.concat(samples_list)
df_samples['labels'] = df_samples['date'].apply(lambda date: np.interp(date, [earliest_date, latest_date], [-1, 1]))

# Split data into train and test subsets
df_train, df_test = train_test_split(df_samples, test_size=0.2, random_state=42, shuffle=True)
df_test = df_test.head(2)
df_test

Unnamed: 0,title,author,date,text_ratio,text,text_len_characters,weights,labels
18261,The Piazza Tales,Herman Melville,1856,0.962312,huskily continued don benito painfully turning...,11497,0.025641,-0.28
38208,The Spell of Egypt,Robert Hichens,1911,0.970044,in the statue she is presented to us as a lime...,10691,0.071429,0.82


#### Setup and load the pretrained classification model

In [4]:
cuda_available = torch.cuda.is_available()
model_args = ClassificationArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.regression = True

# Create a ClassificationModel
bert_model = ClassificationModel(
    'bert',
    'bert-transformer/bert-base-historic-english-cased/outputs', # Load our own pre-trained model
    num_labels=1,
    args=model_args,
    use_cuda=cuda_available
)

#### Set up and train the Doc2Vec model for the re-ranking

In [5]:
def train_and_vectorize_docs(train_df, vector_size=40, min_count=4, epochs=30):
    """
    Train a Doc2Vec model on a given DataFrame containing textual data and date information.

    Parameters:
    - train_df (pandas.DataFrame): Input DataFrame with 'text' and 'date' columns.
    - vector_size (int): Dimensionality of the document vectors in the Doc2Vec model.
    - min_count (int): Ignores all words with a total frequency lower than this.
    - epochs (int): Number of iterations over the entire dataset during training.

    Returns:
    - pandas.DataFrame: DataFrame containing vectors, original text, date ranges, and other information.
    """

    # Tokenization of each snippet
    train_df['tokenized_text'] = train_df['text'].apply(word_tokenize)

    new_df = train_df.groupby('title')['tokenized_text'].agg(lambda x: sum(x, [])).reset_index()# Grouping books else we have multile instaces of same book in our rerank
    # Preptaring the data for the Doc2Vec model
    train_corpus = []
    for i, words in enumerate(new_df['tokenized_text']):
        tagged_doc = TaggedDocument(words=words, tags=[str(i)])
        train_corpus.append(tagged_doc)

    # Train the Doc2Vec model
    model = gensim.models.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Infer Vectors and save in column (maybe usfull if we want to use them for further processing)
    vectors = [model.infer_vector(words) for words in new_df['tokenized_text']]
    new_df['vectors'] = vectors

    return new_df, model

def get_most_similar_books(query, model, df_trained, top_results = 10):
    df_query = pd.DataFrame({'text': [query]})
    df_query['tokenized_text'] = df_query['text'].apply(word_tokenize)

    vector = model.infer_vector(df_query['tokenized_text'].iloc[0])
    similar_docs = model.dv.most_similar(vector, topn=top_results)

    data = {'similarity': [], 'title': []} #, 'author': [], 'date': []}
    for doc in similar_docs:
        data['title'].append(df_trained.iloc[int(doc[0])]['title'])
        #data['author'].append(df_trained.iloc[int(doc[0])]['author'])
        #data['date'].append(df_trained.iloc[int(doc[0])]['date'])
        data['similarity'].append(round(doc[1], 3))

    return pd.DataFrame(data)

#### Helper methods

In [6]:
def get_date_from_prediction(prediction):
    return int(np.interp(prediction, [-1, 1], [earliest_date, latest_date]))

#### Create predictions and retrieve book of the predicted year, then re-rank them by similarity to the query

In [10]:
query_texts = df_test['text'].tolist()
query_real_dates = df_test['date'].tolist()
predictions, raw_outputs = bert_model.predict(query_texts)
prediction_dates = [get_date_from_prediction(pred) for pred in predictions]

for query, pred_date in zip(query_texts, prediction_dates):
    df_books_from_pred_date = df[df['date'] == pred_date]    # Includes up to 1500 samples

    df_trained, doc2vec_model = train_and_vectorize_docs(df_books_from_pred_date)
    similar_docs = get_most_similar_books(query, doc2vec_model, df_trained, top_results=10)

    print(f"Most similar to query \"{query[:100]}...\":")
    similar_docs.apply(lambda doc: print(f"Similarity: {doc['similarity']} | Title: {doc['title']} by | Publish Date: "), axis=1)


  0%|          | 0/2 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 50%|█████     | 1/2 [00:04<00:04,  4.56s/it]
100%|██████████| 1/1 [00:01<00:00,  1.23s/it]


Most similar to query "huskily continued don benito painfully turning in the half embrace of his servant i have to thank th...":
Similarity: 0.624 | Title: Jack Harkaway in New York; or, The Adventures of the Travelers' Club by | Publish Date: 
Similarity: 0.609 | Title: Harper's Young People, November 11, 1879 / An Illustrated Weekly by | Publish Date: 
Similarity: 0.585 | Title: Harper's Young People, November 18, 1879 / An Illustrated Weekly by | Publish Date: 
Similarity: 0.57 | Title: The Royal Regiment, and Other Novelettes by | Publish Date: 
Similarity: 0.537 | Title: The Campaigns of the British Army at Washington and New Orleans 1814-1815 by | Publish Date: 
Similarity: 0.509 | Title: Andersonville: A Story of Rebel Military Prisons — Volume 2 by | Publish Date: 
Similarity: 0.476 | Title: Andersonville: A Story of Rebel Military Prisons — Volume 4 by | Publish Date: 
Similarity: 0.454 | Title: Harper's Young People, November 4, 1879 / An Illustrated Weekly by | Publish Date: