# Pipeline to retrieve books from the predicted date of a query text, re-ranked by similarity

In [8]:
#imports

import pandas as pd
pd.options.mode.chained_assignment = None 
import torch
import numpy as np
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import gensim
from gensim.models.doc2vec import TaggedDocument

#### Read the dataset from our preprocessed JSON file

In [9]:
# Load the preprocessed Gutenberg dataset into a DataFrame
df = pd.read_json(path_or_buf='gutenberg-dataset-v2.json')

#### Select our samples from the dataset and split them into train and test subsets

In [28]:
# Restrict the corpus to books dated 1820-1920 (both bounds inclusive)
df = df[df['date'].between(1820, 1920)]

# Observed bounds of the remaining dates; get_date_from_prediction reads these
earliest_date, latest_date = df['date'].min(), df['date'].max()

# Hold out 10% of the rows as the test subset (fixed seed, shuffled)
df_train, df_test = train_test_split(df, test_size=0.1, random_state=42, shuffle=True)

# Keep only the first five held-out rows as queries and display them
df_test = df_test.head(5)
df_test

Unnamed: 0,title,author,date,text_ratio,text,text_len_characters,weights
172381,Voces Populi,F. Anstey,1892,0.930760,known to some of you i dare say as the throstl...,10407,0.058824
209579,"The Heart of the White Mountains, Their Legend...",Samuel Adams Drake,1882,0.965637,at this point and passes over a long stretch o...,11967,0.016949
188083,The Ceramic Art / A Compendium of The History ...,Jennie J. Young,1878,0.951185,in edinburgh scotland and first worked in the ...,12002,0.013889
81657,At Good Old Siwash,George Fitch,1916,0.970252,sitting on the front porch and guarding their ...,10394,0.027778
158316,The Lion's Brood,Duffield Osborne,1904,0.964153,compel duty a look of cunning crossed his face...,10979,0.034483
...,...,...,...,...,...,...,...
149787,The Song of Hiawatha: An Epic Poem,Henry Wadsworth Longfellow,1898,0.946441,arrows only paused to rest beneath a pinetree ...,11125,0.055556
78923,Klytia: A Story of Heidelberg Castle,Adolf Hausrath,1883,0.970222,whether the greek father of the gods was about...,10909,0.020833
8358,A Handbook of the Boer War / With General Map ...,,1910,0.964687,it showed the capture of an armoured train on ...,11568,0.018868
273262,"Samantha Among the Colored Folks: ""My Ideas on...",Marietta Holley,1894,0.961732,on with that same plaintive sweet song and it ...,10246,0.023256


#### Setup and load the pretrained classification model

In [11]:
# Run on the GPU only when torch reports one as available
cuda_available = torch.cuda.is_available()

# Regression configuration: the model emits one continuous value per text
model_args = ClassificationArgs(
    reprocess_input_data=True,
    overwrite_output_dir=True,
    regression=True,
)

# Create a ClassificationModel from our own pre-trained checkpoint directory
bert_model = ClassificationModel(
    model_type='bert',
    model_name='bert-transformer/bert-base-historic-english-cased/outputs',
    num_labels=1,
    args=model_args,
    use_cuda=cuda_available,
)

#### Set up and train the Doc2Vec model for the re-ranking

In [12]:
def train_and_vectorize_docs(train_df, vector_size=40, min_count=4, epochs=30):
    """
    Train a Doc2Vec model on the books of a DataFrame and infer one vector per book.

    Every text snippet is tokenized, and all snippets that share a title are
    merged into a single document so that each book appears only once in the
    re-ranking.

    Parameters:
    - train_df (pandas.DataFrame): Input DataFrame with 'text' and 'title' columns.
      The frame is not modified.
    - vector_size (int): Dimensionality of the document vectors in the Doc2Vec model.
    - min_count (int): Ignores all words with a total frequency lower than this.
    - epochs (int): Number of iterations over the entire corpus during training.

    Returns:
    - tuple: (new_df, model). new_df is a pandas.DataFrame with one row per title
      and the columns 'title', 'tokenized_text' and 'vectors'. model is the trained
      Doc2Vec model; its document tags are the row positions of new_df as strings.
    """
    # Function-scope import: the notebook's import cell does not provide itertools
    from itertools import chain

    # Tokenize into a fresh frame so the caller's DataFrame gains no extra column
    tokenized = train_df[['title']].assign(
        tokenized_text=train_df['text'].apply(word_tokenize)
    )

    # Merge the snippets of each book; chain avoids the quadratic cost of sum(x, [])
    new_df = (
        tokenized.groupby('title')['tokenized_text']
        .agg(lambda snippets: list(chain.from_iterable(snippets)))
        .reset_index()
    )

    # Tag each book with its row position so results can be mapped back via iloc
    train_corpus = [
        TaggedDocument(words=words, tags=[str(i)])
        for i, words in enumerate(new_df['tokenized_text'])
    ]

    # Train the Doc2Vec model
    model = gensim.models.Doc2Vec(vector_size=vector_size, min_count=min_count, epochs=epochs)
    model.build_vocab(train_corpus)
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

    # Infer vectors and keep them in a column (useful for further processing)
    new_df['vectors'] = [model.infer_vector(words) for words in new_df['tokenized_text']]

    return new_df, model

def get_most_similar_books(query, model, df_trained, top_results=10, metadata_df=None):
    """
    Retrieves the most similar books to a given query using a trained model.

    Parameters:
    query (str): The query text.
    model (gensim.models.doc2vec.Doc2Vec): The trained Doc2Vec model.
    df_trained (pandas.DataFrame): The per-book DataFrame whose row positions match
        the model's document tags; must contain a 'title' column.
    top_results (int, optional): The number of top results to retrieve. Defaults to 10.
    metadata_df (pandas.DataFrame, optional): Frame with 'title', 'author' and 'date'
        columns used to look up author and date by title. Defaults to the
        module-level df_train.

    Returns:
    pandas.DataFrame: A DataFrame containing the most similar books with their
    similarity scores, titles, authors, and dates.

    Raises:
    KeyError: If a retrieved title has no row in the metadata frame.
    """
    if metadata_df is None:
        # Backward-compatible default: fall back to the global training split
        metadata_df = df_train

    # Embed the query and ask the model for its nearest document tags
    vector = model.infer_vector(word_tokenize(query))
    similar_docs = model.dv.most_similar(positive=[vector], topn=top_results)

    # Tags are row positions (as strings) in df_trained
    titles = [df_trained.iloc[int(tag)]['title'] for tag, _ in similar_docs]

    # One pass over the metadata instead of two full scans per result; the first
    # row per title is kept, matching the previous `.iloc[0]` lookup
    matches = metadata_df[metadata_df['title'].isin(titles)]
    first_match = matches.drop_duplicates(subset='title').set_index('title')
    meta = first_match.loc[titles]

    data = {
        'similarity': [round(score, 3) for _, score in similar_docs],
        'title': titles,
        'author': meta['author'].tolist(),
        'date': meta['date'].tolist(),
    }
    return pd.DataFrame(data)

#### Helper methods

In [13]:
def get_date_from_prediction(prediction, earliest=None, latest=None):
    """
    Convert a prediction value to a date.

    The prediction is mapped linearly from [-1, 1] onto [earliest, latest] and
    rounded to the nearest year. Predictions outside [-1, 1] are clamped to the
    range boundaries by np.interp.

    Parameters:
    - prediction (float): Prediction value between -1 and 1.
    - earliest (int, optional): Year that -1 maps to. Defaults to the module-level earliest_date.
    - latest (int, optional): Year that 1 maps to. Defaults to the module-level latest_date.

    Returns:
    - int: Date (year between earliest and latest)
    """
    if earliest is None:
        earliest = earliest_date
    if latest is None:
        latest = latest_date

    year = np.interp(prediction, [-1, 1], [earliest, latest])
    # Round instead of truncating: int() alone would turn 1919.75 into 1919 and
    # systematically bias predictions towards earlier years
    return int(round(float(year)))

#### Create predictions and retrieve books from the predicted year, then re-rank them by similarity to the query

In [16]:
query_texts = df_test['text'].tolist()
query_real_dates = df_test['date'].tolist()
# Get predictions for query texts from the BERT model
predictions, raw_outputs = bert_model.predict(query_texts)
# Convert predictions back to dates
prediction_dates = [get_date_from_prediction(pred) for pred in predictions]

# Minimum number of distinct books handed to the Doc2Vec re-ranker per query
min_books = 10
# Bounds of the training dates; widening the window beyond them cannot add books
train_first_date = df_train['date'].min()
train_last_date = df_train['date'].max()

# Get at least 10 books for each query and pass them to the Doc2Vec model
for query, pred_date in zip(query_texts, prediction_dates):
    df_books_pred = df_train[df_train['date'] == pred_date]
    max_distance = max(pred_date - train_first_date, train_last_date - pred_date)
    distance_from_pred_date = 1
    # nunique() ignores missing titles, which the per-title grouping drops as well.
    # The distance bound stops the loop when the training data holds fewer than
    # min_books titles in total (previously an endless loop).
    while (df_books_pred['title'].nunique() < min_books
           and distance_from_pred_date <= max_distance):
        # Add books from the following and previous years until we have enough books
        df_books_pred = pd.concat([
            df_books_pred,
            df_train[df_train['date'] == (pred_date + distance_from_pred_date)],
            df_train[df_train['date'] == (pred_date - distance_from_pred_date)],
        ])
        distance_from_pred_date += 1

    if df_books_pred.empty:
        # Nothing to train Doc2Vec on; report and move on to the next query
        print(f"No books available around predicted year {pred_date}; skipping query.")
        continue

    # Train the Doc2Vec model and get a list of the most similar books
    df_trained, doc2vec_model = train_and_vectorize_docs(df_books_pred)
    similar_docs = get_most_similar_books(query, doc2vec_model, df_trained, top_results=10)

    print(f"Most similar to query \"{query[:100]}...\":")
    # Plain loop for the printing side effect instead of DataFrame.apply
    for _, doc in similar_docs.iterrows():
        print(f"Similarity: {doc['similarity']} | Title: {doc['title']} |",
              f"Author: {doc['author']} | Publish Date: {doc['date']}")

  0%|          | 0/5 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 20%|██        | 1/5 [00:04<00:17,  4.34s/it]
100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
