In [1]:
from transformers import XLNetTokenizer, XLNetModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import json

file_path = 'new-headline-train.json'
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)
    
# Load pre-trained XLNet tokenizer and model
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# model = XLNetModel.from_pretrained('xlnet-base-cased')

In [2]:
import json
import pandas as pd


In [3]:
data = pd.DataFrame.from_dict(dataset)

data.head()

Unnamed: 0,id,input,profile
0,300,Generate a headline for the following article:...,[{'text': 'Homeowners sell their homes and buy...
1,301,Generate a headline for the following article:...,[{'text': 'This article summarizes the dysfunc...
2,302,Generate a headline for the following article:...,[{'text': 'Homeowners sell their homes and buy...
3,303,Generate a headline for the following article:...,[{'text': 'While the attractions and bathing s...
4,304,Generate a headline for the following article:...,[{'text': 'While the attractions and bathing s...


In [4]:
query_start = 'Generate a headline for the following article: '
query = dataset[1]['input'][len(query_start):]
print(query)
print(dataset[1]['id'])

Homeowners sell their homes and buy other homes for a variety of reasons including a need to live closer to a place of employment, to be closer to family, to enjoy a better climate, or simply to upgrade. This article is about finding the best sequence of steps in the process.
301


In [5]:
documents = []
for doc in dataset[1]['profile']:
    documents.append(doc['text'])
print(len(documents))

15


In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

In [7]:
def paraphrase_t5(
    input_sentence,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=500
):
    input_ids = tokenizer(
        f'paraphrase: {input_sentence}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    rephrased_versions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return rephrased_versions

# Example usage
input_sentence = query
rephrased_versions = paraphrase_t5(input_sentence)


for i, rephrased_sentence in enumerate(rephrased_versions, start=1):
    print(f"Rephrased Version {i}: {rephrased_sentence}")
    print("-------------------------------------------")



Rephrased Version 1: Homeowners choose to sell their homes and purchase other homes for various reasons such as proximity to work, closeness to family, climate, or upgrades. This article will explore the best way to proceed.
-------------------------------------------
Rephrased Version 2: Many homeowners opt to sell their homes and purchase other houses for various reasons such as proximity to work, closeness to family, climate, or upgrades. This article will discuss the optimal sequence of steps in this process.
-------------------------------------------
Rephrased Version 3: Those who own homes purchase other houses to move or live closer to their families, better climate, or upgrade. The purpose of this article is to determine the appropriate sequence of steps.
-------------------------------------------
Rephrased Version 4: The reason why homeowners opt to sell their homes and buy other homes is due to various reasons, such as being closer than their work or family. They may also b

In [8]:
# Load pre-trained XLNet tokenizer and model
tokenizer_xlnet = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model_xlnet = XLNetModel.from_pretrained('xlnet-base-cased')

In [9]:
user_profile = dataset[3]["profile"]
print(len(user_profile))

171


In [10]:
document_embedding = []
# Process each document in the first user profile
for profile in user_profile:
    # Tokenize and extract embeddings for each document
    tokenized_document = tokenizer_xlnet(profile["text"], return_tensors='pt')
    with torch.no_grad():
        document_outputs = model_xlnet(**tokenized_document)
    # Use the last layer output as the document embedding
    document_embedding.append(document_outputs.last_hidden_state.mean(dim=1))

In [11]:
# Initialize lists to store relevant documents and scores for each version
all_relevant_documents = []
all_relevant_scores = []

In [12]:
for i, input_version in enumerate([input_sentence] + rephrased_versions, start=1):
    # Print the input version
    print(f"\nProcessing Input Version {i}: {input_version}\n{'='*50}")

    # Retrieve relevant documents for the current version
    input_documents = []
    input_scores = []

    # Encode the current query version
    tokenized_paraphrase_rob = tokenizer_xlnet(f'paraphrase: {input_version}', return_tensors='pt')
    with torch.no_grad():
            paraphrase_outputs = model_xlnet(**tokenized_paraphrase_rob)
    paraphrase_embedding = paraphrase_outputs.last_hidden_state.mean(dim=1)

    # Calculate cosine similarity between the query and paraphrase embeddings
    # similarity_scores = util.dot_score(paraphrase_embedding, document_embedding)[0].cpu().tolist()
    similarity = []
    for j in range(len(document_embedding)):
        similarity.append(torch.nn.functional.cosine_similarity(document_embedding[j], paraphrase_embedding.squeeze(dim=1)))
    doc_score_title_pairs = list(zip([doc['text'] for doc in dataset[1]['profile']],
                                     [doc['title'] for doc in dataset[1]['profile']],
                                     similarity))
    
    # Sort by decreasing similarity score
    doc_score_title_pairs = sorted(doc_score_title_pairs, key=lambda x: x[2], reverse=True)

    # Calculate the number of documents to retrieve (top 25%)
    num_documents_to_retrieve = int(0.25 * len(doc_score_title_pairs))

    # Check if the number of documents to retrieve is greater than 15
    if num_documents_to_retrieve > 10:
        num_documents_to_retrieve = 10
        
    for text, title, score in doc_score_title_pairs[:num_documents_to_retrieve]:
        print(f"Original Score: {score}")
        print(f"Title: {title}")
        print(f"Document: {text}\n")
        
        # Store relevant document, title, and score for each interpretation
        input_documents.append({'text': text, 'title': title})
        input_scores.append(score)
        
        print("="*50)
        
    # Store relevant documents and scores for each version
    all_relevant_documents.append(input_documents)
    all_relevant_scores.append(input_scores)


Processing Input Version 1: Homeowners sell their homes and buy other homes for a variety of reasons including a need to live closer to a place of employment, to be closer to family, to enjoy a better climate, or simply to upgrade. This article is about finding the best sequence of steps in the process.
Original Score: tensor([0.9743])
Title: After March 2, Reverse Mortgage Borrowers Will Have to Prove They Are Not Deadbeats
Document: Applicants with plenty of equity in their homes might find that the fully-funded Set-Aside imposes no burden on them at all, in which case the underwriting costs could be avoided. There is no reason why lenders and borrowers should not have that option.

Original Score: tensor([0.9726])
Title: The Tontine: A 17th Century Solution to a 21st Century Problem
Document: The tontine is an investment scheme where each of a group of participants pays a specified sum into a fund and receives a pro rata share of the income generated by the fund, but when a partici

In [13]:
# Calculate average score for each document across different interpretations
average_scores = {}
for documents, scores in zip(all_relevant_documents, all_relevant_scores):
    for doc_dict, score in zip(documents, scores):
        if isinstance(doc_dict, dict):  # Check if it's a dictionary
            doc_text = doc_dict.get('text', '')  # Use 'get' to provide a default value if 'text' is not present
            doc_title = doc_dict.get('title', '')  # Use 'get' to provide a default value if 'title' is not present
            if doc_text:
                if doc_text not in average_scores:
                    average_scores[doc_text] = {'text': doc_text, 'title': doc_title, 'scores': []}
                average_scores[doc_text]['scores'].append(score)

# Calculate average score for each document
average_documents = [{'title': details['title'], 'text': details['text'], 'average_score': sum(details['scores']) / len(details['scores'])}for details in average_scores.values()]

# Sort documents based on average scores
sorted_documents = sorted(average_documents, key=lambda x: x['average_score'], reverse=True)

retrieval_docs = []

# Output top 10 documents based on average scores
print("\nTop  Documents :")
for doc_dict in sorted_documents[:10]:
    new_doc_dict = {'title': doc_dict['title'], 'text': doc_dict['text']}
    retrieval_docs.append(new_doc_dict)
print(retrieval_docs)


Top  Documents :
[{'title': 'After March 2, Reverse Mortgage Borrowers Will Have to Prove They Are Not Deadbeats', 'text': 'Applicants with plenty of equity in their homes might find that the fully-funded Set-Aside imposes no burden on them at all, in which case the underwriting costs could be avoided. There is no reason why lenders and borrowers should not have that option.'}, {'title': 'The Tontine: A 17th Century Solution to a 21st Century Problem', 'text': 'The tontine is an investment scheme where each of a group of participants pays a specified sum into a fund and receives a pro rata share of the income generated by the fund, but when a participant dies their share is divided among those remaining. As the number of participants dwindles, those remaining receive increasingly large distributions.'}, {'title': 'Why and How to Eliminate Mortgage Charges by Third Parties', 'text': 'Third-party settlement costs could be eliminated by implementation of one simple rule: any service requ

In [14]:
r_d = ''
count = 1
for i in retrieval_docs:
  # print(i)
  r_d += str(count) + '. '
  r_d += 'The text is: ' + i['text'] + '\n'
  r_d += 'The title for the above text is: ' + i['title'] + '\n'
  count += 1
  # if count > 7:
  #   break

print(r_d)

1. The text is: Applicants with plenty of equity in their homes might find that the fully-funded Set-Aside imposes no burden on them at all, in which case the underwriting costs could be avoided. There is no reason why lenders and borrowers should not have that option.
The title for the above text is: After March 2, Reverse Mortgage Borrowers Will Have to Prove They Are Not Deadbeats
2. The text is: The tontine is an investment scheme where each of a group of participants pays a specified sum into a fund and receives a pro rata share of the income generated by the fund, but when a participant dies their share is divided among those remaining. As the number of participants dwindles, those remaining receive increasingly large distributions.
The title for the above text is: The Tontine: A 17th Century Solution to a 21st Century Problem
3. The text is: Third-party settlement costs could be eliminated by implementation of one simple rule: any service required by lenders as a condition for t

In [15]:
print(len(retrieval_docs))

4


PALM USING GOOGLE API

In [16]:
pip install -U -q google-generativeai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [17]:
import google.generativeai as palm

import textwrap
import numpy as np
import pandas as pd

In [18]:
palm.configure(api_key='AIzaSyBVo_JbfzrPBpHbueQtOiRozzyFK1QK8D0')

In [19]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below.   
  Be sure to respond in a complete sentence, including all relevant background information. I'm providing you with some sample text and title written by me for 7 texts examples.
  Based on the context provided, what's a relevant title for the following news article. Just give titles.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [20]:
passage = r_d

In [21]:
prompt = make_prompt(query, passage)
print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.   
  Be sure to respond in a complete sentence, including all relevant background information. I'm providing you with some sample text and title written by me for 7 texts examples.
  Based on the context provided, what's a relevant title for the following news article. Just give titles.
  QUESTION: 'Homeowners sell their homes and buy other homes for a variety of reasons including a need to live closer to a place of employment, to be closer to family, to enjoy a better climate, or simply to upgrade. This article is about finding the best sequence of steps in the process.'
  PASSAGE: '1. The text is: Applicants with plenty of equity in their homes might find that the fully-funded Set-Aside imposes no burden on them at all, in which case the underwriting costs could be avoided. There is no reason why lenders and borrowers should not have that option. The title for the above 

In [22]:
text_models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]

text_model = text_models[0]

In [23]:
temperature = 0.5
answer = palm.generate_text(prompt=prompt,
                            model=text_model,
                            candidate_count=3,
                            temperature=temperature,
                            max_output_tokens=1000)

In [24]:
for i, candidate in enumerate(answer.candidates):
  print(f"Candidate {i}: {candidate['output']}\n")

Candidate 0: How to Sell Your Home and Buy Another

Candidate 1: The Seven Steps to a Successful Home Sale

Candidate 2: How to Sell Your Home and Buy Another

