In [2]:
from transformers import XLNetTokenizer, XLNetModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import json

file_path = 'new-headline-train.json'
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)
    
# Load pre-trained XLNet tokenizer and model
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# model = XLNetModel.from_pretrained('xlnet-base-cased')

In [3]:
import json
import pandas as pd


In [4]:
data = pd.DataFrame.from_dict(dataset)

data.head()

Unnamed: 0,id,input,profile
0,300,Generate a headline for the following article:...,[{'text': 'Homeowners sell their homes and buy...
1,301,Generate a headline for the following article:...,[{'text': 'This article summarizes the dysfunc...
2,302,Generate a headline for the following article:...,[{'text': 'Homeowners sell their homes and buy...
3,303,Generate a headline for the following article:...,[{'text': 'While the attractions and bathing s...
4,304,Generate a headline for the following article:...,[{'text': 'While the attractions and bathing s...


In [5]:
query_start = 'Generate a headline for the following article: '
query = dataset[9]['input'][len(query_start):]
print(query)

Private island dining, Michelin-starred meals, breakfast with elephants -- it's all on offer at these top-rated hotels from our 2013 Gold List.


In [6]:
documents = []
for doc in dataset[9]['profile']:
    documents.append(doc['text'])
print(len(documents))

171


In [7]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

In [8]:
def paraphrase_t5(
    input_sentence,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=500
):
    input_ids = tokenizer(
        f'paraphrase: {input_sentence}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    rephrased_versions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return rephrased_versions

# Example usage
input_sentence = query
rephrased_versions = paraphrase_t5(input_sentence)


for i, rephrased_sentence in enumerate(rephrased_versions, start=1):
    print(f"Rephrased Version {i}: {rephrased_sentence}")
    print("-------------------------------------------")



Rephrased Version 1: These top-rated hotels on our 2013 Gold List provide guests with the opportunity to enjoy exclusive island dining, Michelin-starred meals, and elephant breakfast.
-------------------------------------------
Rephrased Version 2: Featuring Michelin-starred meals, elephant breakfast, and private island dining, these top-tier hotels from our 2013 Gold List are truly exceptional.
-------------------------------------------
Rephrased Version 3: The best hotels on our 2013 Gold List offer guests the opportunity to enjoy exclusive island dining, Michelin-starred meals, and elephant breakfast.
-------------------------------------------
Rephrased Version 4: Our 2013 Gold List hotels boast of their exclusive dining options, including Michelin-starred meals, elephant breakfast, and private island dining.
-------------------------------------------
Rephrased Version 5: These hotels on our 2013 Gold List provide guests with exclusive dining experiences on private islands, Miche

In [9]:
# Load pre-trained XLNet tokenizer and model
tokenizer_xlnet = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model_xlnet = XLNetModel.from_pretrained('xlnet-base-cased')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

In [10]:
user_profile = dataset[9]["profile"]
print(len(user_profile))

171


In [11]:
document_embedding = []
# Process each document in the first user profile
for profile in user_profile:
    # Tokenize and extract embeddings for each document
    tokenized_document = tokenizer_xlnet(profile["text"], return_tensors='pt')
    with torch.no_grad():
        document_outputs = model_xlnet(**tokenized_document)
    # Use the last layer output as the document embedding
    document_embedding.append(document_outputs.last_hidden_state.mean(dim=1))

In [12]:
# Initialize lists to store relevant documents and scores for each version
all_relevant_documents = []
all_relevant_scores = []

In [14]:
for i, input_version in enumerate([input_sentence] + rephrased_versions, start=1):
    # Print the input version
    print(f"\nProcessing Input Version {i}: {input_version}\n{'='*50}")

    # Retrieve relevant documents for the current version
    input_documents = []
    input_scores = []

    # Encode the current query version
    tokenized_paraphrase_rob = tokenizer_xlnet(f'paraphrase: {input_version}', return_tensors='pt')
    with torch.no_grad():
            paraphrase_outputs = model_xlnet(**tokenized_paraphrase_rob)
    paraphrase_embedding = paraphrase_outputs.last_hidden_state.mean(dim=1)

    # Calculate cosine similarity between the query and paraphrase embeddings
    # similarity_scores = util.dot_score(paraphrase_embedding, document_embedding)[0].cpu().tolist()
    similarity = []
    for j in range(len(document_embedding)):
        similarity.append(torch.nn.functional.cosine_similarity(document_embedding[j], paraphrase_embedding.squeeze(dim=1)))
    doc_score_title_pairs = list(zip([doc['text'] for doc in dataset[9]['profile']],
                                     [doc['title'] for doc in dataset[9]['profile']],
                                     similarity))
    
    # Sort by decreasing similarity score
    doc_score_title_pairs = sorted(doc_score_title_pairs, key=lambda x: x[2], reverse=True)

    # Calculate the number of documents to retrieve (top 25%)
    num_documents_to_retrieve = int(0.25 * len(doc_score_title_pairs))

    # Check if the number of documents to retrieve is greater than 15
    if num_documents_to_retrieve > 10:
        num_documents_to_retrieve = 10
        
    for text, title, score in doc_score_title_pairs[:num_documents_to_retrieve]:
        print(f"Original Score: {score}")
        print(f"Title: {title}")
        print(f"Document: {text}\n")
        
        # Store relevant document, title, and score for each interpretation
        input_documents.append({'text': text, 'title': title})
        input_scores.append(score)
        
        print("="*50)
        
    # Store relevant documents and scores for each version
    all_relevant_documents.append(input_documents)
    all_relevant_scores.append(input_scores)


Processing Input Version 1: Private island dining, Michelin-starred meals, breakfast with elephants -- it's all on offer at these top-rated hotels from our 2013 Gold List.
Original Score: tensor([0.9791])
Title: Vegas' Mind-Blowingly Expensive Hotel Suites
Document: From $40,000 suites with remote-controlled toilets, to 6,000-square-foot duplexes that are "price upon request," these are the most expensive suites at Las Vegas' top hotels.

Original Score: tensor([0.9774])
Title: The Ridiculous Hotel Package That Comes With a Diamond Ring
Document: We're always intrigued by over-the-top luxury -- $1,954 hotel dinners, $500 spiked milkshakes -- but this latest offering by The New York Palace Hotel takes it to a whole other level.

Original Score: tensor([0.9739])
Title: For Your Daydreaming Pleasure: 7 Spas With Unbelievable Views
Document: All of these spa hotels and resorts are highly rated by our readers -- and all offer transporting natural surroundings to make your stay feel like a 

In [18]:
# Calculate average score for each document across different interpretations
average_scores = {}
for documents, scores in zip(all_relevant_documents, all_relevant_scores):
    for doc_dict, score in zip(documents, scores):
        if isinstance(doc_dict, dict):  # Check if it's a dictionary
            doc_text = doc_dict.get('text', '')  # Use 'get' to provide a default value if 'text' is not present
            doc_title = doc_dict.get('title', '')  # Use 'get' to provide a default value if 'title' is not present
            if doc_text:
                if doc_text not in average_scores:
                    average_scores[doc_text] = {'text': doc_text, 'title': doc_title, 'scores': []}
                average_scores[doc_text]['scores'].append(score)

# Calculate average score for each document
average_documents = [{'title': details['title'], 'text': details['text'], 'average_score': sum(details['scores']) / len(details['scores'])}for details in average_scores.values()]

# Sort documents based on average scores
sorted_documents = sorted(average_documents, key=lambda x: x['average_score'], reverse=True)

retrieval_docs = []

# Output top 10 documents based on average scores
print("\nTop  Documents :")
for doc_dict in sorted_documents[:10]:
    new_doc_dict = {'title': doc_dict['title'], 'text': doc_dict['text']}
    retrieval_docs.append(new_doc_dict)
print(retrieval_docs)


Top  Documents :
[{'title': 'Forget White Sand Beaches, These Shores Are Pink, Black, Even Green.', 'text': '-- By Laura Ratliff, Condé Nast Traveler Sure, white sand and crystalline waters are great, but beaches can come in all shades'}, {'title': 'For Your Daydreaming Pleasure: 7 Spas With Unbelievable Views', 'text': 'All of these spa hotels and resorts are highly rated by our readers -- and all offer transporting natural surroundings to make your stay feel like a true escape.'}, {'title': 'Top 10 Small Cruise Ships In The World', 'text': 'Condé Nast Traveler readers voted in our 2012 Cruise Poll and here are the results: the best small ships in the world.'}, {'title': '9 Restaurants With Secret Service Entrances', 'text': "-- Kate Parham, Condé Nast Traveler With the White House, more than 160 embassies, and Congress all in our nation's capitol"}, {'title': "Vegas' Mind-Blowingly Expensive Hotel Suites", 'text': 'From $40,000 suites with remote-controlled toilets, to 6,000-square-

In [19]:
r_d = ''
count = 1
for i in retrieval_docs:
  # print(i)
  r_d += str(count) + '. '
  r_d += 'The text is: ' + i['text'] + '\n'
  r_d += 'The title for the above text is: ' + i['title'] + '\n'
  count += 1
  # if count > 7:
  #   break

print(r_d)

1. The text is: -- By Laura Ratliff, Condé Nast Traveler Sure, white sand and crystalline waters are great, but beaches can come in all shades
The title for the above text is: Forget White Sand Beaches, These Shores Are Pink, Black, Even Green.
2. The text is: All of these spa hotels and resorts are highly rated by our readers -- and all offer transporting natural surroundings to make your stay feel like a true escape.
The title for the above text is: For Your Daydreaming Pleasure: 7 Spas With Unbelievable Views
3. The text is: Condé Nast Traveler readers voted in our 2012 Cruise Poll and here are the results: the best small ships in the world.
The title for the above text is: Top 10 Small Cruise Ships In The World
4. The text is: -- Kate Parham, Condé Nast Traveler With the White House, more than 160 embassies, and Congress all in our nation's capitol
The title for the above text is: 9 Restaurants With Secret Service Entrances
5. The text is: From $40,000 suites with remote-controlled

In [17]:
print(len(retrieval_docs))

10


PALM USING GOOGLE API

In [20]:
pip install -U -q google-generativeai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
import google.generativeai as palm

import textwrap
import numpy as np
import pandas as pd

In [22]:
palm.configure(api_key='AIzaSyBVo_JbfzrPBpHbueQtOiRozzyFK1QK8D0')

In [23]:
# Ensure 'query' is defined with a meaningful value
query = "The entire list of 154 properties spans the globe, from a tiny one-suite find in Chiang Mai to a luxe safari camp in Tanzania."

In [30]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below.   
  Be sure to respond in a complete sentence, including all relevant background information. I'm providing you with some sample text and title written by me for 7 texts examples.
  Based on the context provided, what's a relevant title for the following news article. Just give titles.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [31]:
passage = r_d

In [32]:
prompt = make_prompt(query, passage)
print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.   
  Be sure to respond in a complete sentence, including all relevant background information. I'm providing you with some sample text and title written by me for 7 texts examples.
  Based on the context provided, what's a relevant title for the following news article. Just give titles.
  QUESTION: 'The entire list of 154 properties spans the globe, from a tiny one-suite find in Chiang Mai to a luxe safari camp in Tanzania.'
  PASSAGE: '1. The text is: -- By Laura Ratliff, Condé Nast Traveler Sure, white sand and crystalline waters are great, but beaches can come in all shades The title for the above text is: Forget White Sand Beaches, These Shores Are Pink, Black, Even Green. 2. The text is: All of these spa hotels and resorts are highly rated by our readers -- and all offer transporting natural surroundings to make your stay feel like a true escape. The title for the abo

In [33]:
text_models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]

text_model = text_models[0]

In [34]:
temperature = 0.5
answer = palm.generate_text(prompt=prompt,
                            model=text_model,
                            candidate_count=3,
                            temperature=temperature,
                            max_output_tokens=1000)

In [35]:
for i, candidate in enumerate(answer.candidates):
  print(f"Candidate {i}: {candidate['output']}\n")

Candidate 0: Conde Nast Traveler's 2016 Readers' Choice Awards: Best Hotels in the World

Candidate 1: The World's 100 Best Hotels 2018

Candidate 2: The World's Best Small Hotels for 2016

