In [1]:
from transformers import XLNetTokenizer, XLNetModel
from sklearn.metrics.pairwise import cosine_similarity
import torch
import json

file_path = 'tweet.json'
with open(file_path, 'r', encoding='utf-8') as file:
    dataset = json.load(file)
    
# Load pre-trained XLNet tokenizer and model
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
# model = XLNetModel.from_pretrained('xlnet-base-cased')

In [2]:
import json
import pandas as pd

In [3]:
data = pd.DataFrame.from_dict(dataset)
data.head()

Unnamed: 0,id,input,profile
0,600,Paraphrase the following tweet without any exp...,[{'text': 'SARS .. H1N1 .. Air France .. plea...
1,601,Paraphrase the following tweet without any exp...,[{'text': '@kAtrinaDaniels never thought that ...
2,602,Paraphrase the following tweet without any exp...,[{'text': '@mattmaloney I feel cheered up. Wow...
3,603,Paraphrase the following tweet without any exp...,[{'text': 'Night all. Watched Mamma Mia. Did I...
4,604,Paraphrase the following tweet without any exp...,"[{'text': '@hoffifer working, as usual .. Awes..."


In [4]:
query_start = 'Paraphrase the following tweet without any explanation before or after it: '
query = dataset[0]['input'][len(query_start):]
print(query)

I'm currently enjoying the album "Listen to Eason Chan."


In [5]:
documents = []
for doc in dataset[0]['profile']:
    documents.append(doc['text'])
print(len(documents))

24


In [6]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")
model = AutoModelForSeq2SeqLM.from_pretrained("humarin/chatgpt_paraphraser_on_T5_base")

In [7]:
def paraphrase_t5(
    input_sentence,
    num_beams=5,
    num_beam_groups=5,
    num_return_sequences=5,
    repetition_penalty=10.0,
    diversity_penalty=3.0,
    no_repeat_ngram_size=2,
    temperature=0.7,
    max_length=500
):
    input_ids = tokenizer(
        f'paraphrase: {input_sentence}',
        return_tensors="pt", padding="longest",
        max_length=max_length,
        truncation=True,
    ).input_ids

    outputs = model.generate(
        input_ids, temperature=temperature, repetition_penalty=repetition_penalty,
        num_return_sequences=num_return_sequences, no_repeat_ngram_size=no_repeat_ngram_size,
        num_beams=num_beams, num_beam_groups=num_beam_groups,
        max_length=max_length, diversity_penalty=diversity_penalty
    )

    rephrased_versions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return rephrased_versions

# Example usage
input_sentence = query
rephrased_versions = paraphrase_t5(input_sentence)


for i, rephrased_sentence in enumerate(rephrased_versions, start=1):
    print(f"Rephrased Version {i}: {rephrased_sentence}")
    print("-------------------------------------------")



Rephrased Version 1: The album "Listen to Eason Chan" is currently in my favorites list.
-------------------------------------------
Rephrased Version 2: I'm currently enjoying "Listen to Eason Chan."
-------------------------------------------
Rephrased Version 3: My current listening pleasure is the album "Listen to Eason Chan."
-------------------------------------------
Rephrased Version 4: "Listen to Eason Chan" is the one album I'm currently loving.
-------------------------------------------
Rephrased Version 5: At the moment, I am enjoying my new album "Listen to Eason Chan."
-------------------------------------------


In [8]:
# Load pre-trained XLNet tokenizer and model
tokenizer_xlnet = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model_xlnet = XLNetModel.from_pretrained('xlnet-base-cased')

In [9]:
user_profile = dataset[0]["profile"]
print(len(user_profile))

24


In [10]:
document_embedding = []
# Process each document in the first user profile
for profile in user_profile:
    # Tokenize and extract embeddings for each document
    tokenized_document = tokenizer_xlnet(profile["text"], return_tensors='pt')
    with torch.no_grad():
        document_outputs = model_xlnet(**tokenized_document)
    # Use the last layer output as the document embedding
    document_embedding.append(document_outputs.last_hidden_state.mean(dim=1))

In [11]:
# Initialize lists to store relevant documents and scores for each version
all_relevant_documents = []
all_relevant_scores = []

In [14]:
for i, input_version in enumerate([input_sentence] + rephrased_versions, start=1):
    # Print the input version
    print(f"\nProcessing Input Version {i}: {input_version}\n{'='*50}")

    # Retrieve relevant documents for the current version
    input_documents = []
    input_scores = []

    # Encode the current query version
    tokenized_paraphrase_rob = tokenizer_xlnet(f'paraphrase: {input_version}', return_tensors='pt')
    with torch.no_grad():
            paraphrase_outputs = model_xlnet(**tokenized_paraphrase_rob)
    paraphrase_embedding = paraphrase_outputs.last_hidden_state.mean(dim=1)

    # Calculate cosine similarity between the query and paraphrase embeddings
    # similarity_scores = util.dot_score(paraphrase_embedding, document_embedding)[0].cpu().tolist()
    similarity = []
    for j in range(len(document_embedding)):
        similarity.append(torch.nn.functional.cosine_similarity(document_embedding[j], paraphrase_embedding.squeeze(dim=1)))
    doc_score_title_pairs = list(zip([doc['text'] for doc in dataset[0]['profile']],
                                     similarity))
    
    # Sort by decreasing similarity score
    doc_score_title_pairs = sorted(doc_score_title_pairs, key=lambda x: x[1], reverse=True)

    # Calculate the number of documents to retrieve (top 25%)
    num_documents_to_retrieve = int(0.25 * len(doc_score_title_pairs))

    # Check if the number of documents to retrieve is greater than 15
    if num_documents_to_retrieve > 10:
        num_documents_to_retrieve = 10

    if num_documents_to_retrieve < 5:
        num_documents_to_retrieve = 5
        
    for text, score in doc_score_title_pairs[:num_documents_to_retrieve]:
        print(f"Original Score: {score}")
        print(f"Document: {text}\n")
        
        # Store relevant document, title, and score for each interpretation
        input_documents.append({'text': text})
        input_scores.append(score)
        
        print("="*50)
        
    # Store relevant documents and scores for each version
    all_relevant_documents.append(input_documents)
    all_relevant_scores.append(input_scores)


Processing Input Version 1: I'm currently enjoying the album "Listen to Eason Chan."
Original Score: tensor([0.9802])
Document: listening to eason's 2006 album .. What's going on...? This is my favourite eason album  it's 3.38am

Original Score: tensor([0.9796])
Document: SARS .. H1N1 .. Air France ..  please cherish your life, people ..

Original Score: tensor([0.9796])
Document: i am at interchange .. Just missed the bus 

Original Score: tensor([0.9794])
Document: it's friday !! And i just got on the bus .. Going to work later today again 

Original Score: tensor([0.9794])
Document: @waxyx informatics , do u know that? (via @waxyx)no I meant which school haha  I am in ntu

Original Score: tensor([0.9780])
Document: &quot;See ... You make the world go weird ...&quot; from weiwei's SMS 


Processing Input Version 2: The album "Listen to Eason Chan" is currently in my favorites list.
Original Score: tensor([0.9777])
Document: &quot;See ... You make the world go weird ...&quot; from we

In [17]:
# Calculate average score for each document across different interpretations
average_scores = {}
for documents, scores in zip(all_relevant_documents, all_relevant_scores):
    for doc_dict, score in zip(documents, scores):
        if isinstance(doc_dict, dict):  # Check if it's a dictionary
            doc_text = doc_dict.get('text', '')  # Use 'get' to provide a default value if 'text' is not present
            if doc_text:
                if doc_text not in average_scores:
                    average_scores[doc_text] = {'text': doc_text, 'scores': []}
                average_scores[doc_text]['scores'].append(score)

# Calculate average score for each document
average_documents = [{'text': details['text'], 'average_score': sum(details['scores']) / len(details['scores'])}for details in average_scores.values()]

# Sort documents based on average scores
sorted_documents = sorted(average_documents, key=lambda x: x['average_score'], reverse=True)

retrieval_docs = []

# Output top 10 documents based on average scores
print("\nTop  Documents :")
for doc_dict in sorted_documents[:10]:
    new_doc_dict = {'text': doc_dict['text']}
    retrieval_docs.append(new_doc_dict)
print(retrieval_docs)


Top  Documents :
[{'text': "@waxyx I don't know .. I wanted to restart it .. I switch it off and it won't turn on again "}, {'text': "addicted to twitter. Time to get out of bed. It's monday "}, {'text': "it's friday !! And i just got on the bus .. Going to work later today again "}, {'text': 'SARS .. H1N1 .. Air France ..  please cherish your life, people ..'}, {'text': 'i am at interchange .. Just missed the bus '}, {'text': '@waxyx informatics , do u know that? (via @waxyx)no I meant which school haha  I am in ntu'}, {'text': "listening to eason's 2006 album .. What's going on...? This is my favourite eason album  it's 3.38am"}, {'text': "&quot;See ... You make the world go weird ...&quot; from weiwei's SMS "}, {'text': 'Finished blogging .. continue to rate restaurants on Facebook .. I wanna get the trophy after rating 100 restaurants '}]


In [19]:
r_d = ''
count = 1
for i in retrieval_docs:
  # print(i)
  r_d += str(count) + '. '
  r_d += 'The text is: ' + i['text'] + '\n'
  count += 1
  # if count > 7:
  #   break

print(r_d)

1. The text is: @waxyx I don't know .. I wanted to restart it .. I switch it off and it won't turn on again 
2. The text is: addicted to twitter. Time to get out of bed. It's monday 
3. The text is: it's friday !! And i just got on the bus .. Going to work later today again 
4. The text is: SARS .. H1N1 .. Air France ..  please cherish your life, people ..
5. The text is: i am at interchange .. Just missed the bus 
6. The text is: @waxyx informatics , do u know that? (via @waxyx)no I meant which school haha  I am in ntu
7. The text is: listening to eason's 2006 album .. What's going on...? This is my favourite eason album  it's 3.38am
8. The text is: &quot;See ... You make the world go weird ...&quot; from weiwei's SMS 
9. The text is: Finished blogging .. continue to rate restaurants on Facebook .. I wanna get the trophy after rating 100 restaurants 



In [20]:
print(len(retrieval_docs))

9


In [21]:
pip install -U -q google-generativeai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
import google.generativeai as palm

import textwrap
import numpy as np
import pandas as pd

In [23]:
palm.configure(api_key='AIzaSyBVo_JbfzrPBpHbueQtOiRozzyFK1QK8D0')

In [26]:
# Ensure 'query' is defined with a meaningful value
query = """I'm currently enjoying the album "Listen to Eason Chan." """

In [33]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = textwrap.dedent("""You are a helpful and informative bot that answers questions using text from the reference passage included below.   
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. I'm providing you with some sample tweets written by me for 9 texts examples.
  Based on the context provided, can you paraphrase the query to capture my writing style. Make sure it looks like its written by me.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [34]:
prompt = make_prompt(query, passage)
print(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.   
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. I'm providing you with some sample tweets written by me for 9 texts examples.
  Based on the context provided, can you paraphrase the query to capture my writing style. Make sure it looks like its written by me.
  QUESTION: 'I'm currently enjoying the album "Listen to Eason Chan." '
  PASSAGE: '1. The text is: @waxyx I dont know .. I wanted to restart it .. I switch it off and it wont turn on again  2. The text is: addicted to twitter. Time to get out of bed. Its monday  3. The text is: its friday !! And i just got on the bus .. Going to work later today again  4. The text is: SARS .. H1N1 .. Air France ..  please cherish your life, people .. 5. The text is: i am at interchange .. Just missed the bus  6. The text is: @waxyx informatics , do u know that? (via 

In [36]:
text_models = [m for m in palm.list_models() if 'generateText' in m.supported_generation_methods]

text_model = text_models[0]

In [44]:
temperature = 0.6
answer = palm.generate_text(prompt=prompt,
                            model=text_model,
                            candidate_count=3,
                            temperature=temperature,
                            max_output_tokens=1000)

In [45]:
for i, candidate in enumerate(answer.candidates):
  print(f"Candidate {i+1}: {candidate['output']}\n")

Candidate 1: I'm currently enjoying the album "Listen to Eason Chan." 

Candidate 2: I am currently listening to the album "Listen to Eason Chan."

Candidate 3: I'm currently listening to the album "Listen to Eason Chan."

