In [2]:
import numpy 
import pandas as pd
import torch
from transformers import OPTForCausalLM, GPT2Tokenizer
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer,util


In [3]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
def embedding(qs):
    em =model.encode(qs)
    return em

In [5]:
def similarity(query1, query2):
    sim= util.cos_sim(query1, query2)
    return sim

In [6]:
def save_to_txt(embedding, filename):
    with open(filename,'w') as f:
        for value in embedding:
            f.write(str(value)+ ',')

In [15]:
def prepare_query(corpus,se,qs):
    em = embedding(qs)
    q_sim_score= similarity(se,em)
    top_5=sorted(q_sim_score,reverse=True)[0:5]
    data = {}
    for sentence, score in zip(corpus,top_5):
        # data.append([sentence, score])
         data[sentence]=score
    # df = pd.DataFrame(data, columns=['Sentence', 'Similarity Score'])
    
    return data

In [32]:
def prepare_prompt(top_sent,qs):
    top_sentences =list(x.keys())
    prompt =  qs +"\n".join(top_sentences) + "\n"
    return prompt

In [33]:
context=[" Generally speaking, image content may include both visual and semantic content.",
"Visual content can be very general or domain specific. General visual content include color, texture, shape, spatial relationship, etc.",
" Domain specific visual content, like human faces, is application dependent and may involve domain knowledge.",
"Semantic content is obtained either by textual annotation or by complex inference procedures based on visual content.", 
"This chapter concentrates on general visual contents descriptions.",
"Later chapters discuss domain specific and semantic contents.",
"A good visual content descriptor should be invariant to the accidental variance introduced by the imaging process (e.g., the variation of the illuminant of the scene).However, there is a tradeoff between the invariance and the discriminative power of visual features, since a very wide class of invariance loses the ability to discriminate between essential differences.",
"Invariant description has been largely investigated in computer vision (like object recognition), but is relatively new in image retrieval [8].",
"A visual content descriptor can be either global or local.",
"A global descriptor uses the visual features of the whole image, whereas a local descriptor uses the visual features of regions or objects to describe the image content."]

In [34]:
context

[' Generally speaking, image content may include both visual and semantic content.',
 'Visual content can be very general or domain specific. General visual content include color, texture, shape, spatial relationship, etc.',
 ' Domain specific visual content, like human faces, is application dependent and may involve domain knowledge.',
 'Semantic content is obtained either by textual annotation or by complex inference procedures based on visual content.',
 'This chapter concentrates on general visual contents descriptions.',
 'Later chapters discuss domain specific and semantic contents.',
 'A good visual content descriptor should be invariant to the accidental variance introduced by the imaging process (e.g., the variation of the illuminant of the scene).However, there is a tradeoff between the invariance and the discriminative power of visual features, since a very wide class of invariance loses the ability to discriminate between essential differences.',
 'Invariant description has

In [35]:
s1 = embedding(context)
s1.shape

(10, 384)

In [36]:
s1

array([[ 0.1276735 , -0.03150531,  0.02126714, ...,  0.01614668,
         0.07686063, -0.07784723],
       [ 0.09080701, -0.06885422,  0.0145974 , ...,  0.05979102,
         0.03004542, -0.03163181],
       [ 0.01572219, -0.06374924,  0.00195766, ...,  0.04744347,
         0.04895998,  0.02553334],
       ...,
       [-0.01173849,  0.06075974,  0.02245931, ...,  0.07059798,
        -0.00260606, -0.09241313],
       [ 0.02216985, -0.02991777, -0.00841484, ...,  0.0925092 ,
         0.02544158, -0.0117879 ],
       [ 0.01890386,  0.0006501 ,  0.05565466, ...,  0.07672178,
        -0.02809634, -0.02797282]], dtype=float32)

In [18]:
save_to_txt(s1,'embedding.txt')

In [37]:
queries = ["What are the components of general visual content in an image?",]

In [38]:
queries

['What are the components of general visual content in an image?']

In [39]:
x= prepare_query(context,s1,queries)

In [40]:
x

{' Generally speaking, image content may include both visual and semantic content.': tensor([0.7373]),
 'Visual content can be very general or domain specific. General visual content include color, texture, shape, spatial relationship, etc.': tensor([0.6963]),
 ' Domain specific visual content, like human faces, is application dependent and may involve domain knowledge.': tensor([0.5377]),
 'Semantic content is obtained either by textual annotation or by complex inference procedures based on visual content.': tensor([0.5344]),
 'This chapter concentrates on general visual contents descriptions.': tensor([0.4998])}

In [41]:
list(x.keys())

[' Generally speaking, image content may include both visual and semantic content.',
 'Visual content can be very general or domain specific. General visual content include color, texture, shape, spatial relationship, etc.',
 ' Domain specific visual content, like human faces, is application dependent and may involve domain knowledge.',
 'Semantic content is obtained either by textual annotation or by complex inference procedures based on visual content.',
 'This chapter concentrates on general visual contents descriptions.']

In [42]:
query = "What are the components of general visual content in an image?"

In [43]:
prompt  =prepare_prompt(x,query)

In [44]:
prompt

'What are the components of general visual content in an image? Generally speaking, image content may include both visual and semantic content.\nVisual content can be very general or domain specific. General visual content include color, texture, shape, spatial relationship, etc.\n Domain specific visual content, like human faces, is application dependent and may involve domain knowledge.\nSemantic content is obtained either by textual annotation or by complex inference procedures based on visual content.\nThis chapter concentrates on general visual contents descriptions.\n'

In [47]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")


input_ids = tokenizer(prompt, return_tensors="pt").input_ids

gen_tokens = model.generate(
    input_ids,
    do_sample=True,
    temperature=0.3,
    max_length=100,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [48]:
print(gen_text)

What are the components of general visual content in an image? Generally speaking, image content may include both visual and semantic content.
Visual content can be very general or domain specific. General visual content include color, texture, shape, spatial relationship, etc.
 Domain specific visual content, like human faces, is application dependent and may involve domain knowledge.
Semantic content is obtained either by textual annotation or by complex inference procedures based on visual content.
This chapter concentrates on general visual contents descriptions.

