In [17]:
import os
from dotenv import load_dotenv


from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.image import ImagePromptTemplate
from langchain.prompts import PromptTemplate
from langchain_core.prompt_values import ImageURL
from langchain_core.pydantic_v1 import BaseModel, Field, HttpUrl, validator
from langchain.output_parsers import PydanticOutputParser
import google.generativeai as genai

# Let python-dotenv populate the process environment (presumably from a local
# .env file) before the key is looked up.
load_dotenv()

# API key passed to the Google Generative AI clients in this notebook.
# Evaluates to None when the GOOGLE_API_KEY variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [18]:
# Hand the API key read from the environment to the google.generativeai module.
# Presumably required before the genai.list_models() call in the next cell.
genai.configure(api_key=GOOGLE_API_KEY)

In [19]:
# Walk through everything genai.list_models() yields and print each entry's
# metadata, one model per print call.
available_models = genai.list_models()
for model_info in available_models:
    print(model_info)

Model(name='models/chat-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 Chat (Legacy)',
      description='A legacy text-only model optimized for chat conversations',
      input_token_limit=4096,
      output_token_limit=1024,
      supported_generation_methods=['generateMessage', 'countMessageTokens'],
      temperature=0.25,
      top_p=0.95,
      top_k=40)
Model(name='models/text-bison-001',
      base_model_id='',
      version='001',
      display_name='PaLM 2 (Legacy)',
      description='A legacy model that understands text and generates text as an output',
      input_token_limit=8196,
      output_token_limit=1024,
      supported_generation_methods=['generateText', 'countTextTokens', 'createTunedTextModel'],
      temperature=0.7,
      top_p=0.95,
      top_k=40)
Model(name='models/embedding-gecko-001',
      base_model_id='',
      version='001',
      display_name='Embedding Gecko',
      description='Obtain a distributed representatio

In [20]:
# Structured-output schema for the aspect extraction chain: every field is a
# required list of tags describing one aspect of a room.
#
# NOTE: documented with comments rather than a class docstring on purpose.
# pydantic v1 copies a model docstring into the generated schema's
# "description", which would change the format instructions built from it.
#
# The `enum` keyword arguments are extra Field kwargs; presumably they only
# surface in the generated schema and are not enforced during validation.
class TagsForAspects(BaseModel):

    # Mood words for the room, e.g. taken from the user's request.
    room_sentiment: list[str] = Field(
        ...,
        description="The sentiments of the room the user wants",
    )

    # Colour terms associated with the room's atmosphere.
    room_color: list[str] = Field(
        ...,
        description="The overall atmospheres that depicted to a color",
    )

    # Floor area in square metres; the advertised enum covers 1..99.
    room_size: list[int] = Field(
        ...,
        description="room size expressed in square meters",
        enum=list(range(1, 100)),
    )

    # Kind of dwelling the room belongs to.
    housing_type: list[str] = Field(
        ...,
        description="The type of housing",
        enum=['studio', 'apartment', 'house'],
    )

    # Function of the room within the dwelling.
    room_type: list[str] = Field(
        ...,
        description="The type of room",
        enum=[
            'living room',
            'kitchen',
            'home office',
            'bedroom',
            'bathroom',
            'dining room',
            'office',
            'garage',
            'basement',
            'attic',
            'laundry room',
            'pantry',
            'family room',
            'foyer',
        ],
    )

class SentenceToAspect:
    """Turn a free-text room description into a ``TagsForAspects`` object.

    The instance wires together three pieces:

    * ``self.prompt`` -- a ``PromptTemplate`` built from ``PROMPT_TEMPLATE``,
      with the parser's format instructions pre-filled;
    * ``self.llm`` -- the ``gemini-pro`` chat model at temperature 0;
    * ``self.parser`` -- a ``PydanticOutputParser`` targeting ``TagsForAspects``.

    ``query`` pipes them together and returns whatever the parser produces.
    """

    # Prompt text sent to the model. ``{format_instructions}`` is filled once
    # from the parser in ``__init__``; ``{query}`` is filled on every call.
    #
    # Rule 3 previously contained a double negative ("If you don't think that
    # there is no appropriate words ...") which literally asked for 'none' when
    # suitable words DO exist. It also requested the string 'none' for every
    # aspect, which cannot be parsed into ``room_size: list[int]``; numeric
    # aspects are therefore asked to fall back to an empty list instead.
    PROMPT_TEMPLATE = """Answer the user query. \n {format_instructions}\n{query}\n
            
            1. You must extract the formatted aspect from each word or keyword within query sentence, rather than the sentence as a whole.
            2. Please concentrate metric information if it is considerated in the original human message.
            3. If there are no appropriate words for an aspect, you must return the word 'none' for that aspect; for numeric aspects such as room_size, return an empty list instead.
            """

    def __init__(self):
        """Create the chat model, the output parser and the prompt template."""
        # Module-level key; kept on the instance so subclasses can reuse it.
        self.GOOGLE_API_KEY = GOOGLE_API_KEY
        self.llm = ChatGoogleGenerativeAI(
            model='gemini-pro',
            google_api_key=self.GOOGLE_API_KEY,
            temperature=0,
        )

        self.parser = PydanticOutputParser(pydantic_object=TagsForAspects)
        self.prompt = PromptTemplate(
            template=self.PROMPT_TEMPLATE,
            input_variables=["query"],
            partial_variables={
                "format_instructions": self.parser.get_format_instructions()
            },
        )

    def query(self, query_sentence: str) -> "TagsForAspects":
        """Run ``query_sentence`` through prompt -> model -> parser.

        Args:
            query_sentence: Free text to extract room aspects from.

        Returns:
            The parser's output for the model response (a ``TagsForAspects``
            instance when the default parser is in place).
        """
        chain = self.prompt | self.llm | self.parser
        return chain.invoke({"query": query_sentence})

class ImageToAspect(SentenceToAspect):
    """Extract room aspects from an image by describing it in text first.

    ``query_img`` sends the image together with ``IMAGE_DESCRIPTION_PROMPT``
    to the ``gemini-pro-vision`` model (``self.lmm``), then passes the
    resulting text to the inherited ``query`` method.
    """

    # Instruction sent alongside the image; it asks for the same aspects
    # (sentiment, colours, size, housing type, room type) that the text
    # extraction step later pulls out of the description.
    IMAGE_DESCRIPTION_PROMPT = """Here is how you must depict an image of a room in English, making sure to include the following elements:
                 
                 1. Include at least one word that conveys the sentiment of the room. If there are multiple sentiments that describe the image, feel free to include all that apply.
                 2. Describe the overall color scheme of the room using at least two color terms.
                 3. Estimate the size of the room in square meters and provide the figure numerically.
                 4. Determine whether the room belongs to an apartment, studio, or house, and specify which one.
                 5. Guess which part of the house the room might be, such as the living room, bedroom, etc."""

    def __init__(self):
        """Set up the text pipeline, then add the vision model."""
        # The parent initialiser already stores self.GOOGLE_API_KEY.
        super().__init__()
        self.lmm = ChatGoogleGenerativeAI(
            model='gemini-pro-vision',
            google_api_key=self.GOOGLE_API_KEY,
        )

    def query_img(self, url: str = None):
        """Describe the image at ``url`` and extract aspects from the description.

        Args:
            url: Location of the room image. Although the parameter has a
                default of ``None`` for backward compatibility, a value is
                required.

        Returns:
            The result of ``self.query`` applied to the vision model's text.

        Raises:
            ValueError: If ``url`` is ``None`` or empty. Raised before any
                request is made, instead of forwarding an unusable image
                reference to the model.
        """
        if not url:
            raise ValueError("query_img() requires a non-empty image url")

        # Kept on the instance (as before) so the last request/response can be
        # inspected after the call.
        self.message = HumanMessage(
            content=[
                {'type': 'text', 'text': self.IMAGE_DESCRIPTION_PROMPT},
                {'type': 'image_url', 'image_url': url},
            ]
        )

        self.return_sentence_img = self.lmm.invoke([self.message])
        return self.query(query_sentence=self.return_sentence_img.content)


In [21]:
# Image used to exercise the image -> description -> aspects pipeline.
SAMPLE_ROOM_IMAGE_URL = 'https://image.ohou.se/i/bucketplace-v2-development/uploads/cards/snapshots/169087905318351049.jpeg'

# Build the extractor (this constructs both chat model clients) and run it once.
ita = ImageToAspect()
output_ita = ita.query_img(url=SAMPLE_ROOM_IMAGE_URL)


In [22]:
# Bare expression so the notebook renders the value returned by query_img above.
output_ita

TagsForAspects(room_sentiment=['warm', 'inviting'], room_color=['white', 'light gray', 'brown'], room_size=[25], housing_type=['apartment'], room_type=['living room'])