In [None]:
!pip install --upgrade --quiet langchain langchain-community langchain-openai langchain-pinecone
!pip install --upgrade --quiet lark
!pip install --upgrade --quiet install "ipywidgets>=7,<8"

In [None]:
def loaddocs(docs, vectorstore):
    """Read every CSV in the working directory, normalise it and index it.

    Each loaded document's ``page_content`` is parsed as ``"column: value"``
    lines. The ``Overview`` and ``Keywords`` columns become the new
    ``page_content``; the columns listed in ``metadata_field_info`` become the
    document metadata (list-like columns are split on commas, numeric columns
    are converted to ``int``). The documents are then uploaded with
    ``index(..., cleanup="full")``.

    Args:
        docs: Ignored. Kept only so existing call sites keep working; the
            documents are always re-read from ``./*.csv``.
        vectorstore: Vector store handed to ``index`` as the upload target.

    Returns:
        Whatever ``index`` returns (the original implementation discarded it).

    Notes:
        Reads the module-level ``pinecone_index_name`` to build the record
        manager namespace, so that name must be defined before calling.
    """
    loader = DirectoryLoader(
        path="./",
        glob="*.csv",
        loader_cls=CSVLoader,
        show_progress=True,
    )
    # Deliberately a new name: the `docs` argument is not used.
    loaded_docs = loader.load()

    metadata_field_info = [
        AttributeInfo(
            name="Title", description="The title of the movie", type="string"),
        AttributeInfo(name="Runtime (minutes)",
                      description="The runtime of the movie in minutes", type="integer"),
        AttributeInfo(name="Language",
                      description="The language of the movie", type="string"),
        AttributeInfo(name="Release Year",
                      description="The release year of the movie as an integer", type="integer"),
        AttributeInfo(name="Genre", description="The genre of the movie",
                      type="string or list[string]"),
        AttributeInfo(name="Actors", description="The actors in the movie",
                      type="string or list[string]"),
        AttributeInfo(name="Directors", description="The directors of the movie",
                      type="string or list[string]"),
        AttributeInfo(name="Production Companies",
                      description="The production companies of the movie", type="string or list[string]"),
    ]

    # 'Stream', 'Buy' and 'Rent' are currently no-ops: metadata is built only
    # from metadata_field_info, which does not contain them.
    fields_to_convert_list = {'Genre', 'Actors', 'Directors',
                              'Production Companies', 'Stream', 'Buy', 'Rent'}
    fields_to_convert_int = {'Runtime (minutes)', 'Release Year'}

    def split_to_list(raw):
        """Split a comma-separated string, dropping empty entries."""
        return [item.strip() for item in raw.split(',') if item.strip()]

    def parse_int(raw):
        """Return `raw` as an int, or None when it is not numeric."""
        text = raw.strip()
        try:
            return int(text)
        except ValueError:
            pass
        try:
            # Accept values such as "120.0".
            return int(float(text))
        except (ValueError, OverflowError):
            return None

    for doc in loaded_docs:
        # Parse the "column: value" lines of page_content into a dictionary.
        row = dict(line.split(": ", 1)
                   for line in doc.page_content.split("\n") if ": " in line)

        # A missing column yields '' instead of None so the concatenation
        # cannot raise TypeError.
        overview = row.get('Overview') or ''
        keywords = row.get('Keywords') or ''
        doc.page_content = 'Overview: ' + overview + '. Keywords: ' + keywords

        metadata = {}
        for field in metadata_field_info:
            raw = row.get(field.name)
            if raw is None:
                # Column absent from this row: leave the key out rather than
                # storing a None value in the metadata.
                continue
            if field.name in fields_to_convert_list:
                value = split_to_list(raw)
                if not value:
                    continue
            elif field.name in fields_to_convert_int:
                value = parse_int(raw)
                if value is None:
                    # Unparseable number: drop the field instead of aborting
                    # the whole load.
                    continue
            else:
                value = raw
            metadata[field.name] = value
        doc.metadata = metadata

    # Create record manager
    namespace = f"pinecone/{pinecone_index_name}"
    record_manager = SQLRecordManager(
        namespace, db_url="sqlite:///record_manager_cache.sql"
    )
    record_manager.create_schema()

    # Upload documents to pinecone
    return index(loaded_docs, record_manager, vectorstore, cleanup="full")

# NOTE: `chat`, `pinecone_index_name` and the imports used inside loaddocs are
# defined in the next cell, so that cell must be executed before this call.
loaddocs(chat.docs, chat.vectorstore)

In [None]:
# Standard library
import os

# Pinecone
from pinecone import Pinecone

# Langchain
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chains.query_constructor.base import (StructuredQueryOutputParser,get_query_constructor_prompt,AttributeInfo)

from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers.self_query.pinecone import PineconeTranslator
from langchain_openai import (ChatOpenAI,OpenAIEmbeddings)

from langchain_pinecone import PineconeVectorStore
from langchain.indexes import SQLRecordManager, index
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

from langchain_core.output_parsers import StrOutputParser

#Widgets
from ipywidgets import widgets
from IPython.display import display

class Agent:
    """Film-recommendation assistant built on a self-querying retriever.

    Constructing an instance builds, in order: the query-constructor prompt,
    the Pinecone-backed vector store, the self-query retriever, and the
    retrieval-augmented chat chain that `ask` streams from.
    """

    # Chat model that turns a user request into a structured query.
    RETRIEVER_MODEL_NAME = 'gpt-4o' ##"gpt-4o" / 'gpt-3.5-turbo-0125'
    # Chat model that writes the final recommendation text.
    SUMMARY_MODEL_NAME = 'gpt-4o'
    # Embedding model used by the vector store.
    EMBEDDING_MODEL_NAME = 'text-embedding-ada-002'
    # Temperature passed to both chat models.
    MODEL_TEMPERATURE = 0.2
    # Value of `k` passed to the retriever's search_kwargs.
    RETRIEVER_TOP_K = 15

    # Set by the initialize_* methods; None until they have run.
    constructor_prompt = None
    vectorstore = None
    retriever = None
    rag_chain_with_source = None
    # Never assigned inside this class.
    docs = None

    def __init__(self, openai_api_key, pinecone_api_key, pinecone_index_name):
        """Build every component needed by `ask`.

        Args:
            openai_api_key: API key passed to the OpenAI chat and embedding models.
            pinecone_api_key: API key passed to the Pinecone client.
            pinecone_index_name: Name of the Pinecone index to query.
        """
        # Order matters: the retriever needs the prompt and the vector store,
        # and the chat chain needs the retriever.
        self.initialize_query_constructor()
        self.initialize_vector_store(
            openai_api_key, pinecone_api_key, pinecone_index_name)
        self.initialize_retriever(openai_api_key)
        self.initialize_chat_model(openai_api_key)

##QUERY BUILDER      ---------------------------------------------------------------------------------------

    def initialize_query_constructor(self):
        """Build the few-shot prompt used to create structured queries.

        Stores the result of `get_query_constructor_prompt` in
        `self.constructor_prompt`.
        """
        document_content_description = "Brief overview of a movie, along with keywords"

        # Define allowed comparators list
        # NOTE(review): these names carry a `$` prefix while the example
        # filters below use un-prefixed names (eq, gt, nin, ...) - confirm
        # which spelling the prompt is meant to advertise.
        allowed_comparators = [
            "$eq",  # Equal to (number, string, boolean)
            "$ne",  # Not equal to (number, string, boolean)
            "$gt",  # Greater than (number)
            "$gte",  # Greater than or equal to (number)
            "$lt",  # Less than (number)
            "$lte",  # Less than or equal to (number)
            "$in",  # In array (string or number)
            "$nin",  # Not in array (string or number)
            "$exists",  # Has the specified metadata field (boolean)
        ]

        # (user request, expected structured query) pairs shown to the model.
        examples = [
            ("I'm looking for a sci-fi comedy released after 2021.",
                {
                    "query": "sci-fi comedy",
                    "filter": "and(eq('Genre', 'Science Fiction'), eq('Genre', 'Comedy'), gt('Release Year', 2021))",
                },),
            ("Show me critically acclaimed dramas without Tom Hanks.",
                {
                    "query": "critically acclaimed drama",
                    "filter": "and(eq('Genre', 'Drama'), nin('Actors', ['Tom Hanks']))",
                },),
            ("Recommend some films by Yorgos Lanthimos.",
                {
                    "query": "Yorgos Lanthimos",
                    # Fixed: the closing quote used to sit after the bracket,
                    # which made this example a malformed filter.
                    "filter": 'in("Directors", ["Yorgos Lanthimos"])',
                },),
            ("Films similar to Yorgos Lanthmios movies.",
                {
                    "query": "Dark comedy, absurd, Greek Weird Wave",
                    "filter": 'NO_FILTER',
                },),
            ("Find me thrillers with a strong female lead released between 2015 and 2020.",
                {
                    "query": "thriller strong female lead",
                    # Fixed: both ends of the range are now inclusive; the
                    # lower bound used gt(2015), which excluded 2015.
                    "filter": "and(eq('Genre', 'Thriller'), gte('Release Year', 2015), lte('Release Year', 2020))",
                },),
            ("Find me highly rated drama movies in English that are less than 2 hours long",
                {
                    "query": "Highly rated drama English under 2 hours",
                    "filter": 'and(eq("Genre", "Drama"), eq("Language", "English"), lt("Runtime (minutes)", 120))',
                },),
            ("Find me popular movies in English that are less than 2 hours long",
                {
                    "query": "English under 2 hours",
                    "filter": 'and(eq("Language", "English"), lt("Runtime (minutes)", 120))',
                },),
        ]

        metadata_field_info = [
            AttributeInfo(name="Title", description="The title of the movie", type="string"),
            AttributeInfo(name="Genre", description="The genre of the movie", type="string or list[string]"),
            AttributeInfo(name="Runtime (minutes)", description="The runtime of the movie in minutes", type="integer"),
            AttributeInfo(name="Language", description="The language of the movie", type="string"),
            AttributeInfo(name="Release Year", description="The release year of the movie", type="integer"),
            AttributeInfo(name="Actors", description="The actors in the movie", type="string or list[string]"),
            AttributeInfo(name="Directors", description="The directors of the movie", type="string or list[string]"),
            AttributeInfo(name="Production Companies", description="The production companies of the movie", type="string or list[string]"),
        ]

        self.constructor_prompt = get_query_constructor_prompt(
            document_content_description,
            metadata_field_info,
            allowed_comparators=allowed_comparators,
            examples=examples,
        )

##VectorStore & Retriever      ---------------------------------------------------------------------------------------

    def initialize_vector_store(self, open_ai_key, pinecone_api_key, pinecone_index_name):
        """Connect to the Pinecone index and wrap it in a LangChain vector store.

        Args:
            open_ai_key: API key for the OpenAI embedding model.
            pinecone_api_key: API key for the Pinecone client.
            pinecone_index_name: Name of the index to open.
        """
        pc = Pinecone(api_key=pinecone_api_key)

        # Target the index by name (no status check is performed here).
        pc_index = pc.Index(pinecone_index_name)

        embeddings = OpenAIEmbeddings(
            model=self.EMBEDDING_MODEL_NAME, api_key=open_ai_key)
        self.vectorstore = PineconeVectorStore(pc_index, embeddings)

    def initialize_retriever(self, open_ai_key):
        """Create the self-query retriever and store it in `self.retriever`.

        Requires `self.constructor_prompt` and `self.vectorstore` to be set.

        Args:
            open_ai_key: API key for the query-construction chat model.
        """
        query_model = ChatOpenAI(
            model=self.RETRIEVER_MODEL_NAME,
            temperature=self.MODEL_TEMPERATURE,
            api_key=open_ai_key
        )

        output_parser = StructuredQueryOutputParser.from_components()
        # prompt -> chat model -> parser
        query_constructor = self.constructor_prompt | query_model | output_parser

        self.retriever = SelfQueryRetriever(
            query_constructor=query_constructor,
            vectorstore=self.vectorstore,
            structured_query_translator=PineconeTranslator(),
            search_kwargs={'k': self.RETRIEVER_TOP_K}
        )

##AI chatbot     ---------------------------------------------------------------------------------------

    def initialize_chat_model(self, open_ai_key):
        """Build the chain used by `ask` and store it in `self.rag_chain_with_source`.

        Requires `self.retriever` to be set.

        Args:
            open_ai_key: API key for the summarising chat model.
        """
        def format_docs(docs):
            # One text block per document: its content followed by its metadata.
            return "\n\n".join(f"{doc.page_content}\n\nMetadata: {doc.metadata}" for doc in docs)

        chat_model = ChatOpenAI(
            model=self.SUMMARY_MODEL_NAME,
            temperature=self.MODEL_TEMPERATURE,
            api_key=open_ai_key
        )

        prompt = ChatPromptTemplate.from_messages(
            [
                (
                    'system',
                    """
                    Your goal is to recommend films to users based on their query and the retrieved context.
                    If none of the retrieved films seem relevant, do not recommend films.
                    Aim for three to five film recommendations, as long as the films are relevant.
                    You cannot recommend more than five films.
                    Your recommendation should be relevant, original, and at least two to three sentences long.

                    YOU CANNOT RECOMMEND A FILM IF IT DOES NOT APPEAR IN YOUR
                    CONTEXT.

                    # TEMPLATE FOR OUTPUT
                    Intro text

                    - **Title of Film**:
                        - Runtime:
                        - Release Year:
                        - (Your reasoning for recommending this film.)

                    Question: {question}
                    Context: {context}
                    """
                ),
            ]
        )

        # Create a chatbot Question & Answer chain from the retriever.
        # The "context" entry is replaced by its formatted text before the
        # prompt is filled in.
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                context=(lambda x: format_docs(x["context"])))
            | prompt
            | chat_model
            | StrOutputParser()
        )

        # The incoming query goes to the retriever ("context") and is passed
        # through unchanged ("question"); "answer" is added from those two.
        self.rag_chain_with_source = RunnableParallel(
            {"context": self.retriever, "question": RunnablePassthrough()}
        ).assign(answer=rag_chain_from_docs)

    def ask(self, query: str) -> dict:
        """Stream the chain's output for `query` to stdout and return it.

        Every streamed chunk is iterated as a mapping. The first time a key
        appears it is printed under a ``key:`` heading; consecutive chunks for
        the same key are printed as a continuation.

        Args:
            query: The user's request, passed to the chain as-is.

        Returns:
            A dict with one entry per streamed key. Values of repeated keys
            are combined with ``+`` (assumes the values for one key support
            concatenation, e.g. str or list).
        """
        output = {}
        curr_key = None
        for chunk in self.rag_chain_with_source.stream(query):
            for key in chunk:
                if key in output:
                    output[key] += chunk[key]
                else:
                    output[key] = chunk[key]

                if key != curr_key:
                    print(f"\n\n{key}: {chunk[key]}", end="", flush=True)
                else:
                    print(chunk[key], end="", flush=True)
                curr_key = key
        return output

# Credentials and the index name are read from the environment so that no
# secret is ever saved inside the notebook. A missing variable raises KeyError
# naming the variable that needs to be set.
openai_api_key = os.environ["OPENAI_API_KEY"]
pinecone_api_key = os.environ["PINECONE_API_KEY"]
pinecone_index_name = os.environ["PINECONE_INDEX_NAME"]

# Building the agent opens the Pinecone index and assembles the retrieval chain.
chat = Agent(openai_api_key, pinecone_api_key, pinecone_index_name)

##GUI      ---------------------------------------------------------------------------------------

# Free-text box for the user's request. It is named `query_box` instead of
# `input` so the built-in input() is not shadowed for the rest of the notebook.
query_box = widgets.Textarea(value='')
submit_button = widgets.Button(description='Submit text', button_style='success')

form_item_layout = widgets.Layout(
    display='flex',
    flex_flow='row',
    justify_content='space-between'
)

form_items = [
    widgets.Box([query_box], layout=form_item_layout),
    widgets.Box([submit_button], layout=form_item_layout),
]

form = widgets.Box(form_items)


def on_button_clicked(b):
    """Answer the text currently in the query box, then clear the box.

    Streams the recommendation via `chat.ask`, then prints the structured
    query the retriever's query constructor produces for the same text.

    Args:
        b: The clicked button (supplied by ipywidgets; unused).
    """
    query = query_box.value
    if not query.strip():
        # Nothing to ask: skip the two model calls below.
        return
    chat.ask(query)
    print('\n')
    # Second model call, made only to show how the request was interpreted.
    print(chat.retriever.query_constructor.invoke(query))
    query_box.value = ''


submit_button.on_click(on_button_clicked)

In [None]:
##    Recommend Sci-fi after 2000.
##    Films by Tom Hanks.
##    Films from Christopher Nolan.
##    Show me critically acclaimed dramas without Tom Hanks.
##    Recommend movies between 2015 and 2020.
##    I liked ‘Everything Everywhere All at Once’. Give me a similar film, but darker.

In [None]:
# Render the query form; `form` and its submit handler come from the cell that
# defines the Agent class and the widgets, which must have been run first.
display(form)