### Dependency Setup

In [4]:
%%capture
!pip install langchain
!pip install pypdf
!pip install pinecone-client
!pip install tiktoken

### Imports

In [5]:
import os
import json
import openai
import pinecone

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
# The module is spelt `question_answering`; the misspelt path fails on import.
from langchain.chains.question_answering import load_qa_chain

### Helper Functions

In [6]:
# Root folder that holds config.json, name_mapper.json and movie_transcripts.pdf.
# It must be defined here: the helper functions below read `c_root` as a global,
# so leaving it commented out makes a fresh-kernel run fail with NameError.
# NOTE(review): the path is relative, so it resolves against the kernel's working
# directory — confirm it matches where the Drive folder is actually mounted.
c_root = 'content/drive/MyDrive/Jarvis'

In [7]:
def load_config():
    """Read ``config.json`` under ``c_root`` and apply its OpenAI settings.

    The values stored under ``conf_openai_api_key0``, ``conf_openai_api_type``,
    ``conf_openai_api_vers`` and ``conf_openai_api_base`` are assigned to
    ``openai.api_key``, ``openai.api_type``, ``openai.api_version`` and
    ``openai.api_base`` respectively (a module-level side effect).

    Returns:
        The parsed JSON content of the configuration file.

    Raises:
        OSError: if the configuration file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if any of the four keys listed above is missing.
    """
    # The context manager closes the file deterministically instead of
    # leaving the handle open until garbage collection.
    with open(f'{c_root}/config.json') as config_file:
        config = json.load(config_file)

    openai.api_key = config['conf_openai_api_key0']
    openai.api_type = config['conf_openai_api_type']
    openai.api_version = config['conf_openai_api_vers']
    openai.api_base = config['conf_openai_api_base']
    return config


def get_data():
    """Load ``movie_transcripts.pdf`` from ``c_root`` with ``PyPDFLoader``.

    Returns:
        Whatever ``PyPDFLoader.load()`` produces for that file.
    """
    pdf_path = f'{c_root}/movie_transcripts.pdf'
    return PyPDFLoader(pdf_path).load()


def get_and_preprocess_data():
    """Fetch the documents via ``get_data()`` and split them into chunks.

    Splitting is done by a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=2000`` and ``chunk_overlap=100``.

    Returns:
        The result of ``split_documents`` applied to the loaded documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
    )
    raw_documents = get_data()
    return splitter.split_documents(raw_documents)


def create_qa_chain(conf, texts):
    """Index the chunk texts in Pinecone and build a "stuff" QA chain.

    Args:
        conf: Mapping read with the keys ``openai_api_key0``,
            ``pinecone_api_key0``, ``pinecone_env`` and ``pinecone_idx``.
        texts: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        tuple: ``(docsearch, chain)`` — the store returned by
        ``Pinecone.from_texts`` and the chain returned by ``load_qa_chain``.
    """
    # Only the text of each chunk is sent to the index.
    chunk_strings = []
    for chunk in texts:
        chunk_strings.append(chunk.page_content)

    openai_key = conf['openai_api_key0']
    embedder = OpenAIEmbeddings(openai_api_key=openai_key)

    # The Pinecone client is initialised before the index is written to.
    pinecone.init(
        api_key=conf['pinecone_api_key0'],
        environment=conf['pinecone_env'],
    )
    vector_index = Pinecone.from_texts(
        chunk_strings,
        embedder,
        index_name=conf['pinecone_idx'],
    )

    language_model = OpenAI(temperature=0, openai_api_key=openai_key)
    answer_chain = load_qa_chain(language_model, chain_type="stuff")
    return vector_index, answer_chain


def get_response(docsearch, chain, query):
    """Answer ``query`` from the documents that ``docsearch`` returns for it.

    Args:
        docsearch: Object providing ``similarity_search(query)``.
        chain: Object providing ``run(input_documents=..., question=...)``.
        query: The question, passed to both calls unchanged.

    Returns:
        The value returned by ``chain.run``.
    """
    matching_docs = docsearch.similarity_search(query)
    return chain.run(input_documents=matching_docs, question=query)


def preprocess_questions(queries, mapper=None):
    """Rewrite each query by substituting its words through a lookup table.

    Every query is split on whitespace; each word that is a key of the lookup
    table is replaced by the mapped value, other words are kept, and the words
    are re-joined with single spaces.

    Args:
        queries: Iterable of query strings.
        mapper: Optional word -> replacement mapping. When ``None`` (the
            default) it is loaded from ``name_mapper.json`` under ``c_root``.

    Returns:
        list: One processed string per input query, in the same order.

    Raises:
        OSError: if ``mapper`` is ``None`` and the JSON file cannot be opened.
        json.JSONDecodeError: if that file is not valid JSON.
    """
    if mapper is None:
        # The context manager closes the file deterministically instead of
        # leaving the handle open until garbage collection.
        with open(f'{c_root}/name_mapper.json') as mapper_file:
            mapper = json.load(mapper_file)

    # Lookup is per whitespace-separated token, so a word with attached
    # punctuation (e.g. "Stark?") only matches a key spelt the same way.
    return [
        ' '.join(mapper.get(word, word) for word in query.split())
        for query in queries
    ]



### Driver

In [8]:
# Load the configuration (this also sets the module-level OpenAI settings).
conf_params = load_config()

# Read the PDF and cut it into chunks.
split_text = get_and_preprocess_data()

# Index the chunks and build the question-answering chain.
searchable_docs, qa_chain = create_qa_chain(conf=conf_params, texts=split_text)

In [9]:
# Questions to put to the indexed transcript.
questions = [
    "Who is the father of Tony Stark?",
    "What are the other names of Tony Stark?",
    "What is the soundtrack playing in the background, when Tony Stark was kidnapped by the Ten Rings?",
]

# Apply the word substitutions from the name mapper before querying.
processed_questions = preprocess_questions(questions)

# Answer each processed question and print it next to its answer.
for q in processed_questions:
    a = get_response(searchable_docs, qa_chain, q)
    print(f'\nQuestion : {q}\nAnswer : {a}\n')


Question : Who is the father of Tony Stark?
Answer : Howard Stark was the father of Tony Stark.


Question : What are the other names of Tony Stark?
Answer : Anthony Edward Stark, Tony, Iron Man, Merchant of Death are some of the other names of Tony Stark.


Question : What is the soundtrack playing in the background, when Tony Stark was kidnapped by the Ten Rings?
Answer : Back in Black by AC/DC was playing in the background, when Tony Stark rode in the fun-vee.


