In [1]:
import sys
# Put the parent directory on the module search path. The path is relative, so it
# is resolved against the kernel's current working directory. Presumably this is
# what lets the project-local `retrieval` and `schema` packages imported further
# down resolve -- TODO confirm the notebook is always started from its own folder.
sys.path.append("..") # Adds higher directory to python module path

In [2]:
from dotenv import find_dotenv, load_dotenv

# Actually load the .env file: the two helpers were imported but never called, so
# variables kept in a .env file (e.g. the OpenAI API key that the OpenAI clients
# created below read from the environment) were never exported.
# find_dotenv() walks up from the current directory looking for a .env file;
# load_dotenv() is a no-op when none is found and, by default, does not override
# variables that are already set in the environment.
load_dotenv(find_dotenv())

In [3]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
# Directory that holds the Chroma database persisted on disk.
persist_directory = 'tempDB'

# Embedding function handed to the vector store (text-embedding-ada-002 according
# to the original note). Both names are bound to the same object.
embeddings = OpenAIEmbeddings()
embedding = embeddings

# Re-open the persisted database from disk so it can be queried as usual.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)



In [4]:
# Expose the vector store as a retriever -- this is what the QA chain queries.
# k is the number of chunks returned per query, i.e. all of the {context} the LLM
# gets to see; setting it too small hurts answer quality.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 3},  # return exactly the 3 most similar chunks
    search_type='similarity',
)

In [5]:
from langchain.callbacks import get_openai_callback # to get the cost
from langchain.chains import RetrievalQA 
from langchain.chat_models import ChatOpenAI
from langchain import PromptTemplate
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
import pandas as pd

# import util functions
from retrieval.format_util import process_llm_response, add_source_into_cell, add_source_into_another_column

# import pydantic object
from langchain.output_parsers import PydanticOutputParser
from schema.structure_schema_util import ProgramBaseInfo, ApplicationPeriod, RequiredDocument, RequiredLanguage, RequiredAcademicPerf, UrlLinks


In [6]:
# Chat model shared by every query in this notebook.
# Model reference: https://platform.openai.com/docs/models/gpt-3-5
chat = ChatOpenAI(
    model_name='gpt-3.5-turbo-0613',
    temperature=0,
)

In [7]:
# wrapper function
def retreival_qa_chain_wrapper(university_name, program_name, pydanticObject, retriever, chat_model, query, verbose=False):
    '''
    Run a single RetrievalQA query and parse the answer into a plain dict.

    The function builds a chat prompt (system + human message) whose format
    instructions come from a PydanticOutputParser for `pydanticObject`, runs a
    "stuff" RetrievalQA chain over `retriever`, prints the OpenAI token / cost
    usage of that call, and parses the model's answer with the same parser.

    Args:
        university_name: registered as a partial prompt variable. The templates
            below do not reference it at the moment.
            TODO: decide whether it makes sense to pass it at all.
        program_name: same as `university_name`.
        pydanticObject: pydantic model class describing the fields to extract.
        retriever: retriever handed to the RetrievalQA chain.
        chat_model: chat LLM handed to the RetrievalQA chain.
        query: full question text; it is passed to the chain unchanged.
        verbose: when True, the raw chain response is additionally passed to
            `process_llm_response`.

    Returns:
        (result_dict, source_list): the parsed answer converted to a dict, and
        the `source` metadata value of every retrieved document that has one.

    Raises:
        Whatever `output_parser.parse` raises when the model's answer does not
        match the requested format; this function does not catch it.
    '''

    system_message_template = '''
    You are a education consultant named TaiGer who specializes study programs.
    You are helping taiwanese students to apply study programs. so please be explicit include the requirement for taiwanese students
    you summarizes the "program information" and provides the information based on the program and university the student want to apply. 
    if the infromation is not from this program, please do not include the answer 
    if you could not find the answer just output "sorry i am not sure"
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Always answer from the perspective of being TaiGer 
    
    ----
    Progrom information:
    {context}\n

    -----
    '''

    human_message_template = '''
    {question}

    extract the above information in the following format:
    \n{format_instructions}\n
    '''

    # The parser supplies the format instructions for the prompt and later
    # parses the model's answer.
    output_parser = PydanticOutputParser(pydantic_object=pydanticObject)

    # Prompt used by the RetrievalQA chain; {context} and {question} are filled
    # in by the chain, the remaining variables are bound here.
    prompt = ChatPromptTemplate(
        messages=[
            SystemMessagePromptTemplate.from_template(system_message_template),
            HumanMessagePromptTemplate.from_template(human_message_template)
        ],
        partial_variables={"format_instructions": output_parser.get_format_instructions(),
                           "university_name": university_name,
                           "program_name": program_name},
        output_parser=output_parser  # attach the output parser to the prompt template
    )

    # The callback context records token usage / cost for the calls made inside it.
    with get_openai_callback() as cb:
        chain = RetrievalQA.from_chain_type(llm=chat_model,
                                            chain_type="stuff",  # stuff costs less
                                            retriever=retriever,
                                            chain_type_kwargs={"prompt": prompt},
                                            return_source_documents=True)

        llm_response = chain(query)  # the query is passed through untouched

    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

    # Parse the raw answer and convert the pydantic object into a dict.
    result_dict = output_parser.parse(llm_response['result']).dict()

    # Keep the sources for traceability. Documents without a 'source' metadata
    # entry are skipped instead of raising KeyError after the (paid) LLM call
    # has already completed.
    source_list = [
        document.metadata['source']
        for document in llm_response["source_documents"]
        if 'source' in document.metadata
    ]

    if verbose:
        process_llm_response(llm_response)

    # TODO: clean up the output and make it json / pandas like; the sources
    # must stay attached for traceability.
    return result_dict, source_list

In [8]:
# main process function
def process_program_information(university_name,
                                program_name,
                                chat_model,
                                retriever,
                                verbose=False):
    '''
    Collect the program information for one university / program pair.

    Every extraction step sends one question (prefixed with the program and
    university name) through `retreival_qa_chain_wrapper` together with the
    pydantic schema describing the expected fields, then turns the parsed
    result and its sources into a frame via `add_source_into_another_column`.

    Args:
        university_name: name of the university, inserted into every query.
        program_name: name of the study program, inserted into every query.
        chat_model: chat LLM forwarded to the QA chain wrapper.
        retriever: retriever forwarded to the QA chain wrapper.
        verbose: forwarded to the QA chain wrapper.

    Returns:
        pandas DataFrame: the per-step frames concatenated in step order, with
        the original index kept as a regular column (`reset_index(drop=False)`).
    '''

    # Prefix shared by all queries.
    prefix_template = "I want to apply for program: {program_name} at university: {university_name}.\n"
    prefix_prompt = PromptTemplate.from_template(prefix_template)
    formatted_prefix_prompt = prefix_prompt.format(program_name=program_name, university_name=university_name)

    # (schema, question) for every extraction step, in output order.
    # The keywords TOEFL / ECTS / portal are spelled correctly on purpose: the
    # query text is what the retriever matches against the stored documents.
    # NOTE(review): "FPSO" is kept as written -- presumably a deliberate term; confirm.
    extraction_steps = [
        # 1. program base info
        (ProgramBaseInfo,
         "Can you tell me about this program?"),
        # 2. application period info
        (ApplicationPeriod,
         "what is the application period and deadline to apply this program?"),
        # 3. required document info
        (RequiredDocument,
         "Which documents do I need to submit during the online application?"),
        # 4. required language info
        (RequiredLanguage,
         "what are the required TOEFL/IELTS/GRE/GMAT scores to apply this program?"),
        # 5. required academic performance info
        (RequiredAcademicPerf,
         "What are the required academic performance e.g. the overall GPA and required ECTS in any field to apply this program?"),
        # 6. URL links
        (UrlLinks,
         "Can you provide me the application portal, website, and FPSO links and contact details for apply this program?"),
    ]

    step_frames = []
    for schema, question in extraction_steps:
        step_result, step_sources = retreival_qa_chain_wrapper(university_name,
                                                               program_name,
                                                               schema,
                                                               retriever,
                                                               chat_model,
                                                               formatted_prefix_prompt + question,
                                                               verbose)
        step_frames.append(add_source_into_another_column(step_result, step_sources))

    # The index needs to be kept, hence drop=False.
    df_result = pd.concat(step_frames, axis=0).reset_index(drop=False)

    return df_result


In [9]:
# Run the full extraction for the Mannheim "Master in Management" program.
# The university name must be spelled correctly ("Mannheim"): it is inserted
# verbatim into every query sent to the retriever and the LLM.
df_mannheim = process_program_information(university_name="Mannheim",
                                         program_name="Master in Management",
                                         chat_model=chat, retriever=retriever, verbose=True)



Total Tokens: 991
Prompt Tokens: 938
Completion Tokens: 53
Successful Requests: 1
Total Cost (USD): $0.001513
{
  "university_name": "Mannheim",
  "program_name": "Master in Management",
  "degree": "Master's degree",
  "country": "Germany",
  "tuition_fee": "€1,500 per semester"
}


Sources:
https://www.mim-essay.com/mannheim-mim
https://www.mim-essay.com/mannheim-mim
https://www.uni-mannheim.de/en/academics/programs/mannheim-master-in-management/
Total Tokens: 1292
Prompt Tokens: 1233
Completion Tokens: 59
Successful Requests: 1
Total Cost (USD): $0.0019675
{
  "semester": "Both",
  "winter_semester_application_start": "01 April",
  "winter_semester_application_deadline": "15 May",
  "summer_semester_application_start": "16 October",
  "summer_semester_application_deadline": "15 November"
}


Sources:
https://www.uni-mannheim.de/en/academics/dates/application-deadlines/#:~:text=Master's%20Programs&text=The%20application%20deadline%20for%20the,deadlines%20are%20subject%20to%20approval

In [10]:
# Show the assembled result table as the cell output.
df_mannheim

Unnamed: 0,index,Result,Source
0,university_name,Mannheim,1. https://www.mim-essay.com/mannheim-mim\n2. ...
1,program_name,Master in Management,1. https://www.mim-essay.com/mannheim-mim\n2. ...
2,degree,Master's degree,1. https://www.mim-essay.com/mannheim-mim\n2. ...
3,country,Germany,1. https://www.mim-essay.com/mannheim-mim\n2. ...
4,tuition_fee,"€1,500 per semester",1. https://www.mim-essay.com/mannheim-mim\n2. ...
5,semester,Both,1. https://www.uni-mannheim.de/en/academics/da...
6,winter_semester_application_start,01 April,1. https://www.uni-mannheim.de/en/academics/da...
7,winter_semester_application_deadline,15 May,1. https://www.uni-mannheim.de/en/academics/da...
8,summer_semester_application_start,16 October,1. https://www.uni-mannheim.de/en/academics/da...
9,summer_semester_application_deadline,15 November,1. https://www.uni-mannheim.de/en/academics/da...
