In [28]:
import os
from langchain.docstore.document import Document

In [9]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

In [12]:
# Raw template text for question generation. `{chunk}` is the only placeholder;
# it is substituted with a document chunk's text when the template is formatted.
prompt = """You are an AI assistant that generates relevant and natural-sounding questions based on the following text within the triple backticks, which is sourced from Brandeis University's knowledge base. 
        Text:```{chunk}```
        Generate a list of diverse and well-structured questions that a student, faculty member, or visitor might ask based on this content. 
        Ensure the questions:
            - Are clear and concise.
            - Are specific to Brandeis University.
            - Vary in type (fact-based, procedural, explanatory).
            - Avoid redundancy.
        Return 3-5 questions as a numbered list."""

# Wrap the raw string in a PromptTemplate so it can be combined with the model.
prompt_template = PromptTemplate.from_template(prompt)

In [4]:
# Chat model used to generate the questions.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.7,
)

In [7]:
import json
from langchain.schema import Document

def load_from_file(filename):
    """Load documents from a JSON Lines file.

    Each non-blank line must be a JSON object with a ``page_content`` key and
    a ``metadata`` key; one ``Document`` is built per line, in file order.
    Blank or whitespace-only lines are ignored.

    Args:
        filename: Path to the UTF-8 encoded JSON Lines file.

    Returns:
        list: The ``Document`` objects in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``page_content`` or ``metadata``.
    """
    documents = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line (e.g. a trailing newline at the end of the file) is
            # not a record; json.loads("") would raise, so skip it.
            if not line:
                continue
            json_record = json.loads(line)
            documents.append(Document(page_content=json_record["page_content"], metadata=json_record["metadata"]))
    return documents

# Load the document chunks from documents.jsonl (path is relative to the
# kernel's working directory).
splits = load_from_file('documents.jsonl')

In [22]:
# Pipe the prompt template into the chat model; generate_questions calls
# chain.invoke(...) with a chunk's text and reads `.content` from the result.
chain = prompt_template | llm

In [18]:
# Inspect the text of the first loaded chunk.
splits[0].page_content

Document(metadata={'source': 'https://www.brandeis.edu/audience/fac-staff.html', 'title': 'Faculty & Staff | Brandeis University'}, page_content='Faculty & StaffExpand AllAcademic ResourcesAcademic CalendarBookstoreCenter for Teaching and LearningDepartments and ProgramsFaculty and Researcher DirectoryFaculty HandbookLATTE(Learning and Teaching Technology Environment)MoodleBrandeis LibraryRegistrarStudent Accessibility Support: Information for FacultyWorkdayAdministrative ResourcesBrandeis Hospitality (Dining)Budget and PlanningBusiness and FinanceBrandeis Stationery and Business Card Ordering PortalCampus Planning and OperationsFacilities ServicesProcurement and Business ServicesPublic SafetyUniversity EventsCOVID- 19  InformationCOVID- 19  Information for Faculty and StaffGovernanceBoard of TrusteesFaculty Governance Task ForceFaculty SenateOffice of the PresidentPrinciples of Free Speech and Free ExpressionHuman ResourcesBenefitsEmployee FormsPaid Time OffHoliday ScheduleJob Opportu

In [40]:
def generate_questions(splits):
    """Produce one generated-questions document per input chunk.

    Each chunk's text is passed to the module-level ``chain``; the ``content``
    of the reply becomes the new document's ``page_content``, while the
    originating chunk text and its ``source`` are stored in the metadata.

    Args:
        splits: Iterable of documents exposing ``page_content`` and a
            ``metadata`` mapping that contains a ``"source"`` key.

    Returns:
        list: One ``Document`` per chunk, in input order.
    """
    def _to_question_doc(chunk_doc):
        # One model call per chunk; keep the chunk text alongside the answer.
        chunk_text = chunk_doc.page_content
        reply = chain.invoke(chunk_text)
        return Document(
            page_content=reply.content,
            metadata={"chunk": chunk_text, "source": chunk_doc.metadata["source"]},
        )

    return [_to_question_doc(chunk_doc) for chunk_doc in splits]
# Trial run: generate questions for the first two chunks only.
d = generate_questions(splits[0:2])


In [None]:
from langchain_openai import OpenAIEmbeddings
# Embedding model handed to the Qdrant vector store; the API key is read from
# the OPENAI_API_KEY environment variable.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=os.environ.get("OPENAI_API_KEY"),
)
# Endpoint of the Qdrant cluster that hosts the collection.
url = "https://73b67b21-6bf2-4607-befe-cd5a914ddb80.us-west-2-0.aws.cloud.qdrant.io"

In [None]:
# Index the first chunk into the "brandeis.edu" collection on the Qdrant
# cluster at `url`, embedding it with `embeddings`.
# NOTE(review): QdrantVectorStore is not imported in any visible cell; it
# presumably needs `from langchain_qdrant import QdrantVectorStore` — confirm
# and add it to the imports cell.
qdrant = QdrantVectorStore.from_documents(
    # from_documents expects a list of Documents; the original passed the bare
    # Document `splits[0]`, so wrap the first chunk in a list.
    [splits[0]],
    embeddings,
    url=url,
    prefer_grpc=True,
    api_key=os.getenv("QDRANT_CLUSTER_KEY"),
    collection_name="brandeis.edu",
)