In [1]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [9]:
from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
import os 

load_dotenv('/home/iliya/education-agent/backend/src/.env')
MILVUS_URI = "./milvus_example.db"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
print(OPENAI_API_KEY)

def setup_vectorstore():
    return Milvus(
        embedding_function=OpenAIEmbeddings(OPENAI_API_KEY),
        connection_args={"uri": MILVUS_URI},
        collection_name="education_documents"
    )

def add_documents_to_vectorstore(vectorstore, documents):
    vectorstore.add_documents(documents)

None


In [8]:
setup_vectorstore()

TypeError: BaseModel.__init__() takes 1 positional argument but 2 were given

In [2]:
from langchain_openai import ChatOpenAI
import os

class LLMInterface:
    def __init__(self, model_name="gpt-4o"):
        self.llm = ChatOpenAI(
            model_name=model_name,
            temperature=0.2,
            openai_api_key=os.getenv("OPENAI_API_KEY")
        )

    def get_llm(self):
        return self.llm

    def generate_response(self, prompt):
        return self.llm.predict(prompt)

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import os

class Preprocessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )

    def process_text(self, text):
        return self.text_splitter.split_text(text)

    def process_pdf(self, pdf_path):
        loader = PyPDFLoader(pdf_path)
        pages = loader.load_and_split()
        return self.text_splitter.split_documents(pages)

    def process_all_pdfs(self):
        all_docs = []
        for filename in os.listdir("../data/pdf"):
            if filename.endswith(".pdf"):
                file_path = os.path.join("../data/pdf", filename)
                all_docs.extend(self.process_pdf(file_path))
        return all_docs

In [28]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

class VectorStore:
    def __init__(self):
        self.embeddings = OpenAIEmbeddings()
        self.vectorstore = Chroma(
            persist_directory="./chroma_db",
            embedding_function=self.embeddings
        )

    def add_texts(self, texts):
        self.vectorstore.add_texts(texts)

    def add_documents(self, documents):
        self.vectorstore.add_documents(documents)

    def similarity_search(self, query, k=4):
        return self.vectorstore.similarity_search(query, k=k)

    def get_retriever(self, k=10):
        return self.vectorstore.as_retriever()

In [29]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

class Retriever:
    def __init__(self):
        self.retriever = VectorStore().vectorstore.as_retriever()

    def format_docs(self, docs):
        """Formats the retrieved documents into a readable string format."""
        return "\n\n".join(doc.page_content for doc in docs)

    def retrieve_chain(self):
        return {"context": self.retriever | self.format_docs}

In [30]:
import os
import json

def ensure_directory_exists(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def save_to_json(data, filename):
    ensure_directory_exists("./data/processed")
    with open(os.path.join("./data/processed", filename), 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def load_from_json(filename):
    file_path = os.path.join("./data/processed", filename)
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return None

def save_to_text(data, filename):
    ensure_directory_exists("./data/processed")
    text_path = os.path.join("./data/processed", filename)
    with open(text_path, 'w', encoding='utf-8') as f:
        if isinstance(data, dict):
            for key, value in data.items():
                f.write(f'{key}: {value}\n')
        elif isinstance(data, list):
            for item in data:
                f.write(f'- {item}\n')
        else:
            f.write(str(data))

In [31]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

class EducationAnalyzer:
    def __init__(self):
        self.llm = LLMInterface().get_llm()
        self.retriever = VectorStore().get_retriever()
        self.prompt = PromptTemplate(
            input_variables=["country", "context"],
            template="""
            You are an expert in education systems. Your task is to analyze the education system of {country} based strictly on the following information provided:
            {context}

            In context you will be provided with metadata of the context, with contains sourse, page. Try to make references to that text in your analysis as much as possible. Add to page number 1, so it numbering starts with 1. Reference should be in format: [Document name : page number].
            Important: Do not fabricate or infer any data that is not explicitly mentioned in the provided context. Your analysis should be grounded in the given information only. If any details are missing, indicate so without making assumptions.

            Provide the analysis in the following format:
            1. PISA results (if included in the context)
            2. Useful experience for Ukraine
            3. Mission and vision (if included in the context)
            4. Current development strategies
            5. Key features of the education system
            6. Key competencies
            7. General description of the education system's product
            8. Outcomes of this educational system in terms of soft skills. Format the outcomes as a table with 3 columns: how it is formed, the outcome itself, and its generalized name.
            
            Example table (for Finnish Educational System):
            Learning Approach  | Outcome                                                | General Category
            ------------------------------------------------------------------------------------------
            Inquiry-Based Learning: Finnish education encourages students to ask         | Graduates who can analyze complex situations, generate innovative solutions, and think independently. | Critical Thinking and Problem-Solving Skills
            questions, think critically, and engage in problem-solving activities.
            Creative Thinking: Emphasis on open-ended tasks and projects fosters          | Graduates skilled in generating creative solutions.                | Innovation and Creativity
            creativity and innovation.
            Self-Directed Learning: Students are given autonomy to pursue their           | Individuals committed to lifelong learning and adaptable to new     | Lifelong Learning Attitude
            interests, promoting intrinsic motivation.                                    | learning opportunities.
            
            Remember: Do not invent any information. Use only the data given in the context.
            """
        )

    def analyze_country(self, country):
        docs = self.retriever.get_relevant_documents(country)
        context = "\n".join([
            json.dumps({
                "content": doc.page_content,
                "metadata": doc.metadata
            }) for doc in docs
        ])
        chain = self.prompt | self.llm | StrOutputParser()
        return chain.invoke({"country": country, "context": context})

In [32]:
ADD_DOCUMENTS = False

In [33]:
preprocessor = Preprocessor()
vectorstore = VectorStore()
education_analyzer = EducationAnalyzer()

pdf_data = preprocessor.process_all_pdfs()

if ADD_DOCUMENTS:
    vectorstore.add_documents(pdf_data)
    
countries = ["Finnish", "Estonian", "Polish"]
analyses = {}
for country in countries:
    analysis = education_analyzer.analyze_country(country)
    save_to_json(analysis, f"{country.lower()}_analysis.json")
    save_to_text(analysis, f"{country.lower()}_analysis.txt")
    analyses[country] = analysis