In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'

embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [65]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

def load_documents():
  loader = PyPDFDirectoryLoader("./data/ooad")
  documents = loader.load()
  return documents

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document

def split_documents(documents: list[Document]):
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80, length_function=len, is_separator_regex=False)
  texts = text_splitter.split_documents(documents)
  return texts

In [81]:
documents = load_documents()
print(documents)
chunks = split_documents(documents)
# print(chunks[0])
print(f"Chunk size - {len(chunks)}")

# for _ in range(len(chunks)):
#   print(chunks[_])

[Document(metadata={'source': 'data\\ooad\\10_Design_Pattern_Handout.pdf', 'page': 0}, page_content='Object – Oriented Analysis and Design\nDesign Pattern\nInstructor: Le Thi Ngoc Hanh, Ph.D\nltnhanh@hcmiu.edu.vn\nVietnam National University of HCMC \nInternational University\nSchool of Computer Science and Engineering\n'), Document(metadata={'source': 'data\\ooad\\10_Design_Pattern_Handout.pdf', 'page': 1}, page_content='VNU-HCM International University School of Computer Science and Engineering\nSlide 2\nDesign patterns are typical solutions to commonly occurring \nproblems in software design. Design patterns in software development \nprovide established solutions to solve recurring design problems.\nWhat are Design Patterns\n'), Document(metadata={'source': 'data\\ooad\\10_Design_Pattern_Handout.pdf', 'page': 2}, page_content='VNU-HCM International University School of Computer Science and Engineering\nSlide 3\n\uf053 Simplifies Communication: Patterns give developers a \nshared lan

In [9]:
def unique_chunk_id(chunks):
  map = dict()

  for chunk in chunks:
    chunk.metadata['id'] = 0
    # print(f"{chunk.metadata['source']}:{chunk.metadata['page']}")

  for chunk in chunks:
    # print(f"{chunk.metadata['source']}:{chunk.metadata['page']}:{chunk.metadata['id']}")
    text = chunk.metadata['source'] + ':' + str(chunk.metadata['page'])
    if text not in map.keys():
      map[text] = 0
    else:
      map[text] += 1
    chunk.metadata['id'] = map[text]

  # for chunk in chunks:
  #   print(f"{chunk.metadata['source']}:{chunk.metadata['page']}:{chunk.metadata['id']}")
  return chunks


# print(f'{unique_chunk_id(chunks)}')

In [10]:
from langchain_chroma import Chroma

def add_to_chroma(chunks: list[Document]):
  db = Chroma(persist_directory='./data/db', embedding_function=embedding_model)

  # unique page id
  chunks_with_ids = unique_chunk_id(chunks)
  # print(chunks_with_ids)

  # add or update documents
  existing_items = db.get(include=[])  # IDs are always included by default
  # print(existing_items)

  existing_ids = set(existing_items["ids"]) # type string
  print(f"Number of existing documents in DB: {len(existing_ids)}")

  # print(existing_ids)

  start_id = len(existing_ids)

  # Only add documents that don't exist in the DB.
  new_chunks = []
  new_chunk_ids = []
  for chunk in chunks_with_ids:
    if str(start_id) not in existing_ids:
      new_chunks.append(chunk)
    start_id += 1
    new_chunk_ids.append(str(start_id))

  # for i in new_chunks:
  #   print(i)

  if len(new_chunks):
    print(f"Adding new documents: {len(new_chunks)}")
    # print(new_chunk_ids)
    # print(type(new_chunk_ids))
    db.add_documents(new_chunks, ids=new_chunk_ids) # all type should be string, ids should be unique
  else:
    print("No new documents to add")

In [67]:
add_to_chroma(chunks)

Number of existing documents in DB: 87
Adding new documents: 254


In [28]:
def semantic_search(query_text):
  db = Chroma(persist_directory='./data/db', embedding_function=embedding_model)

  results = db.similarity_search_with_score(query_text, k=5)
  context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

  documents = dict()
  for doc, _score in results:
    source = doc.metadata['source']
    page = doc.metadata['page']
    if source not in documents:
      documents[source] = [page]
    else:
      documents[source].append(page)

  for key, value in documents.items():
    documents[key].sort()

  return context_text, documents

In [13]:
import os

HUGGINGFACEHUB_API_TOKEN = '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'
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [14]:
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate

# question = 'Soccer'
question = """Which of the following is a property of a hash table?
   A) The average time complexity for search, insert, and delete operations is always O(1).
   B) The hash function used must be perfect, meaning it produces no collisions.
   C) The load factor (number of elements / number of buckets) must always be less than 1.
   D) The hash table must be resized whenever the number of elements inserted reaches a certain threshold.
"""
template = """
Question: {question}
Answer:You are an expert in referring documents in Computer Science. Think step by step to answer the question and 
refer to me a link relate to above question. """

prompt = PromptTemplate.from_template(template)

In [15]:
template_summary = """
Answer: You are an expert in summarize knowledge in Computer Science, 
now analyze to understand the {sentence}.Then summarize an important concepts 
in a meaningful way.
"""
prompt_summary = PromptTemplate.from_template(template_summary)

In [37]:
topic = 'Data Structure and Algorithm'
topic2 = 'Object Oriented Programming'
topic3 = 'Computer Network'
topic4 = 'SQL Database'
topic5 = 'Operating System'
topic6 = 'Machine Learning'
topic7 = 'Natural Language Processing'
topic8 = 'Computer Vision'

# Easy question
template_MCQ="""
Topic: You are an expert in Computer Science and get the knowledge in {topic}.
Answer: Generate to me 5 MCQs relate to above topic. Don't show the answers.
"""
prompt_MCQ = PromptTemplate.from_template(template_MCQ)

In [49]:
# Intermediate questions (various sub-topics) => RAG
template_sub_topic="""
Topic: {topic}
Answer: You are an expert in Computer Science. List to 
me 5 interested sub-topics in above topic. Note just print only sub-topic name.
"""
prompt_sub_topic = PromptTemplate.from_template(template_sub_topic)

template_MCQ_RAG="""
Context: You are an expert in Computer Science. As you know, {context}
Answer: Generate to me 2 MCQs relate to context. Don't show the answers.
"""
prompt_MCQ_RAG = PromptTemplate.from_template(template_MCQ_RAG)

In [18]:
# Hard questions (hard sub-topics)  => RAG + CoT
template_COT_sub_topic = """
Topic: {topic}
Answer: You are an expert in Computer Science. List to 
me 2 challenged sub-topics in above topic.
"""
prompt_COT_sub_topic = PromptTemplate.from_template(template_COT_sub_topic)

template_COT_MCQ = """
Context: You are an expert in Computer Science. Here is one of the hardest concept 
in Computer Science as you know, {context}.
Answer: Generate to me 1 hard MCQ relate to context. Don't show the answers.
"""
prompt_COT_MCQ = PromptTemplate.from_template(template_COT_MCQ)

In [63]:
def split_5_topic(topic):
  ids = dict()
  num = ['1', '2', '3', '4', '5']
  for i in range(len(topic)):
    if topic[i] in num:
      ids[topic[i]] = i
  pos = []
  for k, v in ids.items():
    # print(v)
    pos.append(v)
  ans = []
  for i in range(len(pos)-1):
    ans.append(topic[pos[i]:pos[i + 1]])
  ans.append(topic[pos[len(pos) - 1]: len(topic)])
  # for i in range(len(ans)):
  #   print(ans[i])
  return ans

def split_2_topic(topic):
  ids = dict()
  num = ['1', '2']
  for i in range(len(topic)):
    if topic[i] in num:
      ids[topic[i]] = i
  pos = []
  for k, v in ids.items():
    pos.append(v)
  ans = []
  for i in range(len(pos)-1):
    ans.append(topic[pos[i]:pos[i + 1]])
  return ans

In [20]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

# model_id = 'mistralai/Mistral-7B-Instruct-v0.3'
model_id = 'mistralai/Mistral-Nemo-Instruct-2407'
# model_id = 'microsoft/phi-4'
# model_id = 'mistralai/Mixtral-8x7B-Instruct-v0.1'
# model_id = 'microsoft/Phi-3-small-128k-instruct'

llm = HuggingFaceEndpoint(
    repo_id=model_id,
    max_length=256,
    temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)


                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.


In [21]:
llm_chain = prompt | llm
print(llm_chain.invoke({'question':question}))



D) The hash table must be resized whenever the number of elements inserted reaches a certain threshold.

Reference: https://www.geeksforgeeks.org/load-factor-in-hash-table/

Explanation: A hash table's performance can degrade as the number of elements inserted increases. To maintain the performance, the hash table needs to be resized whenever the number of elements inserted reaches a certain threshold, known as the load factor. This is done to ensure that the average time complexity for search, insert, and delete operations remains close to O(1). The other options are not accurate representations of a hash table's properties.


In [53]:
llm_chain_sub_topic = prompt_sub_topic | llm
sub_topic = llm_chain_sub_topic.invoke({'topic':topic2})
print(sub_topic)

1. Inheritance
2. Polymorphism
3. Encapsulation
4. Abstraction
5. Association


In [64]:
sub_topic_main = split_5_topic(sub_topic)
print(sub_topic_main)

['1. Inheritance\n', '2. Polymorphism\n', '3. Encapsulation\n', '4. Abstraction\n', '5. Association']


In [78]:
for topic in sub_topic_main:
    topic = topic.replace('\n', '').strip(' .12345')
    # print(topic[0])
    context_text, documents = semantic_search(topic)
    print('context ', context_text)
    print('documents ', documents)
    # llm_chain_summary = prompt_summary | llm
    # context = llm_chain_summary.invoke({'sentence':context_text})
    # print('Context', context)
    # llm_chain_MCQ_RAG = prompt_MCQ_RAG | llm
    # print(llm_chain_MCQ_RAG.invoke({'context':context}))
    print("--------------------")                   

context  VNU-HCM International University School of Computer Science and Engineering
Slide 35
Inheritance

---

VNU-HCM International University School of Computer Science and Engineering
Slide 31
 Classes can reuse code and properties from other classes. 
 If a class has a parent class, it means the class has 
inherited the properties of the parent. The child class can 
also modify or extend the behavior of its parent class. 
Inheritance allows you to reuse code without redefining the 
functions of a child class.
 Use inheritance when there is a clear “is-a” relationship 
between classes.
Inheritance

---

VNU-HCM International University School of Computer Science and Engineering
Slide 34
 Specialization: The act of defining one class as a 
refinement of another.
 Subclass: A class defined in terms of a specialization 
of a superclass using inheritance.
 Superclass: A class serving as a base for inheritance 
in a class hierarchy 
 Inheritance: Automatic duplication of supercla

In [29]:
query = 'Design Pattern'
context_text, documents = semantic_search(query)
print(context_text)

VNU-HCM International University School of Computer Science and Engineering
Slide 6
Four essential components of a design pattern:
• Name: A concise identifier for easy reference (e.g., 
Singleton, Observer).
• Problem: The specific issue or recurring situation 
the pattern addresses.
• Solution: The approach or structure that resolves 
the problem.
• Consequences: The pros and cons of using this 
pattern.
Structure of a Design Pattern

---

VNU-HCM International University School of Computer Science and Engineering
Slide 2
Design patterns are typical solutions to commonly occurring 
problems in software design. Design patterns in software development 
provide established solutions to solve recurring design problems.
What are Design Patterns

---

VNU-HCM International University School of Computer Science and Engineering
Slide 5
How to construct a Design Pattern?

---

VNU-HCM International University School of Computer Science and Engineering
Slide 7
Patterns
Source: https://www.jobs

In [30]:
llm_chain_summary = prompt_summary | llm
context = llm_chain_summary.invoke({'sentence':context_text})
print(context)                   

Design Patterns are reusable solutions to common problems in software design. They are not finished designs that can be transformed directly into code, but rather descriptions of how to solve problems that arise in particular contexts. Design patterns are like blueprints for solving common programming challenges. They provide a general approach or template for solving a specific problem, but the details of how to implement the solution will depend on the specific context in which the pattern is applied.

Design patterns are categorized into three main groups:
1. Creational Patterns: These patterns provide mechanisms for creating objects in such a way as to increase flexibility and reuse of existing code. Examples include Singleton, Factory, and Abstract Factory.
2. Structural Patterns: These patterns deal with the composition of classes or objects into larger structures while keeping these structures efficient. Examples include Adapter, Bridge, and Composite.
3. Behavioral Patterns: Th

In [33]:
llm_chain_MCQ_RAG = prompt_MCQ_RAG | llm
print(llm_chain_MCQ_RAG.invoke({'context':context}))

1. Which of the following is NOT a category of design patterns?
   A) Creational Patterns
   B) Structural Patterns
   C) Functional Patterns
   D) Behavioral Patterns

2. What is the primary goal of using design patterns in software development?
   A) To create finished designs that can be directly transformed into code
   B) To provide a common language and set of solutions for recurring problems
   C) To automate the entire software development process
   D) To generate random code for increased flexibility


In [38]:
llm_chain_MCQ = prompt_MCQ | llm
print(llm_chain_MCQ.invoke({'topic':topic7}))

1. **What is the primary goal of Natural Language Processing (NLP)?**
   A) To understand and generate human language
   B) To create artificial intelligence
   C) To improve search engine results
   D) To translate languages

2. **Which of the following is NOT a common approach in text classification?**
   A) Naive Bayes
   B) Support Vector Machines (SVM)
   C) Decision Trees
   D) Common Sense Reasoning

3. **What does the acronym BERT stand for in the context of NLP?**
   A) Bidirectional Encoder Representations from Transformers
   B) Bidirectional Encoder for Reinforcement Training
   C) Bidirectional Encoder for Rule-based Transformations
   D) Bidirectional Encoder for Text Summarization

4. **Which of the following is a popular library for NLP in Python?**
   A) TensorFlow
   B) PyTorch
   C) NLTK
   D) Both A and C

5. **What is the primary purpose of Named Entity Recognition (NER) in NLP?**
   A) To identify and categorize named entities in text
   B) To generate human-like 