# Load Dataset

In [5]:
!python3 -m pip install --quiet datasets pandas nomic sentence-transformers einops pymongo


[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# Connect to MongoDB

In [1]:
import pymongo

connection_string = "mongodb+srv://machkiet2507:machvikiet@kietmach2507.wlsbm.mongodb.net/"

# Connect to your Atlas cluster
mongo_client = pymongo.MongoClient(connection_string)

# Create collections to MongoDB

In [37]:
import pandas as pd

# Ingest data into Atlas
db = mongo_client["rag_db"]   # tên db thì giữ nguyên
collection = {}
collection['student_handbook'] = db["student_handbook"]   
collection['recruitment_and_internship_program'] = db["recruitment_and_internship_program"]   
collection['timetable_and_classes'] = db["timetable_and_classes"]   
collection['academic_affairs'] = db["academic_affairs"]   
collection['scholarship'] = db["scholarship"]   
collection['events'] = db["events"] 


In [28]:
documents = df.to_dict("records")
collection['student_handbook'].insert_many(documents)
print("Collections in database:", db.list_collection_names())

Collections in database: ['student_handbook']


## Chunking & vectorize function

Initialize the embedding model (IBM watsonx)

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames as EmbedParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from ibm_watsonx_ai.foundation_models import Embeddings
import os
from environs import load_dotenv
load_dotenv('./data/.env')
my_credentials = {
  "url": "https://us-south.ml.cloud.ibm.com",
  "apikey": os.environ['WATSONX_APIKEY'],
}

# model_id = 'sentence-transformers/all-minilm-l12-v2'
model_id = 'intfloat/multilingual-e5-large'
gen_parms = None
project_id = os.environ['WATSONX_PROJECT_ID']
space_id = None
verify = False

# Set the truncate_input_tokens to a value that is equal to or less than the maximum allowed tokens for the embedding model that you are using. If you don't specify this value and the input has more tokens than the model can process, an error is generated.

embed_params = {
  EmbedParams.TRUNCATE_INPUT_TOKENS: 512,
}

model = Embeddings(
  model_id=model_id,
  credentials=my_credentials,
  params=embed_params,
  project_id=project_id,
  verify=verify
)

In [41]:
df = pd.read_csv('./data/sample_FIT_news_shortened.csv')
df.head(2)

Unnamed: 0,title,article,file_links,url,type
0,[FUJINET] Tuyển dụng FRESHER DEVELOPERS (JAVA/...,"\nĐịa điểm làm việc: Số 10 Phổ Quang, Phường 2...",[],https://www.fit.hcmus.edu.vn/tin-tuc/d/fujinet...,recruitment
1,Golden Owl Solutions Tuyển dụng Thực tập sinh ...,\nBENEFITSAllowance: 3.000.000 - 6.000.000 VND...,['https://www.fit.hcmus.edu.vn/vn/UserFiles\\7...,https://www.fit.hcmus.edu.vn/tin-tuc/d/golden-...,recruitment


Initialize the chunker (splitter)

In [25]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150,
    )

In [43]:
import copy
def chunk_and_embed(chunker, embed_model, data, content_column='article'):
    index_mapping = {
        'title': '',
        'article': '',
        'embedding': [],
        'school_year': '2024',
        'in_effect': '2024',
        'file_links': [],
        'created_at': '',
        'updated_at': '',
    }
    results = []
    for _,row in data.iterrows():
        chunks = chunker.create_documents([row[content_column]])
        chunks = [chunk.page_content + row['title'] for chunk in chunks] # Add the title of the document into each chunk before embedding
        embeddings = embed_model.embed_documents(chunks)
        for chunk, embed in zip(chunks, embeddings):
            record = copy.deepcopy(index_mapping)
            record['title'] = row['title']
            record['article'] = chunk
            record['embedding'] = embed
            record['file_links'] = row['file_links']
            results.append(record)
    return results


In [44]:
df['type'].unique()

array(['recruitment', 'timetable', 'scholarship', 'academic_affairs',
       'events'], dtype=object)

In [45]:
collection['recruitment_and_internship_program'].insert_many(chunk_and_embed(chunker=splitter, embed_model=model, data=df[df['type']=='recruitment']))
collection['academic_affairs'].insert_many(chunk_and_embed(chunker=splitter, embed_model=model, data=df[df['type']=='academic_affairs']))
collection['scholarship'].insert_many(chunk_and_embed(chunker=splitter, embed_model=model, data=df[df['type']=='scholarship']))
collection['events'].insert_many(chunk_and_embed(chunker=splitter, embed_model=model, data=df[df['type']=='events']))
collection['timetable_and_classes'].insert_many(chunk_and_embed(chunker=splitter, embed_model=model, data=df[df['type']=='timetable']))

InsertManyResult([ObjectId('66cd57552662c792372fe2e1'), ObjectId('66cd57552662c792372fe2e2'), ObjectId('66cd57552662c792372fe2e3'), ObjectId('66cd57552662c792372fe2e4'), ObjectId('66cd57552662c792372fe2e5'), ObjectId('66cd57552662c792372fe2e6'), ObjectId('66cd57552662c792372fe2e7'), ObjectId('66cd57552662c792372fe2e8')], acknowledged=True)

# Upload the student handbook(PDF) to MongoDB

## Connect to Mongo

In [11]:
import pymongo

connection_string = "mongodb+srv://machkiet2507:machvikiet@kietmach2507.wlsbm.mongodb.net/"

# Connect to your Atlas cluster
mongo_client = pymongo.MongoClient(connection_string)

db = mongo_client['rag_db']
collection = db["student_handbook"]

## Read pdf file to text

In [20]:
import pypdfium2 as pdfium
from glob import glob

paths = glob('./data/Sổ tay sinh viên/*')

In [21]:
index_mapping = {
        'title': '',
        'article': '',
        'embedding': [],
        'school_year': '2024',
        'in_effect': '2024',
        'created_at': '',
        'updated_at': '',
    }

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames as EmbedParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from ibm_watsonx_ai.foundation_models import Embeddings
import os
from environs import load_dotenv
load_dotenv('./data/.env')
my_credentials = {
  "url": "https://us-south.ml.cloud.ibm.com",
  "apikey": os.environ['WATSONX_APIKEY'],
}

# model_id = 'sentence-transformers/all-minilm-l12-v2'
model_id = 'intfloat/multilingual-e5-large'
gen_parms = None
project_id = os.environ['WATSONX_PROJECT_ID']
space_id = None
verify = False

# Set the truncate_input_tokens to a value that is equal to or less than the maximum allowed tokens for the embedding model that you are using. If you don't specify this value and the input has more tokens than the model can process, an error is generated.

embed_params = {
  EmbedParams.TRUNCATE_INPUT_TOKENS: 512,
}

model = Embeddings(
  model_id=model_id,
  credentials=my_credentials,
  params=embed_params,
  project_id=project_id,
  verify=verify
)

In [30]:
import copy
data = []
for path in paths:
    pdf = pdfium.PdfDocument(path)
    for i,page in enumerate(pdf):
        record = copy.deepcopy(index_mapping)
        record['title'] = path.split('\\')[-1].removesuffix('.pdf')
        record['article'] = page.get_textpage().get_text_range()
        record['embedding'] = model.embed_documents([record['title'] + '\n' + page.get_textpage().get_text_range()])[0]
        data.append(record)



In [31]:
data

[{'title': 'Khoa công nghệ thông tin',
  'article': 'KHOA \r\nCÔNG NGHỆ THÔNG TIN\r\nThông tin chung\r\ninfo@\x8at.hcmus.edu.vn\r\nHỗ trợ sinh viên: tlsv@\x8at.hcmus.edu.vn \r\nGiáo vụ: \r\no Chương trình Chuẩn: \r\ngiaovu@\x8at.hcmus.edu.vn \r\no Chương trình Chất lượng cao: \r\ngiaovu.clc@\x8at.hcmus.edu.vn\r\no Chương trình Tiên tiến: \r\ngiaovu@apcs.\x8atus.edu.vn\r\nCố vấn học tập: cvht@\x8at.hcmus.edu.vn \r\nChương trình đề án: ctdb@hcmus.edu.vn\r\n(028) 38 354 266 (Ext: 500)\r\n(028) 62 884 499 (Ext: 4000)\r\nVăn phòng khoa: phòng I.53, \r\n227 Nguyễn Văn Cừ, Q.5, TP.HCM\r\nwww.\x8at.hcmus.edu.vn\r\nBan Chủ nhiệm:\r\nTrưởng Khoa: TS. Đinh Bá Tiến\r\nPhó Trưởng Khoa: TS. Lâm Quang Vũ, \r\nPGS.TS. Nguyễn Văn Vũ, ThS. Văn Chí Nam \r\nMỤC TIÊU ĐÀO TẠO\r\nSỨ MỆNH\r\n- Cung cấp các trải nghiệm giảng dạy và học \r\ntập hàng đầu cho các chương trình đào tạo \r\nbậc đại học và sau đại học trong lĩnh vực máy \r\ntính và công nghệ thông tin.\r\n- Đào tạo sinh viên, học viên trở thành những

In [32]:
collection.insert_many(data)

InsertManyResult([ObjectId('66cd76dc60051e749c3fd76a'), ObjectId('66cd76dc60051e749c3fd76b'), ObjectId('66cd76dc60051e749c3fd76c'), ObjectId('66cd76dc60051e749c3fd76d'), ObjectId('66cd76dc60051e749c3fd76e'), ObjectId('66cd76dc60051e749c3fd76f'), ObjectId('66cd76dc60051e749c3fd770'), ObjectId('66cd76dc60051e749c3fd771'), ObjectId('66cd76dc60051e749c3fd772'), ObjectId('66cd76dc60051e749c3fd773'), ObjectId('66cd76dc60051e749c3fd774'), ObjectId('66cd76dc60051e749c3fd775'), ObjectId('66cd76dc60051e749c3fd776'), ObjectId('66cd76dc60051e749c3fd777'), ObjectId('66cd76dc60051e749c3fd778'), ObjectId('66cd76dc60051e749c3fd779'), ObjectId('66cd76dc60051e749c3fd77a'), ObjectId('66cd76dc60051e749c3fd77b'), ObjectId('66cd76dc60051e749c3fd77c'), ObjectId('66cd76dc60051e749c3fd77d'), ObjectId('66cd76dc60051e749c3fd77e'), ObjectId('66cd76dc60051e749c3fd77f'), ObjectId('66cd76dc60051e749c3fd780'), ObjectId('66cd76dc60051e749c3fd781'), ObjectId('66cd76dc60051e749c3fd782'), ObjectId('66cd76dc60051e749c3fd7