# Load Dataset

In [5]:
!python3 -m pip install --quiet datasets pandas nomic sentence-transformers einops pymongo


[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
from datasets import load_dataset
import pandas as pd
# Open the dataset in streaming mode so it is not downloaded in full
data = load_dataset(
    "MongoDB/cosmopedia-wikihow-chunked",
    split="train",
    streaming=True,
)
# Take only the first 100 streamed records
data_head = data.take(100)
# Materialize those records as a DataFrame
df = pd.DataFrame(data_head)
# Keep only the rows whose "text" value is present
df = df.loc[df["text"].notna()]
# Show the first rows
df.head()

Unnamed: 0,doc_id,chunk_id,text_token_length,text
0,0,0,180,Title: How to Create and Maintain a Compost Pi...
1,0,1,141,**Step 2: Gather Materials**\nGather brown (ca...
2,0,2,182,_Key guideline:_ For every volume of green mat...
3,0,3,188,_Key tip:_ Chop large items like branches and ...
4,0,4,157,**Step 7: Maturation and Use**\nAfter 3-4 mont...


# Create vector embeddings 

In [None]:
from nomic import embed
from sentence_transformers import SentenceTransformer

# Load the embedding model (https://huggingface.co/nomic-ai/nomic-embed-text-v1)
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally — confirm the source is trusted.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)

def get_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return the result of ``.tolist()``."""
    return model.encode(text).tolist()

# Run get_embedding over every "text" value and store the vectors in a new column
text_embeddings = df["text"].apply(get_embedding)
df["text_embedding"] = text_embeddings
df.head()

# Connect to MongoDB

In [6]:
import os
from getpass import getpass

import pymongo

# Never hardcode credentials in a notebook: read the Atlas connection string
# from the MONGODB_URI environment variable, and prompt for it (without echo)
# only when the variable is not set.
# NOTE(review): any credentials that were previously committed in this cell
# should be rotated.
connection_string = os.environ.get("MONGODB_URI") or getpass(
    "MongoDB Atlas connection string: "
)

# Connect to your Atlas cluster
mongo_client = pymongo.MongoClient(connection_string)

# Create collections in MongoDB

In [26]:
# Column-oriented sample record: each key is a field name and each value is a
# one-element list, so the resulting DataFrame has exactly one row.
index_mapping = {
    "title": ["[CQ] Nghỉ học lớp Lý thuyết số 22_6 ngày 29/3/2024"],
    "article": [
        "THÔNG BÁO Lớp Lý thuyết số 22_6, lịch học T6 1-4, phòng E304 sẽ nghỉ học 1 buổi ngày 29/3/2024 do Giảng viên bận công tác.GIÁO VỤ KHOA CNTT"
    ],
    "embedding": [[]],  # empty list placeholder for the embedding field
    "school_year": ["2024"],
    "in_effect": ["2024"],
    "file_links": [""],
    "page_number": [""],
    "created_at": [""],
    "updated_at": [""],
}
# Build the one-row DataFrame from the mapping
df = pd.DataFrame(data=index_mapping)

# Get a handle to the "rag_db" database and to each collection used below
db = mongo_client["rag_db"]   # keep the database name unchanged
collection_names = (
    "student_handbook",
    "recruitment_and_internship_program",
    "timetable_and_classes",
    "academic_affairs",
    "scholarship",
    "events",
)
# Map every collection name to its handle on db
collection = {name: db[name] for name in collection_names}


 


In [28]:
# Turn each DataFrame row into a dict and insert them into student_handbook
documents = df.to_dict(orient="records")
handbook = collection["student_handbook"]
handbook.insert_many(documents)
# Report which collections the database lists after the insert
print("Collections in database:", db.list_collection_names())

Collections in database: ['student_handbook']
