In [1]:
from langchain_ollama import OllamaLLM

# LLM handle built with langchain_ollama's OllamaLLM. It is reused later in the
# notebook both inside the retrieval QA chain and for a direct, context-free call.
# NOTE(review): presumably a model named 'deepseekmini' must already exist in the
# local Ollama instance — confirm before running on a new machine.
model = OllamaLLM(model='deepseekmini')

In [2]:


# class ContrieverEmbeddings(Embeddings):
#     def __init__(self, model_name="facebook/contriever-msmarco", device="cpu"):
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModel.from_pretrained(model_name).to(device)
#         self.device = device

#     def mean_pooling(self, token_embeddings, mask):
#         # Zero out padded tokens
#         token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
#         # Mean pooling
#         sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
#         return sentence_embeddings

#     def embed_documents(self, texts):
#         return [self._embed(text) for text in texts]

#     def embed_query(self, text):
#         return self._embed(text)

#     def _embed(self, text):
#         inputs = self.tokenizer(
#             text, 
#             return_tensors="pt", 
#             truncation=True, 
#             padding=True
#         ).to(self.device)

#         with torch.no_grad():
#             outputs = self.model(**inputs)

#         embedding = self.mean_pooling(outputs[0], inputs["attention_mask"])
#         return embedding[0].cpu().numpy().tolist()


In [3]:
from langchain.embeddings.base import Embeddings
from transformers import AutoTokenizer, AutoModel
import torch

class MiniLM(Embeddings):
    """LangChain-compatible embedder backed by a Hugging Face transformer.

    Texts are tokenized, run through the model without gradient tracking, and
    reduced to one vector per text by attention-mask-weighted mean pooling of
    the token embeddings.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        device: Device the model and the tokenized inputs are moved to.
        batch_size: Maximum number of texts sent through the model in a single
            forward pass by ``embed_documents``. Values below 1 are treated
            as 1.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", device="cpu", batch_size=32):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(device)
        # The model is only ever used for inference here, so make that explicit.
        self.model.eval()
        self.device = device
        self.batch_size = max(1, int(batch_size))

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked tokens.

        Args:
            model_output: Model output whose first element holds the per-token
                embeddings (assumed shape (batch, seq, dim) — TODO confirm).
            attention_mask: Mask tensor with one entry per token; positions
                with 0 do not contribute to the average.

        Returns:
            A tensor with one pooled vector per sequence in the batch.
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # The clamp keeps the division finite if a mask row sums to zero.
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def embed_documents(self, texts):
        """Embed many texts, processing them in batches of ``batch_size``.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding (list of floats) per input text, in
            input order. An empty input yields an empty list.
        """
        texts = list(texts)
        vectors = []
        # One forward pass per batch instead of one per text; padded positions
        # are excluded from the average by the attention mask in mean_pooling.
        for start in range(0, len(texts), self.batch_size):
            vectors.extend(self._embed_batch(texts[start:start + self.batch_size]))
        return vectors

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        return self._embed(text)

    def _embed_batch(self, texts):
        """Tokenize ``texts``, run the model and return the pooled vectors.

        ``texts`` is handed to the tokenizer unchanged, so it may be a single
        string or a list of strings. The result is always a list of
        embeddings, one per tokenized sequence.
        """
        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            truncation=True,
            padding=True
        ).to(self.device)

        with torch.no_grad():
            model_output = self.model(**inputs)

        pooled = self.mean_pooling(model_output, inputs["attention_mask"])
        return pooled.cpu().numpy().tolist()

    def _embed(self, text):
        """Embed one text and return the first (only) pooled vector."""
        return self._embed_batch(text)[0]

  from scipy.sparse import csr_matrix, issparse


In [4]:
# Embedder shared by the Qdrant vector store and the similarity searches below.
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA instead of failing when the model
# is moved to the device.
embedding_fn = MiniLM(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [6]:
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from qdrant_client import QdrantClient

# Connect to a Qdrant server expected to be listening on localhost:6333.
client = QdrantClient(url="http://localhost:6333")

# One-time collection setup, kept commented out (VectorParams and Distance are
# not imported in this cell).
# NOTE(review): size=768 presumably matched the earlier Contriever embeddings;
# confirm the existing "wikipedia" collection's vector size matches the output
# dimension of the MiniLM embedder that is now in use.
# client.create_collection(
#     collection_name="wikipedia",
#     vectors_config=VectorParams(size=768, distance=Distance.COSINE),
# )

# Previous embedder, superseded by the MiniLM-based `embedding_fn` defined earlier.
# embedding_fn = ContrieverEmbeddings(device="cuda")  # or "cpu"

# Vector store over the "wikipedia" collection: dense retrieval, with
# `embedding_fn` supplying the embeddings.
qdrant = QdrantVectorStore(
    client=client,
    collection_name="wikipedia",
    embedding=embedding_fn,
    retrieval_mode=RetrievalMode.DENSE,
)

In [7]:
# from langchain.schema import Document
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# import json
# import os

# documents = []
# path = r'D:\Life\Academic\Skripsi\Code\data\wikipedia2corpus\data\enwiki-cleaned'
# last_processed_file_path = r'D:\Life\Academic\Skripsi\Code\data\wikipedia2corpus\last_processed.txt'

# splitter = RecursiveCharacterTextSplitter(
#     chunk_size=512,     # max characters per chunk
#     chunk_overlap=128    # repeated text between chunks for context
# )

# # Read last processed filename from txt
# if os.path.exists(last_processed_file_path):
#     with open(last_processed_file_path, "r") as f:
#         last_processed_filename = f.read().strip()
# else:
#     last_processed_filename = ""

# skip = True if last_processed_filename else False
# files_processed = 0
# max_files = 1
# last_filename_this_run = None

# if not os.path.exists(path):
#     print(f"Directory does not exist: {path}")
# else:
#     try:
#         for filename in sorted(os.listdir(path)):
#             if skip:
#                 if filename == last_processed_filename:
#                     skip = False
#                 continue  # Skip until after the last processed file

#             file_path = os.path.join(path, filename)
#             print(f"Processing file: {filename}")

#             with open(file_path, "r", encoding="utf-8") as f:
#                 for line in f:
#                     data = json.loads(line)
                    
#                     # Split the text into chunks
#                     chunks = splitter.split_text(data["text"])
                    
#                     # Each chunk becomes a separate document with same metadata
#                     for idx, chunk in enumerate(chunks):
#                         documents.append(Document(
#                             page_content=chunk,
#                             metadata={
#                                 "title": data["title"],
#                                 "chunk_id": idx  # optional for debugging
#                             }
#                         ))

#             files_processed += 1
#             last_filename_this_run = filename

#             if files_processed >= max_files:
#                 break

#         if documents:
#             qdrant.add_documents(documents=documents)
#             print(f"Processed {len(documents)} chunks successfully")

#             # Update last processed filename
#             if last_filename_this_run:
#                 with open(last_processed_file_path, "w") as f:
#                     f.write(last_filename_this_run)
#                 print(f"Updated last processed filename to: {last_filename_this_run}")
#         else:
#             print("No new files to process.")

#     except Exception as e:
#         print(f"Error occurred: {str(e)}")


In [8]:
# from langchain.schema import Document
# import json
# import os

# documents = []
# path = r'D:\Life\Academic\Skripsi\Code\wikipedia2corpus\data\enwiki-cleaned'
# last_processed_file_path = r'D:\Life\Academic\Skripsi\Code\wikipedia2corpus\last_processed.txt'

# # Read last processed filename from txt
# if os.path.exists(last_processed_file_path):
#     with open(last_processed_file_path, "r") as f:
#         last_processed_filename = f.read().strip()
# else:
#     last_processed_filename = ""

# skip = True if last_processed_filename else False
# files_processed = 0
# max_files = 3
# last_filename_this_run = None

# if not os.path.exists(path):
#     print(f"Directory does not exist: {path}")
# else:
#     try:
#         for filename in sorted(os.listdir(path)):
#             if skip:
#                 if filename == last_processed_filename:
#                     skip = False
#                 continue  # Skip this file (including the last processed one)
#             file_path = os.path.join(path, filename)
#             print(f"Processing file: {filename}")
#             with open(file_path, "r", encoding="utf-8") as f:
#                 for line in f:
#                     data = json.loads(line)
#                     documents.append(Document(
#                         page_content=data["text"], 
#                         metadata={"title": data["title"]}
#                     ))
#             files_processed += 1
#             last_filename_this_run = filename
#             if files_processed >= max_files:
#                 break
#         if documents:
#             qdrant.add_documents(documents=documents)
#             print(f"Processed {len(documents)} documents successfully")
#             # Update last processed filename
#             if last_filename_this_run:
#                 with open(last_processed_file_path, "w") as f:
#                     f.write(last_filename_this_run)
#                 print(f"Updated last processed filename to: {last_filename_this_run}")
#         else:
#             print("No new files to process.")
#     except Exception as e:
#         print(f"Error occurred: {str(e)}")

In [9]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Prompt text with two placeholders: {context} and {question}.
prompt_template = """Use the following pieces of context to answer the question. If you don't know the answer based on the context, just say you don't know.

Context: {context}

Question: {question}

Answer:"""

# Wrap the raw text in a PromptTemplate, declaring both placeholders explicitly.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Retrieval QA chain: the Qdrant store serves as retriever (k=5 per query),
# the custom prompt is forwarded to the underlying chain, and the source
# documents are returned together with the answer.
# NOTE(review): chain_type="stuff" presumably places all retrieved documents
# into the single {context} slot — confirm against the LangChain docs.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=qdrant.as_retriever(search_kwargs=dict(k=5)),
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=PROMPT),
    return_source_documents=True,
)

In [10]:
# Sample question sent through the retrieval QA chain. `query` is reused by
# the later cells that inspect retrieval and call the LLM directly.
query = "whats nga taonga sound?"
result = qa_chain.invoke({"query": query})

# Print only the generated answer; print() renders newlines in the answer as
# real line breaks instead of showing the escaped repr.
print(result['result'])

 Nga taonga refer to traditional instruments from various cultures including Māori in New Zealand, Fula people in West Africa, Gabonese culture, and so on. Each culture has its unique musical traditions with their respective traditional instruments. The context provided does not provide specific information about what nga taonga sound like or are called since it is broadened to a larger array of music genres such as ngoma from Tanzania, Taarab in Zanzibar, Kwaya choir music from Tanzania, and Gumbe folk styles from various countries including the sounds produced by traditional instruments used in these music forms. These musical traditions play an important role in conveying stories, religious customs and daily life routines which hold cultural significance for their respective communities.


In [11]:
# Inspect the retrieval step on its own: fetch the 5 most similar documents
# for the same query (the same k the QA chain's retriever uses) and print each
# one, including its metadata.
found_docs = qdrant.similarity_search(query, k=5)
for doc in found_docs:
    print(doc)

page_content='Traditional Māori instruments are taonga pūoro. They fulfilled various roles including storytelling, religious traditions and also daily functions such as the beginning of a new day. Taonga pūoro fall into two areas, melodic instruments such as the flute and rhythmic instruments such as poi "balls of dried flax on string that are swung and tapped".' metadata={'_id': 250294, '_collection_name': 'wikipedia'}
page_content='Ngoma "(Bantu, meaning dance, drum and event)" is a traditional dance music that has been the most widespread music in Tanzania. Dansi is urban jazz or band music. Taarab is sung Kiswahili poetry accompanied by a band, typically string, in which audience is often, but not always, encouraged to dance and clap. Kwaya is choir music originally limited to church during colonization, but now an secular part of education, social, and political events.' metadata={'_id': 103812, '_collection_name': 'wikipedia'}
page_content='The Fula have a rich musical culture an

In [12]:
# Send the same query straight to the LLM with no retrieved context —
# presumably a baseline to compare against the retrieval-augmented answer.
model.invoke(query)

' Nga taonga is a Māori term that means "treasures." The phrase "Nga taonga sound" refers to the diverse collection of sounds, music, and audio content produced by humans or natural phenomena. \n\nIn this context, "Nga taonga sound" could refer to various aspects such as:\n\n1. Music: Any form of auditory art that is appreciated for its beauty and expressive power can be considered a Nga taonga sound. It includes traditional Maori chants, modern pop songs, classical compositions, or any other kind of music.\n\n2. Nature Sounds: The natural sounds present on Earth are another type of Nga taonga sound. These include things like rain falling, wind blowing through trees, waves crashing against the shoreline, bird calls, insect buzzing and so on. They create a calming effect for many people who listen to them. \n\n3. Cultural Sounds: It could also refer to sounds from different cultures or ethnic groups which might be important symbols of those groups\' identity such as traditional stories 