In [19]:
import sys
import os
# Adjust this path to point to the root of your project
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../"))  # or wherever `src/` is
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from dotenv import load_dotenv
load_dotenv(dotenv_path="/home/mangabat/projects/portofolio/backend/services/service_cover_letter/.env")  # or adjust the relative path


from src.config.config_db_connections import PostgressConnection

PostgressConnection.initialize()

for item in PostgressConnection.Base.metadata.tables:
    print(item)
    

print(f"Adding {PROJECT_ROOT} to sys.path")


Adding /home/mangabat/projects/portofolio/backend/services/service_cover_letter/src to sys.path


In [20]:
import uuid
from io import BytesIO
from pathlib import Path
from typing import Dict, Any, Optional, Tuple, List

from dotenv import load_dotenv

# Load .env manually
def get_env_path() -> str:
    return os.path.join(PROJECT_ROOT, '.env')

env_path = get_env_path()
load_dotenv(dotenv_path=env_path)



# Sanity check
qhost = os.environ["QDRANT_HOST"] = "localhost"
qport = os.environ["QDRANT_PORT"] = "6333"
print("✅ QDRANT_HOST =", os.getenv("QDRANT_HOST"))
print("✅ QDRANT_PORT =", os.getenv("QDRANT_PORT"))
print(env_path)

✅ QDRANT_HOST = localhost
✅ QDRANT_PORT = 6333
/home/mangabat/projects/portofolio/backend/services/service_cover_letter/src/.env


In [21]:
from io import BytesIO

# Simulate the uploaded file manually
file_path: str = r"JBach_veovo.pdf"
with open(file_path, "rb") as f:
    byte_stream: BytesIO = BytesIO(f.read())

original_file_name: str = "JBach_veovo.pdf"
print("✅ Loaded file and converted to BytesIO.")


✅ Loaded file and converted to BytesIO.


In [22]:
from src.service_layer.text_extractor import FileTextExtractor

extracted_text: str = FileTextExtractor.extract_text(byte_stream, original_file_name)

print("🧾 Extracted Text Preview:\n" + extracted_text[:500] + "\n...")  # preview only


🧾 Extracted Text Preview:
Jannik M. B. Sørensen  Mail: Mangabat93@gmail.com   TLF-nr: +4526335065  
Er dansk talende og af D ansk herkomst.  
Flytter gerne så tæt på som muligt  Jannik Mangabat Bach Sørensen   Denmark • +4526335065 mangabat93@gmail.com • LinkedIn  
Qualifications Summary  
Diligent and analytical graduate with experience in IT projects, using problem -solving skills and structured 
approaches to improve project outcomes. Highly motivated to develop machine learning models and AI/ML solutions 
that drive 
...


In [23]:
import uuid

file_id: str = str(uuid.uuid4())
metadata: dict = {
    "file_name": "jb_veovo_test.pdf",
    "original_file_name": original_file_name,
    "bucket": "uploaded-cover-letters",  # your configured bucket
    "file_type": "application/pdf"
}
print(f"🆔 Using file_id: {file_id}")


🆔 Using file_id: 1d1431af-1482-43b1-9c1c-d7f5b255a706


In [None]:
from qdrant_client import QdrantClient, models
from qdrant_client.models import PointStruct, VectorParams, Distance, Filter, FieldCondition, MatchValue, ScoredPoint


class QdrantConnection:
    def __init__(self) -> None:
        # self.port: int = config.QDRANT_PORT
        self.default_collection: str = "embedded_cover_letters"

        self.client = QdrantClient(url= "http://localhost:6333")

        self._ensure_collection_exists()

    def _ensure_collection_exists(self) -> None:
        """
        Check if the default collection exists in Qdrant.
        If not, create it with appropriate vector configuration.
        """
        existing_collections = [c.name for c in self.client.get_collections().collections]

        if self.default_collection not in existing_collections:
            self.client.create_collection(
                collection_name=self.default_collection,
                vectors_config=models.VectorParams(
                    size=768,  # required for sentence-transformers
                    distance=models.Distance.COSINE
                )
            )
            print(f"✅ Created collection: {self.default_collection}")
        else:
            print(f"✅ Collection already exists: {self.default_collection}")




In [None]:
from sentence_transformers import SentenceTransformer
import logging

from datetime import datetime

class QdrantCoverLetterRepository:
    def __init__(self, connection: QdrantConnection) -> None:
        """
        Initialize Qdrant repository for cover letter embeddings.

        Args:
            connection (QdrantConnection): Shared client and config for Qdrant.
        """
        self.client: QdrantClient = connection.client
        self.collection_name: str = connection.default_collection

        # SentenceTransformer instance directly
        self.embedding_model: SentenceTransformer = SentenceTransformer(
            "sentence-transformers/all-mpnet-base-v2"
        )

    def upsert_file_embedding(self, file_id: str, text: str, metadata: Optional[Dict[str, Any]] = None) -> None:
        """
        Embed file content and upsert it into Qdrant.

        Args:
            file_id (str): Unique file identifier (UUID-based).
            text (str): Plain text content extracted from the file.
            metadata (Optional[Dict[str, Any]]): Additional traceable metadata.
        """
        logging.info(f"inside upsert_file_embedding, FILE: {__file__}")
        vector: List[float] = self.embedding_model.encode(text).tolist()
        print(f"✅ Vector length: {(vector)}\n")
        payload: Dict[str, Any] = {
            "file_id": file_id,
            "uuid": str(uuid.uuid4()),
            "type": "cover_letter",
            "timestamp": datetime.now().isoformat(),
        }
        print(f"✅ Payload: {payload}\n")
        if metadata:
            payload.update(metadata)

        # Ensure collection exists
        if self.collection_name not in [c.name for c in self.client.get_collections().collections]:
            self.client.recreate_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=len(vector), distance=Distance.COSINE),
            )

        point = PointStruct(id=file_id, vector=vector, payload=payload)
        print(f"✅ Point: {point}\n")
        self.client.upsert(collection_name=self.collection_name, points=[point])
        logging.info(f"Upserted point with ID {file_id} into Qdrant collection {self.collection_name }")


    def search_similar_documents(self, query: str, k: int = 3, threshold: float = 0.6) -> Optional[Tuple[Dict[str, Any], float]]:
        """
        Perform a semantic search in Qdrant for documents similar to the query.

        Args:
            query (str): The input text to search for.
            k (int): Number of top results to retrieve.
            threshold (float): Minimum similarity score threshold.

        Returns:
            Optional[Tuple[Dict[str, Any], float]]: Best match payload and similarity score.
        """
        vector: List[float] = self.embedding_model.encode(query).tolist()

        search_result: List[ScoredPoint] = self.client.search(
            collection_name=self.collection_name,
            query_vector=vector,
            limit=k,
            score_threshold=threshold
        )

        if not search_result:
            return None

        best_match = search_result[0]
        return best_match.payload, best_match.score

    def delete_vector_by_file_id(self, file_id: str) -> None:
        """
        Delete all vectors in Qdrant associated with a given file_id.

        Args:
            file_id (str): The file identifier whose vectors should be removed.
        """
        self.client.delete(
            collection_name=self.collection_name,
            points_selector=Filter(
                must=[
                    FieldCondition(key="file_id", match=MatchValue(value=file_id))
                ]
            )
        )

In [None]:

# Setup Qdrant connection and repo
qdrant_connection = QdrantConnection()
qdrant_repo = QdrantCoverLetterRepository(connection=qdrant_connection)

# Run the upsert (this encodes text and sends to Qdrant)
qdrant_repo.upsert_file_embedding(
    file_id=file_id,
    text=extracted_text,
    metadata=metadata
)
print("✅ Upserted vector to Qdrant successfully.")



✅ Collection already exists: embedded_cover_letters
✅ Vector length: [0.017810482531785965, 0.05787220597267151, -0.052390340715646744, -0.012726382352411747, -0.028267525136470795, -0.02676377259194851, 0.032146669924259186, -0.027865955606102943, -0.03273410722613335, -0.013200273737311363, 0.0852893516421318, 0.07046909630298615, 0.004385209642350674, 0.0981484055519104, 0.035450901836156845, -0.04032798483967781, 0.02198118157684803, 0.01453424897044897, -0.04204535111784935, -0.0020605067256838083, -0.03198806196451187, 0.013844764791429043, -0.01066471729427576, 0.05257558822631836, -0.012561258859932423, -0.0037653979379683733, -0.035562366247177124, 0.00832306407392025, 0.04165938124060631, -0.013842319138348103, 0.030187217518687248, -0.05475716292858124, 0.0002809555735439062, 0.04492047801613808, 2.380364549026126e-06, -0.02421410195529461, -0.041005708277225494, 0.004060534294694662, -0.009884512983262539, 0.030891111120581627, 0.042581651359796524, 0.003419764107093215, -0

In [24]:
from src.service_layer.file_service import FileService
from src.data_models.minio_models import FileItem
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File
from src.config.config_db_connections import MiniOConnection, QdrantConnection
from src.data_repositories.miniO_repository.CRUD_minio import MinioRepository
from src.data_repositories.qdrant_repository.CRUD_qdrant import QdrantCoverLetterRepository

router = APIRouter()

def get_file_service() -> FileService:
    minio_connection = MiniOConnection.get_minio_connection()
    repository = MinioRepository(minio_connection)
    return FileService(repository)

async def upload_files(
    files: List[UploadFile] = File(...),
    file_service: FileService = Depends(get_file_service)
) -> List[FileItem]:
    """
    Uploads files, saves them to MinIO, extracts text, embeds them in Qdrant (no Kafka).
    """
    try:
        logging.info(f"{__file__} | 📥 Received files: {[file.filename for file in files]}")
        # await ensure, that the files will not be uploaded before other operations are made
        minio_responses: List[FileItem] = await file_service.process_files(files)
        
        qdrant_connection = QdrantConnection()
        qdrant_repo = QdrantCoverLetterRepository(qdrant_connection)

        # If i read this correctly, what is the point of doing zip here? 
        for file_item, raw_file in zip(minio_responses, files):
            logging.info(f"{__file__} | 🔍 Starting embedding for file_id={file_item.file_id}")

            await raw_file.seek(0)  # rewind to read again
            byte_stream: BytesIO = BytesIO(await raw_file.read())

            extracted_text = FileTextExtractor.extract_text(byte_stream, file_item.original_file_name)

            if not extracted_text:
                logging.warning(f"{__file__} | ⚠️ No text extracted for {file_item.original_file_name}")
                continue

            qdrant_repo.upsert_file_embedding(
                file_id=file_item.file_id,
                text=extracted_text,
                metadata={
                    "file_name": file_item.file_name,
                    "original_file_name": file_item.original_file_name,
                    "bucket": file_item.bucket,
                    "file_type": file_item.file_type
                }
            )
            logging.info(f"{__file__} | ✅ Embedded file_id={file_item.file_id}")

        return minio_responses

    except Exception as e:
        logging.error(f"{__file__} | ❌ Error in upload_files: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="File upload and embedding failed.")


  from .autonotebook import tqdm as notebook_tqdm
