Merged

20 commits
0082e26
Added BERT embedding model and changed from Gemini to BERT
rishikamtamneu Oct 23, 2025
3323f63
Merge branch 'main' of https://github.com/GenerateNU/cortex-etl-sourc…
rishikamtamneu Oct 24, 2025
5fc3298
Merge branch 'main' of https://github.com/GenerateNU/cortex-etl-sourc…
rishikamtamneu Oct 26, 2025
42b0302
Ignore .DS_Store files
rishikamtamneu Oct 26, 2025
d0716d3
HDB Scan and LLM logic testing implemented
rishikamtamneu Oct 26, 2025
e1f1552
Create classification logic implemented for 3 or more files in a cluster
rishikamtamneu Oct 27, 2025
4407e41
Create Classifications finished
rishikamtamneu Oct 27, 2025
b70c61d
Finished creating classifications and classifying files
rishikamtamneu Oct 28, 2025
8bea2ee
Combined the two cluster loops into a single loop that does both naam…
eitan-berenfeld Nov 3, 2025
37ddde8
Removed the length limit on the text embedding. Removed an unused dic…
eitan-berenfeld Nov 3, 2025
21e41ad
Addressing some of the Classification Test pull request
eitan-berenfeld Nov 7, 2025
cc61bd7
More PR changes
eitan-berenfeld Nov 7, 2025
0e4f874
Supabase updates
eitan-berenfeld Nov 10, 2025
7c5083c
Merge branch 'main' into classification_test
Nov 10, 2025
c6baddb
fix classification route
Nov 11, 2025
d6eeb26
wip debugging frontend
bamarler Oct 29, 2025
503a768
Merge branch 'main' into classification_test
bamarler Nov 11, 2025
9155b77
backend linting
bamarler Nov 11, 2025
8dd0bd1
fixed buggy auth again ig?
bamarler Nov 11, 2025
a9292fa
added eslint disable for useeffect dependency array
bamarler Nov 11, 2025
Binary file added .DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion .gitignore
@@ -32,4 +32,4 @@ docker-compose.override.yml

# Supabase
.env.supabase.local
kong.yml
kong.yml
22 changes: 21 additions & 1 deletion backend/Dockerfile.dev
@@ -1,7 +1,27 @@
FROM python:3.13.9-slim

WORKDIR /app

# Install system dependencies needed to build hdbscan, scipy, and support cosine distance
RUN apt-get update && apt-get install -y \
build-essential \
python3-dev \
gfortran \
libopenblas-dev \
liblapack-dev \
&& rm -rf /var/lib/apt/lists/*

# Upgrade pip
RUN pip install --upgrade pip

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the code
COPY . .

EXPOSE 8000

# Run the app with reload enabled
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
3 changes: 3 additions & 0 deletions backend/app/core/litellm.py
@@ -25,6 +25,9 @@ class EmbeddingModelType(Enum):
OPENAI_SMALL = "text-embedding-3-small" # 1536 dimensions
OPENAI_LARGE = "text-embedding-3-large" # 3072 dimensions

# BERT models
# MODERNBERT_EMBED_BASE = "modernbert-embed-base" # 768 dimensions


class LLMClient:
"""Simplified LLM client for agentic workflows."""
2 changes: 1 addition & 1 deletion backend/app/routes/classification_routes.py
@@ -91,7 +91,7 @@ async def create_classifications(
)

classification_names: list[str] = await create_classifications_helper(
tenant_id,
extracted_files,
[classification.name for classification in initial_classifications],
)

78 changes: 75 additions & 3 deletions backend/app/utils/classification/classify_files.py
@@ -1,3 +1,6 @@
import numpy as np

from app.core.litellm import LLMClient
from app.schemas.classification_schemas import Classification, ExtractedFile


@@ -6,8 +9,77 @@ async def classify_files(
classifications: list[Classification],
) -> list[ExtractedFile]:
"""
Anlyzes all of the extracted files and the classifications
to classify each file, returing the classified extracted files
Classifies extracted files by comparing their embeddings to
classification name embeddings.

Each file is assigned the classification with the highest cosine similarity.
"""
# TODO: Implement the logic that analyzes each file and assigns classifications, returning the updated list

if not extracted_files or not classifications:
return extracted_files

client = LLMClient()
classification_embeddings = {}

# Generate embeddings for each classification name
print(f"Generating embeddings for {len(classifications)} classifications...")
for classification in classifications:
try:
emb = await client.embed(classification.name)
classification_embeddings[classification.classification_id] = emb
except Exception as e:
print(f"Error generating embedding for '{classification.name}': {e}")
continue

if not classification_embeddings:
print("No classification embeddings generated, returning files unchanged")
return extracted_files

# Assign best matching classification to each file
print(f"Classifying {len(extracted_files)} files using embedding similarity...")
for file in extracted_files:
if file.embedding is None or len(file.embedding) == 0:
print(f"File {file.name} has no embedding, skipping")
continue

best_similarity = -1
best_classification = None

for class_id, class_emb in classification_embeddings.items():
sim = _cosine_similarity(file.embedding, class_emb)
if sim > best_similarity:
best_similarity = sim
best_classification = next(
(c for c in classifications if c.classification_id == class_id),
None,
)

if best_classification:
file.classification = best_classification
print(
f" {file.name} → {best_classification.name} (similarity: {best_similarity:.3f})"
)

return extracted_files


def _cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
"""
Calculate cosine similarity between two vectors.

Returns a value between -1 and 1:
- 1 means vectors point in same direction (very similar)
- 0 means vectors are orthogonal (unrelated)
- -1 means vectors point in opposite directions (very different)
"""
v1 = np.array(vec1)
v2 = np.array(vec2)

dot_product = np.dot(v1, v2)
norm1 = np.linalg.norm(v1)
norm2 = np.linalg.norm(v2)

if norm1 == 0 or norm2 == 0:
return 0.0

return float(dot_product / (norm1 * norm2))
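The loop above scores each file against every classification one pair at a time. For larger batches the same highest-cosine-similarity assignment can be vectorized; the helper below is an illustrative sketch whose name and array handling are assumptions, not code from this PR:

import numpy as np

def assign_by_cosine(file_embs: list[list[float]], class_embs: list[list[float]]) -> list[int]:
    """For each file embedding, return the index of the most similar classification embedding."""
    files = np.asarray(file_embs, dtype=float)      # shape (n_files, dim)
    classes = np.asarray(class_embs, dtype=float)   # shape (n_classes, dim)
    files /= np.clip(np.linalg.norm(files, axis=1, keepdims=True), 1e-12, None)
    classes /= np.clip(np.linalg.norm(classes, axis=1, keepdims=True), 1e-12, None)
    sims = files @ classes.T                        # cosine similarities, (n_files, n_classes)
    return sims.argmax(axis=1).tolist()

Because every row is unit-normalized first, the matrix product gives exactly the cosine similarity that _cosine_similarity computes pairwise above.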
137 changes: 136 additions & 1 deletion backend/app/utils/classification/create_classifications.py
@@ -1,3 +1,8 @@
import hdbscan
import numpy as np
from sklearn.preprocessing import normalize

from app.core.litellm import LLMClient
from app.schemas.classification_schemas import ExtractedFile


@@ -10,4 +15,134 @@ async def create_classifications(
to iteratively set new classifications returning the final result
"""
# TODO: Implement the logic that creates/edits classifications from the extracted files.
return ["Request For Quote", "Product Specification", "Purchase Order", "Quote"]

embeddings = []
valid_files = []

for file in extracted_files:
if file.embedding is not None and len(file.embedding) > 0:
embeddings.append(file.embedding)
valid_files.append(file)

if len(embeddings) < 3:
print(
f"Not enough files for clustering ({len(embeddings)}), returning initial classifications"
)
return initialClassifications

embeddings_array = np.array(embeddings)

# L2-normalize embeddings so euclidean distance is monotonic in cosine distance
normalized_embeddings = normalize(embeddings_array)

clusterer = hdbscan.HDBSCAN(
min_cluster_size=2,
min_samples=1,
metric="euclidean",
cluster_selection_method="eom",
)

cluster_labels = clusterer.fit_predict(normalized_embeddings)
# HDBSCAN marks outliers with -1
_outlier_indices = np.where(cluster_labels == -1)[0]

clusters = {}

for i, label in enumerate(cluster_labels):
if label not in clusters:
clusters[label] = []
clusters[label].append(valid_files[i])

outliers = clusters.pop(-1, []) # Remove -1 cluster if it exists
print(f"Found {len(clusters)} clusters, {len(outliers)} outliers")

client = LLMClient()
classification_names = []

for cluster_id, files_in_cluster in clusters.items():
print(f"Analyzing cluster {cluster_id} with {len(files_in_cluster)} files...")

# Get sample documents from cluster (up to 5 for context)
sample_texts = []
for file in files_in_cluster[:5]:
text = _extract_text_from_file(file)
sample_texts.append(text[:500]) # Limit text length

# Use LLM to name the cluster
prompt = f"""Analyze these similar documents and provide a single, concise classification name.

Sample documents from this cluster:

{chr(10).join(f"Document {i + 1}: {text}" for i, text in enumerate(sample_texts))}

What type of documents are these? Respond with ONLY the category name.
Do not include any explanation or punctuation."""

try:
response = await client.chat(prompt, temperature=0.3, max_tokens=50)
category_name = response.choices[0].message.content.strip()
if not category_name:
category_name = f"Document Type {cluster_id}"
except Exception as e:
print(f" → Error generating name: {e}")
category_name = f"Document Type {cluster_id}"

classification_names.append(category_name)
print(f" → Named: {category_name}")

# Handle outliers individually
for i, file in enumerate(outliers):
print(f"Analyzing outlier {i} (single file)...")
text = _extract_text_from_file(file)
prompt = f"""Analyze this document and provide a concise classification name.

Document:

{text}

Respond with ONLY the category name."""

try:
response = await client.chat(prompt, temperature=0.3, max_tokens=50)
category_name = response.choices[0].message.content.strip()
if category_name:
classification_names.append(category_name)
print(f" → Outlier named: {category_name}")
else:
fallback_name = f"Document Type Outlier {i}"
classification_names.append(fallback_name)
print(f" → Outlier named: {fallback_name}")
except Exception as e:
print(f" → Error naming outlier: {e}")
fallback_name = f"Document Type Outlier {i}"
classification_names.append(fallback_name)
print(f" → Outlier named: {fallback_name}")

all_classifications = classification_names + initialClassifications
final_classifications = list(set(all_classifications))

print(f"Final classifications: {final_classifications}")
return final_classifications


def _extract_text_from_file(file: ExtractedFile) -> str:
"""Convert extracted file to text representation for analysis."""
parts = []

# Add filename
if file.name:
parts.append(f"Filename: {file.name}")

# Add extracted content
if isinstance(file.extracted_data, dict):
for key, value in file.extracted_data.items():
if isinstance(value, dict | list):
continue
parts.append(f"{key}: {value}")
elif isinstance(file.extracted_data, list):
parts.append(
f"Items: {', '.join(str(item) for item in file.extracted_data[:5])}"
)
else:
parts.append(str(file.extracted_data))

return " ".join(parts)
3 changes: 3 additions & 0 deletions backend/requirements.txt
@@ -24,6 +24,9 @@ umap-learn==0.5.7
scikit-learn==1.5.2
numpy==2.1.3

# Classification
hdbscan>=0.8.33

# docling==2.55.1
# docling-core==2.48.4
# docling-ibm-models==3.9.1
3 changes: 3 additions & 0 deletions frontend/src/config/supabase.config.ts
@@ -4,6 +4,9 @@ import type { Database } from '../types/database.types'
const supabaseUrl = import.meta.env.VITE_SUPABASE_URL
const supabaseAnonKey = import.meta.env.VITE_SUPABASE_ANON_KEY

console.log('supabaseURL:', supabaseUrl)
console.log('supabaseAnonKey:', supabaseAnonKey)

if (!supabaseUrl || !supabaseAnonKey) {
throw new Error('Missing Supabase environment variables')
}
1 change: 1 addition & 0 deletions frontend/src/contexts/AuthContext.tsx
@@ -178,6 +178,7 @@ export function AuthProvider({ children }: { children: React.ReactNode }) {
subscription.unsubscribe()
}
}
/* eslint-disable-next-line react-hooks/exhaustive-deps */
}, [])

const value: AuthContextType = {
38 changes: 1 addition & 37 deletions frontend/src/types/database.types.ts
@@ -283,7 +283,7 @@ export type Database = {
}
l2_normalize: {
Args: { '': string } | { '': unknown } | { '': unknown }
Returns: unknown
Returns: string
}
match_documents: {
Args: {
@@ -299,46 +299,10 @@
source_file_id: string
}[]
}
sparsevec_out: {
Args: { '': unknown }
Returns: unknown
}
sparsevec_send: {
Args: { '': unknown }
Returns: string
}
sparsevec_typmod_in: {
Args: { '': unknown[] }
Returns: number
}
update_webhook_config: {
Args: { secret: string; url: string }
Returns: undefined
}
vector_avg: {
Args: { '': number[] }
Returns: string
}
vector_dims: {
Args: { '': string } | { '': unknown }
Returns: number
}
vector_norm: {
Args: { '': string }
Returns: number
}
vector_out: {
Args: { '': string }
Returns: unknown
}
vector_send: {
Args: { '': string }
Returns: string
}
vector_typmod_in: {
Args: { '': unknown[] }
Returns: number
}
}
Enums: {
extraction_status: 'queued' | 'processing' | 'completed' | 'failed'
8 changes: 4 additions & 4 deletions package-lock.json

Some generated files are not rendered by default.