In [None]:
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import json
from sqlalchemy import create_engine
from sqlalchemy import text
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from InstructorEmbedding import INSTRUCTOR
import uuid
import numpy as np  
from sklearn.decomposition import PCA
import umap
import plotly.figure_factory as ff



In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
import os

# Load key=value pairs from a local .env file (if present) into os.environ.
load_dotenv()

# Connection settings. Each one can be overridden through the environment
# (or .env); the fallbacks are the original local-development values, so the
# notebook behaves as before when nothing is set.
# NOTE(review): the fallback DB_URL still embeds a password. Move it to .env
# and drop the default before sharing or committing this notebook.
DB_URL = os.getenv("DB_URL", "postgresql://postgres:pgAdmin@localhost:5432/Dump")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "math_embeddings")
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
QDRANT_PORT = int(os.getenv("QDRANT_PORT", "6333"))

In [None]:
# Shared handles used by the cells below: SQL engine, embedding model and
# Qdrant client. (A previous setup, formerly kept here as commented-out code,
# loaded the "BAAI/bge-m3" model instead.)
# NOTE(review): INSTRUCTOR is imported at the top of the notebook but the
# instructor-xl checkpoint is loaded through SentenceTransformer - confirm
# this is intended.
engine = create_engine(DB_URL)
model = SentenceTransformer("hkunlp/instructor-xl")
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)


In [None]:
import warnings

# Silence every DeprecationWarning for the rest of the session.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)


In [None]:
def get_all_metadata():
    """Collect the payload of every point stored in ``COLLECTION_NAME``.

    Pages through the collection with ``client.scroll`` (100 points per page,
    vectors not requested) until no next-page offset is returned.

    Returns:
        list: payloads in scroll order; points whose payload is empty or
        missing are left out.
    """
    payloads = []
    next_offset = None
    has_more = True

    while has_more:
        page, next_offset = client.scroll(
            collection_name=COLLECTION_NAME,
            with_vectors=False,
            with_payload=True,
            offset=next_offset,
            limit=100,
        )
        payloads.extend(pt.payload for pt in page if pt.payload)
        # A missing offset means the page just read was the last one.
        has_more = next_offset is not None

    return payloads


In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os


# Ordered metadata levels. get_parent_keys() treats every entry listed before
# a given level as one of that level's parents.
HIERARCHY_ORDER = ["subject", "domain", "skill", "subskill", "difficulty"]


def get_parent_keys(label_key):
    """Return the levels that precede ``label_key`` in ``HIERARCHY_ORDER``.

    Raises:
        ValueError: if ``label_key`` is not one of the known levels.
    """
    position = HIERARCHY_ORDER.index(label_key)
    parents = HIERARCHY_ORDER[:position]
    return parents

def get_unique_metadata_groups(metadata_list, group_keys):
    """Return the distinct combinations of ``group_keys`` values in the payloads.

    Args:
        metadata_list: iterable of payload dicts.
        group_keys: payload keys whose values together identify a group.

    Returns:
        list[tuple]: one tuple per distinct combination, in first-seen order.
        A plain ``set`` of string tuples iterates in an order that depends on
        per-process hash randomization, which made the order in which groups
        are processed differ between runs. Payloads that lack any of
        ``group_keys`` cannot be assigned to a group and are skipped instead
        of raising ``KeyError``.
    """
    # dict keys give de-duplication while remembering insertion order.
    seen = {}
    for meta in metadata_list:
        if any(k not in meta for k in group_keys):
            continue
        seen.setdefault(tuple(meta[k] for k in group_keys), None)
    return list(seen)

def filter_metadata_by_group(metadata_list, group_keys, group_values):
    """Keep the payloads whose ``group_keys`` values equal ``group_values``.

    Keys and values are paired positionally; a payload is kept only when every
    pair matches. Input order is preserved.
    """
    wanted = list(zip(group_keys, group_values))
    selected = []
    for payload in metadata_list:
        for key, expected in wanted:
            if not (payload[key] == expected):
                break
        else:
            # No pair mismatched, so the payload belongs to the group.
            selected.append(payload)
    return selected

def get_vectors_for_label_in_group(full_filter_query: str, label_key: str, label_value: str, top_k: int = 50):
    """Run a semantic search for one label, optionally scoped by a group query.

    The search text is ``"<label_key>:<label_value>"``, prefixed with
    ``"<full_filter_query> AND "`` when a group query is given. The result is
    whatever ``semantic_search_and_get_vector`` returns for that text.
    """
    label_clause = f"{label_key}:{label_value}"
    if full_filter_query:
        query = f"{full_filter_query} AND {label_clause}"
    else:
        query = label_clause
    return semantic_search_and_get_vector(query, top_k)
        

In [None]:
import pandas as pd

def get_vectors_and_metadata_as_dataframe(parent_keys, parent_values):
    """Fetch every point matching the given payload values as a DataFrame.

    Scrolls ``COLLECTION_NAME`` in pages of 100 with a ``must`` filter that
    requires ``payload[key] == value`` for each (key, value) pair.

    Args:
        parent_keys: payload keys to filter on.
        parent_values: required values, positionally paired with ``parent_keys``.

    Returns:
        pandas.DataFrame: one row per point that has both a vector and a
        payload, with a ``vector`` column followed by one column per payload
        key. When nothing matches, an empty frame with only the ``vector``
        column is returned (same shape ``semantic_search_and_get_vector`` uses
        for "no hits").

    Raises:
        AssertionError: if the two argument sequences differ in length.
    """
    assert len(parent_keys) == len(parent_values), "Keys and values must be of same length"

    filter_conditions = [
        {"key": k, "match": {"value": v}} for k, v in zip(parent_keys, parent_values)
    ]

    records = []
    scroll_offset = None

    while True:
        points, scroll_offset = client.scroll(
            collection_name=COLLECTION_NAME,
            with_vectors=True,
            with_payload=True,
            offset=scroll_offset,
            limit=100,
            scroll_filter={"must": filter_conditions},
        )

        for point in points:
            if point.vector is not None and point.payload is not None:
                # Keyed records let pandas align columns by name. The earlier
                # positional rows took their header from the first payload, so
                # payloads with another key order or key set ended up under
                # the wrong column (or raised on a length mismatch).
                records.append({"vector": point.vector, **point.payload})

        if scroll_offset is None:
            break

    if not records:
        # Previously this case crashed with IndexError on vector_data[0].
        return pd.DataFrame(columns=["vector"])

    return pd.DataFrame(records)


In [None]:
import pandas as pd

def semantic_search_and_get_vector(user_query, top_k):
    """Embed ``user_query`` and return the matching Qdrant points as a DataFrame.

    The query text is encoded with the module-level ``model`` and passed to
    ``client.search`` on ``COLLECTION_NAME`` with ``limit=top_k``.

    Args:
        user_query: text to embed and search for.
        top_k: maximum number of hits requested.

    Returns:
        pandas.DataFrame: one row per hit that has both a vector and a
        payload, in the order returned by the search, with a ``vector`` column
        followed by one column per payload key. An empty frame with only the
        ``vector`` column is returned when there are no usable hits.
    """
    query_vector = model.encode(user_query).tolist()

    hits = client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        with_vectors=True,
        with_payload=True,
        limit=top_k,
    )

    # Keyed records keep every payload value under its own column name even
    # when hits differ in key order or key set; positional rows labelled with
    # the first hit's keys did not.
    records = [
        {"vector": hit.vector, **hit.payload}
        for hit in hits
        if hit.vector is not None and hit.payload is not None
    ]

    if not records:
        return pd.DataFrame(columns=["vector"])

    return pd.DataFrame(records)


In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
import os

def semantic_cluster_and_plot_by_label(df, label_key, group, parent_keys):
    """Run one semantic search per label of a group and plot the hits in 2D.

    For every distinct value of ``df[label_key]`` the text
    ``"PID-<group values joined by '-'>-<label>"`` is sent to
    ``semantic_search_and_get_vector``, asking for as many hits as that label
    has rows in ``df``. The hits are tagged with the query text in a
    ``semantic_label`` column, projected to 2D with UMAP, and written to
    ``semantic-graph-analysis/<group>.csv`` and ``.html``.

    Args:
        df: rows of the group; only the ``label_key`` column is read.
        label_key: column whose distinct values drive the queries.
        group: parent values identifying the group (paired with ``parent_keys``).
        parent_keys: payload keys the ``group`` values belong to.

    Returns:
        str | None: an embeddable Plotly HTML fragment, or ``None`` when there
        is nothing to plot (label column missing, no hits, no ``vector``
        column, or fewer than 3 hits).
    """
    if label_key not in df.columns:
        print(f"Skipping: '{label_key}' column missing.")
        return None

    # str() so non-string payload values do not make join() raise TypeError.
    group_prefix = "PID-" + "-".join(str(value) for value in group)

    per_label_frames = []
    for each_label in df[label_key].unique():
        label_row_count = int((df[label_key] == each_label).sum())
        if label_row_count == 0:
            # e.g. a NaN label: it never equals itself, so there is no row
            # count to request and a zero-limit search would be pointless.
            continue

        query_text = f"{group_prefix}-{each_label}"
        semantic_df = semantic_search_and_get_vector(
            user_query=query_text,
            top_k=label_row_count
        )
        semantic_df['semantic_label'] = query_text
        per_label_frames.append(semantic_df)

    if not per_label_frames:
        print("No semantic data collected.")
        return None

    full_df = pd.concat(per_label_frames, ignore_index=True)

    if 'vector' not in full_df.columns:
        print("Skipping: 'vector' column missing.")
        return None

    vector_array = np.array(full_df['vector'].tolist())

    # UMAP rejects n_neighbors < 2, so at least 3 points are required.
    n_neighbors = min(15, len(vector_array) - 1)
    if n_neighbors < 2:
        print(f"Skipping: need at least 3 vectors for UMAP, got {len(vector_array)}.")
        return None

    # NOTE(review): UMAP is not seeded here, so the 2D layout differs between
    # runs - pass random_state if reproducible plots are required.
    reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=0.1, spread=1.0)
    reduced = reducer.fit_transform(vector_array)

    full_df['x'] = reduced[:, 0]
    full_df['y'] = reduced[:, 1]
    full_df.insert(0, 'S.No', range(1, len(full_df) + 1))

    group_name = "__".join([f"{k}-{v}" for k, v in zip(parent_keys, group)])
    os.makedirs("semantic-graph-analysis", exist_ok=True)

    csv_path = f"semantic-graph-analysis/{group_name}.csv"
    html_path = f"semantic-graph-analysis/{group_name}.html"

    full_df.to_csv(csv_path, index=False)

    fig = px.scatter(
        full_df, x='x', y='y',
        color='semantic_label',
        hover_data=['x', 'y'] + [col for col in full_df.columns if col not in ['x', 'y', 'vector']],
        title=f"Semantic space for group: {group_name.replace('__', ' | ')}"
    )
    fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color='DarkSlateGrey')))
    fig.update_layout(
        width=900, height=700,
        title_x=0.5,
        legend_title_text='semantic_label',
    )
    fig.write_html(html_path)

    # Fragment (no <html> wrapper) so the caller can embed it in a report.
    return fig.to_html(full_html=False, include_plotlyjs='cdn')

    




In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from kneed import KneeLocator
import plotly.graph_objs as go

def kmeans_clustering_with_elbow_and_plot(df, group_name, label_key):
    """Cluster a group's vectors with KMeans, choosing K by the elbow method.

    KMeans is fitted for every K from 2 up to ``min(14, n_samples - 1)`` and
    ``KneeLocator`` picks the elbow of the inertia curve (K=2 when no elbow is
    found). The final cluster ids are stored in ``df['cluster']`` and two
    files are written to ``cluster-graph-analysis/``: ``<group_name>_kmeans.html``
    (elbow plot + scatter) and ``<group_name>_kmeans.csv``.

    Args:
        df: frame with a ``vector`` column and the 2D ``x``/``y`` coordinates
            used by the scatter plot. A ``cluster`` column is added in place.
        group_name: used in the plot title and in the output file names.
        label_key: not used by this function.

    Returns:
        tuple: ``(html_fragment, csv_path)``. ``(None, None)`` when there is
        nothing to cluster, so callers that unpack two values keep working
        (the skip path used to return a bare ``None``).
    """
    if 'vector' not in df.columns:
        print("Skipping: 'vector' column missing.")
        return None, None

    vectors = np.array(df['vector'].tolist())
    n_samples = len(vectors)
    if n_samples == 0:
        print("Skipping: no vectors to cluster.")
        return None, None

    inertia = []
    k_values = list(range(2, min(15, n_samples)))

    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(vectors)
        inertia.append(kmeans.inertia_)

    if len(k_values) >= 3:
        knee = KneeLocator(k_values, inertia, curve='convex', direction='decreasing')
        optimal_k = int(knee.knee) if knee.knee else 2
    else:
        # Fewer than three sweep points cannot show an elbow; use the same
        # default of 2, capped so KMeans never gets more clusters than samples.
        optimal_k = min(2, n_samples)

    print(f"Optimal number of clusters (K): {optimal_k}")

    final_kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
    df['cluster'] = final_kmeans.fit_predict(vectors)

    elbow_fig = go.Figure()
    elbow_fig.add_trace(go.Scatter(x=k_values, y=inertia, mode='lines+markers', name='Inertia'))
    elbow_fig.add_vline(x=optimal_k, line_dash='dash', line_color='green', annotation_text=f"K={optimal_k}", annotation_position="top right")
    elbow_fig.update_layout(
        title="Elbow Method For Optimal K",
        xaxis_title="Number of Clusters",
        yaxis_title="Inertia",
        width=600, height=400,
    )

    scatter_fig = px.scatter(
        df, x='x', y='y',
        color=df['cluster'].astype(str),
        hover_data=['x', 'y'] + [col for col in df.columns if col not in ['x', 'y', 'vector']],
        title=f"KMeans Clustering for Group: {group_name}"
    )

    scatter_fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color='DarkSlateGrey')))
    scatter_fig.update_layout(width=900, height=700, title_x=0.5,legend_title="Cluster", coloraxis_showscale=False )

    # Same folder handling as kmeans_clustering_using_label_count.
    os.makedirs("cluster-graph-analysis", exist_ok=True)

    combined_html_path = f"cluster-graph-analysis/{group_name}_kmeans.html"
    # Explicit UTF-8: hover text may hold non-ASCII characters that the
    # platform default encoding cannot represent.
    with open(combined_html_path, 'w', encoding='utf-8') as f:
        # Only the first fragment pulls in plotly.js; the second reuses it.
        f.write(elbow_fig.to_html(full_html=False, include_plotlyjs='cdn'))
        f.write(scatter_fig.to_html(full_html=False, include_plotlyjs=False))

    updated_csv_path = f"cluster-graph-analysis/{group_name}_kmeans.csv"
    df.to_csv(updated_csv_path, index=False)

    # The returned fragments carry no plotly.js; the page embedding them has
    # to load it.
    combined_html = (
        elbow_fig.to_html(full_html=False, include_plotlyjs=False)
        + scatter_fig.to_html(full_html=False, include_plotlyjs=False)
    )
    return combined_html, updated_csv_path

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import os
import umap

def kmeans_clustering_using_label_count(df, group_name, label_key, isSemanticSearch=False):
    """Cluster a group's vectors with K set to the number of distinct labels.

    KMeans is run with ``k = number of non-null unique values in
    df[label_key]``. Cluster ids are stored in ``df['cluster']`` and a scatter
    plot over the existing ``x``/``y`` columns is written together with a CSV
    to ``semantic-cluster-graph-analysis/`` (when ``isSemanticSearch``) or
    ``cluster-graph-analysis/``.

    Args:
        df: frame with ``vector``, ``x``, ``y`` and ``label_key`` columns.
            A ``cluster`` column is added in place.
        group_name: used in the output file names.
        label_key: column whose distinct values determine K.
        isSemanticSearch: selects the output folder.

    Returns:
        tuple: ``(html_fragment, csv_path)``. ``(None, None)`` when there is
        nothing to cluster, so callers that unpack two values keep working
        (the skip path used to return a bare ``None``).
    """
    if 'vector' not in df.columns:
        print("Skipping: 'vector' column missing.")
        return None, None

    vectors = np.array(df['vector'].tolist())

    # K = number of distinct non-null labels.
    unique_labels = df[label_key].dropna().unique()
    k = len(unique_labels)
    print(f"Number of unique labels in '{label_key}' column: {k}")

    if k == 0:
        # Empty frame or all labels null: KMeans cannot run with zero clusters.
        print(f"Skipping: no non-null '{label_key}' values to derive K from.")
        return None, None

    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    df['cluster'] = kmeans.fit_predict(vectors)

    # The plot reuses the 2D coordinates already present in df.
    scatter_fig = px.scatter(
        df, x='x', y='y',
        color=df['cluster'].astype(str),
        hover_data=['x', 'y'] + [col for col in df.columns if col not in ['x', 'y', 'vector']],
        title=f"KMeans Clustering using Label Count ({label_key})"
    )

    scatter_fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color='DarkSlateGrey')))
    scatter_fig.update_layout(width=900, height=700, title_x=0.5, legend_title="Cluster", coloraxis_showscale=False)

    # Pick the output folder, then save CSV and HTML side by side.
    if isSemanticSearch:
        output_dir = "semantic-cluster-graph-analysis"
    else:
        output_dir = "cluster-graph-analysis"
    os.makedirs(output_dir, exist_ok=True)
    html_path = f"{output_dir}/{group_name}_{label_key}_labelcount_kmeans.html"
    csv_path = f"{output_dir}/{group_name}_{label_key}_labelcount_kmeans.csv"

    df.to_csv(csv_path, index=False)

    # Explicit UTF-8: hover text may hold non-ASCII characters that the
    # platform default encoding cannot represent.
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(scatter_fig.to_html(full_html=True, include_plotlyjs='cdn'))

    # The returned fragment carries no plotly.js; the embedding page loads it.
    return scatter_fig.to_html(full_html=False, include_plotlyjs=False), csv_path

In [None]:
import os
import pandas as pd

def append_visualization_section_to_global_html(
    title: str,
    metadata_html: str,
    cluster_html: str,
    lable_count_cluster_html: str,
    summary_df: pd.DataFrame,
    label_key: str,
    output_path: str = "reports/combined_analysis_report.html"
):
    """Append one group's plots and summary table to the combined HTML report.

    The report file is created (with page skeleton and table CSS) on first
    use; later calls insert the new section just before the closing
    ``</body>`` tag.

    Args:
        title: section heading.
        metadata_html: HTML fragment of the metadata scatter plot.
        cluster_html: HTML fragment of the elbow-method clustering plots.
        lable_count_cluster_html: HTML fragment of the fixed-K clustering plot.
        summary_df: table rendered with ``DataFrame.to_html`` (cells are not
            escaped, so they may contain markup).
        label_key: label name used in the summary heading and description.
        output_path: report file to create or extend.
    """
    # Generate styled HTML from DataFrame
    summary_df_html = summary_df.to_html(
        index=False,
        border=0,
        classes="styled-table",
        justify="center",
        escape=False
    )

    # NOTE(review): the "Accuracy Summary" heading is numbered 3 but is
    # rendered between 2.1 and 2.2 - confirm this order is intended.
    section_html = f"""
    <hr>
    <h2 style="text-align:center; color:#2c3e50;">{title}</h2>

    <h3>1. Metadata Visualization</h3>
    <p>
        This plot is generated by retrieving vector embeddings from the Qdrant database 
        and filtering the data purely based on metadata fields. 
        The exact filters used are reflected in the section title above.
        The high-dimensional vectors were reduced to 2D using UMAP to visualize how items cluster 
        based on their metadata.
    </p>
    {metadata_html}

    <h3>2. KMeans Clustering</h3>
    <p>
        We applied KMeans clustering to the same metadata-filtered data to find natural groupings.
    </p>
    <h4>2.1 Clustering with Chosen K</h4>
    <p>
        In this plot, we used a fixed number of clusters (K) based on what we expected. 
        This helps us understand how items group when we already have an idea of how many clusters we want.
    </p>
    {lable_count_cluster_html}

    <h3>3. {label_key.capitalize()} Accuracy Summary</h3>
    <p>This table summarizes {label_key}-wise clustering accuracy and distribution based on metadata and clustering results.</p>
    {summary_df_html}

    <h4>2.2 Clustering with Elbow Method</h4>
    <p>
        Using the same metadata-filtered dataset as above, KMeans clustering was applied 
        to identify natural groupings within the data. 
        The Elbow method is used to estimate the optimal number of clusters. 
        This helps in understanding how well-defined the group structures are within the filtered metadata.
    </p>
    {cluster_html}
    """

    styled_css = """
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        h2 { margin-top: 40px; }
        hr { margin: 40px 0; }

        .styled-table {
            border-collapse: collapse;
            margin: 20px 0;
            font-size: 14px;
            width: 50%;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
        }
        .styled-table thead tr {
            background-color: #2c3e50;
            color: #ffffff;
            text-align: center;
        }
        .styled-table th,
        .styled-table td {
            padding: 10px 12px;
            border: 1px solid #dddddd;
            text-align: center;
        }
        .styled-table tbody tr:nth-child(even) {
            background-color: #f3f3f3;
        }
    </style>
    """

    # Make sure the target folder exists so the function also works when the
    # folder-preparation cell has not been run.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not os.path.exists(output_path):
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"""
            <html>
            <head>
                <title>Metadata Visualization Report</title>
                <meta charset="utf-8">
                {styled_css}
            </head>
            <body>
                <h1 style="text-align:center;">Metadata Visualization Report</h1>
                {section_html}
            </body>
            </html>
            """)
    else:
        with open(output_path, "r", encoding="utf-8") as f:
            existing_html = f.read()

        # Insert before the LAST closing body tag only. str.replace would
        # duplicate the section at every "</body>" occurrence and silently
        # drop it when the tag is absent.
        head, closing_tag, tail = existing_html.rpartition("</body>")
        if closing_tag:
            updated_html = f"{head}{section_html}{closing_tag}{tail}"
        else:
            updated_html = existing_html + section_html

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(updated_html)

    print(f"Section appended to {output_path}")


In [None]:
import os

def append_semantic_search_section_to_html(
    title: str,
    semantic_html: str,
    output_path: str = "reports/semantic_search_report.html"
):
    """Append one semantic-search plot section to the semantic report.

    The report file is created (with page skeleton) on first use; later calls
    insert the new section just before the closing ``</body>`` tag.

    Args:
        title: section heading.
        semantic_html: HTML fragment of the plot. ``None`` is accepted and
            rendered as a short "not available" note instead of the literal
            text "None".
        output_path: report file to create or extend.
    """
    if semantic_html is None:
        semantic_html = "<p><em>No semantic search visualization available for this group.</em></p>"

    section_html = f"""
    <hr>
    <h2 style="text-align:center; color:#34495e;">{title}</h2>

    <h3>Semantic Search Visualization</h3>
    <p>
        This section shows the results of a semantic search where the query is vectorized and compared 
        against high-dimensional document embeddings stored in the Qdrant vector database.
        The closest matches are projected into 2D space using dimensionality reduction (e.g., t-SNE or PCA or UMAP) 
        to help visualize how the semantically similar results are grouped.
    </p>
    {semantic_html}
    """

    # Make sure the target folder exists so the function also works when the
    # folder-preparation cell has not been run.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not os.path.exists(output_path):
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"""
            <html>
            <head>
                <title>Semantic Search Visualization Report</title>
                <meta charset="utf-8">
                <style>
                    body {{ font-family: Arial, sans-serif; margin: 20px; }}
                    h2 {{ margin-top: 40px; }}
                    hr {{ margin: 40px 0; }}
                </style>
            </head>
            <body>
                <h1 style="text-align:center;">Semantic Search Visualization Report</h1>
                {section_html}
            </body>
            </html>
            """)
    else:
        with open(output_path, "r", encoding="utf-8") as f:
            existing_html = f.read()

        # Insert before the LAST closing body tag only. str.replace would
        # duplicate the section at every "</body>" occurrence and silently
        # drop it when the tag is absent.
        head, closing_tag, tail = existing_html.rpartition("</body>")
        if closing_tag:
            updated_html = f"{head}{section_html}{closing_tag}{tail}"
        else:
            updated_html = existing_html + section_html

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(updated_html)

    print(f"Semantic search section appended to {output_path}")


In [None]:
from spotcheckValidation_test import *

In [None]:
import os

def append_styled_df_to_html(styled_df, label_key: str, filter_query: str, save_path: str = "reports/spot_check_validation_report.html"):
    """Append a validation table, with a heading, to the spot-check report.

    On first use the report is created as a complete HTML document that
    declares UTF-8 (the file is written as UTF-8) and has closing
    ``</body></html>`` tags. Each call inserts the new table before the
    closing ``</body>``; a report that has no closing tag simply gets the
    content appended at the end.

    Args:
        styled_df: object exposing ``to_html(escape=False)``.
        label_key: label name shown in the heading.
        filter_query: group description shown in the heading.
        save_path: report file to create or extend.
    """
    title_html = f"<h2>Cluster Validation for {label_key}: {filter_query}</h2>\n"

    table_html = styled_df.to_html(escape=False)

    content_to_append = title_html + table_html + "<br><hr><br>\n"

    # Make sure the target folder exists so the function also works when the
    # folder-preparation cell has not been run.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if os.path.exists(save_path):
        with open(save_path, "r", encoding="utf-8") as f:
            existing_html = f.read()
    else:
        existing_html = (
            "<html><head><meta charset=\"utf-8\"><title>Cluster Validation Report</title></head><body>\n"
            "</body></html>\n"
        )

    # Keep the document well-formed: new content goes before the last </body>.
    head, closing_tag, tail = existing_html.rpartition("</body>")
    if closing_tag:
        updated_html = f"{head}{content_to_append}{closing_tag}{tail}"
    else:
        updated_html = existing_html + content_to_append

    with open(save_path, "w", encoding="utf-8") as f:
        f.write(updated_html)

    print(f"Appended new section to {save_path}")



In [None]:
import pandas as pd
import os
import plotly.express as px
from sklearn.manifold import TSNE
import numpy as np

def metadata_grouping_and_plot_auto_hierarchy(label_key: str):
    """Build the per-group plots, clusterings and report sections for a label.

    Points are grouped by the hierarchy levels above ``label_key``. For each
    group the function:

    1. fetches the group's vectors and payloads from Qdrant,
    2. builds the semantic-search plot,
    3. projects the vectors to 2D with UMAP and plots them coloured by
       ``label_key`` (CSV + HTML in ``metadata-graph-analysis/``),
    4. runs both KMeans variants on the projected frame,
    5. runs the spot-check validation and appends the sections to the three
       HTML reports.

    Groups with fewer than 3 vectors are skipped because UMAP cannot be
    fitted on them.

    Args:
        label_key: hierarchy level to colour and cluster by; must be an entry
            of ``HIERARCHY_ORDER``.
    """
    parent_keys = get_parent_keys(label_key)
    print(f"Clustering on `{label_key}`, grouped by: {parent_keys}")

    metadata_list = get_all_metadata()
    groups = get_unique_metadata_groups(metadata_list, parent_keys)

    # Same folder handling as the clustering helpers, so this function does
    # not depend on the folder-preparation cell having been run.
    os.makedirs("metadata-graph-analysis", exist_ok=True)

    for idx, group in enumerate(groups, start=1):
        filter_query = " AND ".join(f"{k}:{v}" for k, v in zip(parent_keys, group))

        group_metadata = filter_metadata_by_group(metadata_list, parent_keys, group)
        label_values = sorted(set(m[label_key] for m in group_metadata if label_key in m))

        print(f"Label values for group: {label_values}")
        # The separator was a mis-encoded em dash ("â€”") in the source.
        print(f"{idx}. Group: {filter_query} — Labels: {label_values}")

        df = get_vectors_and_metadata_as_dataframe(parent_keys, group)
        print(f"Fetched {len(df)} vectors and metadata for group: {filter_query}")

        # UMAP rejects n_neighbors < 2 and n_neighbors is capped at
        # len(df) - 1 below, so a group needs at least 3 vectors.
        if len(df) < 3:
            print(f"Skipping group {filter_query}: only {len(df)} vectors, need at least 3 for UMAP.")
            continue

        # The semantic plot gets its own copy; df is modified further down.
        semantic_html = semantic_cluster_and_plot_by_label(df.copy(deep=True), label_key, group, parent_keys)

        if 'vector' not in df.columns:
            print("Skipping: 'vector' column missing.")
            continue

        vector_array = np.array(df['vector'].tolist())

        # NOTE(review): UMAP is not seeded here, so the 2D layout differs
        # between runs - pass random_state if reproducible plots are required.
        n_neighbors = min(15, len(vector_array) - 1)
        reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=0.1, spread=1.0)
        reduced = reducer.fit_transform(vector_array)

        df['x'] = reduced[:, 0]
        df['y'] = reduced[:, 1]

        fig = px.scatter(
            df, x='x', y='y',
            color=label_key,
            hover_data=['x', 'y'] + [col for col in df.columns if col not in ['x', 'y', 'vector']],
            title=f"{label_key} distribution for group: {filter_query}"
        )

        fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color='DarkSlateGrey')))
        fig.update_layout(
            width=900, height=700,
            title_x=0.5,
            legend_title_text=label_key,
        )

        # Added after the figure is built, so the serial number appears in the
        # CSV files but not in this plot's hover data.
        df.insert(0, 'S.No', range(1, len(df) + 1))

        group_name = "__".join([f"{k}-{v}" for k, v in zip(parent_keys, group)])
        csv_path = f"metadata-graph-analysis/{group_name}.csv"
        html_path = f"metadata-graph-analysis/{group_name}.html"

        df.to_csv(csv_path, index=False)
        fig.write_html(html_path)
        metadata_html = fig.to_html(full_html=False, include_plotlyjs='cdn')

        # Each clustering helper adds a 'cluster' column, so each gets a copy.
        cluster_html, optimal_cluster_csv_path = kmeans_clustering_with_elbow_and_plot(df.copy(deep=True), group_name, label_key)
        lable_count_cluster_html,lable_count_cluster_csv_path = kmeans_clustering_using_label_count(df.copy(deep=True), group_name, label_key)

        # NOTE(review): the notebook only does
        # `from spotcheckValidation_test import *`, which does not bind the
        # module name itself; this call works only if that module exports an
        # object named `spotcheckValidation_test` - confirm.
        spotcheckValidationDF, summary_df = spotcheckValidation_test.validateAndReturnDF(metadata_path = csv_path, labelled_KMeans_path= lable_count_cluster_csv_path, label_key=label_key, parent_key =parent_keys, parent_values = group)
        append_styled_df_to_html(spotcheckValidationDF, label_key, filter_query)

        title = f"Visualization for group: {filter_query}"
        append_visualization_section_to_global_html(
            title=title,
            metadata_html=metadata_html,
            cluster_html=cluster_html,
            summary_df=summary_df,
            label_key=label_key,
            lable_count_cluster_html=lable_count_cluster_html,
        )
        append_semantic_search_section_to_html(
            title=title,
            semantic_html=semantic_html,
        )





In [None]:
import os
import shutil

def prepare_graph_analysis_folders():
    """Reset the output folders: delete each one if present, then recreate it.

    A failure while handling one folder is printed and does not stop the
    remaining folders from being processed.
    """
    for folder in (
        "metadata-graph-analysis",
        "semantic-graph-analysis",
        "cluster-graph-analysis",
        "reports",
        "semantic-cluster-graph-analysis",
    ):
        try:
            already_there = os.path.exists(folder)
            if already_there:
                shutil.rmtree(folder)
                print(f"Deleted existing folder: {folder}")

            os.makedirs(folder)
            print(f"Created new folder: {folder}")

        except Exception as e:
            print(f"Error handling folder '{folder}': {e}")


# Start the run from empty output folders (this deletes earlier results).
prepare_graph_analysis_folders()


In [None]:
# Build the reports for each level in turn. "subject" is left out here, as in
# the earlier version of this cell where its call was commented out.
for level in ("domain", "skill", "subskill", "difficulty"):
    metadata_grouping_and_plot_auto_hierarchy(level)