In [None]:
import sys
import pandas as pd
import numpy as np
from pymilvus import MilvusClient
from dotenv import load_dotenv
from scipy.stats import gaussian_kde
from sklearn.decomposition import PCA
import sys
from scipy import stats

# Load variables from a .env file into the process environment.
load_dotenv()

import os

# Make the directory two levels above the current working directory importable.
# os.path.dirname is used instead of splitting the path on "/" so the result
# is also correct on platforms whose path separator is not "/".
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

In [None]:
# Read the "dropped" labelled dataset and display per-column summary statistics.
data = pd.read_csv(filepath_or_buffer="../../data/labelled_data_dropped.csv")

data.describe()

In [None]:
# Non-null count per column, followed by the frequency of each label value.
print(data.count(), data["label"].value_counts(), sep="\n")

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_rows = data.loc[data.duplicated(keep="first")]
print(f"Number of duplicate rows: {duplicate_rows.shape[0]}")
duplicate_rows

In [None]:
# Share of non-null labels that are equal to 1.
data.loc[data["label"].eq(1), ["label"]].sum().div(data["label"].count())

# Checking for Representativity


In [None]:
# Open the embeddings database, then load the labelled set and the full
# population table. NOTE: `data` is rebound here to the population table and no
# longer refers to the frame loaded earlier in the notebook.
client = MilvusClient("../../data/embeddings.db")
labelled_data, data = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("../../data/labelled_data.csv", "../../data/db.csv")
)

In [None]:
# URIs of every row in the population table.
population_uris = data["uri"].values

# Unique URIs appearing in either of the two URI columns of the labelled data.
# The columns are flattened row by row and de-duplicated with pd.unique, which
# keeps first-seen order. This avoids a slow iterrows() loop and, unlike
# list(set(...)), gives the same ordering on every kernel restart.
labelled_uris = pd.unique(
    labelled_data[["1_uri", "2_uri"]].to_numpy().ravel()
).tolist()

In [None]:
def _fetch_vectors(milvus_client, uris, collection_name="openai_small"):
    """Fetch the stored vector for each URI, one lookup per URI.

    Args:
        milvus_client: client exposing ``get(collection_name=..., ids=...)``.
        uris: iterable of entity ids to look up.
        collection_name: collection to query.

    Returns:
        A list with the ``"vector"`` field of the first entity returned for
        each URI. URIs whose lookup fails or returns nothing are reported on
        stdout and skipped, so the result may be shorter than ``uris`` and is
        not index-aligned with it.
    """
    vectors = []
    for uri in uris:
        try:
            hits = milvus_client.get(collection_name=collection_name, ids=uri)
            if not hits:
                # Explicit message instead of an opaque "list index out of range".
                print(
                    f"Error for uri {uri}: no entity returned from "
                    f"collection {collection_name!r}"
                )
                continue
            vectors.append(hits[0]["vector"])
        except Exception as e:
            # Best effort: report the failure and keep going with the next URI.
            print(f"Error for uri {uri}: {e}")
    return vectors


extracted_labelled_samples_vectors = _fetch_vectors(client, labelled_uris)

extracted_population_data = _fetch_vectors(client, population_uris)

In [None]:
# Fit PCA on the population vectors only, then project both the population and
# the labelled vectors into the same 16-component space.
# random_state is pinned because scikit-learn's default "auto" solver can pick
# the randomized SVD for large inputs; without a seed the projection (and every
# result derived from it) could change between runs.
pca = PCA(n_components=16, random_state=0)
pca.fit(extracted_population_data)
population_data_pca = pca.transform(extracted_population_data)
labelled_samples_vectors_pca = pca.transform(extracted_labelled_samples_vectors)

# Fraction of variance captured by each component, and the running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

print("Explained Variance Ratio per component:", explained_variance_ratio)
print("Cumulative Explained Variance:", cumulative_explained_variance)

In [None]:
# Materialise both PCA projections as arrays, then fit one Gaussian KDE per set.
# gaussian_kde expects one column per observation, hence the transpose.
population_data, labelled_samples = (
    np.array(projected)
    for projected in (population_data_pca, labelled_samples_vectors_pca)
)

kde_population, kde_labelled = (
    gaussian_kde(points.T, bw_method="silverman")
    for points in (population_data, labelled_samples)
)

In [None]:
# Monte Carlo estimate of the KL divergence between the two fitted KDEs:
#     KL(P || Q) = E_{x ~ P}[log p(x) - log q(x)]
# Points are drawn from P itself rather than uniformly from the bounding box of
# the data. In a 16-dimensional box almost every uniform point lies where both
# densities are negligible, so a handful of points would dominate the estimate.
# Working with log-densities also removes the need to clip values away from 0.
rng = np.random.default_rng(0)

num_samples = 100000


def _monte_carlo_kl(kde_p, kde_q, size, seed):
    """Estimate KL(P || Q) from `size` points resampled from `kde_p`.

    Args:
        kde_p: fitted gaussian_kde for the reference density P.
        kde_q: fitted gaussian_kde for the comparison density Q.
        size: number of points to draw from `kde_p`.
        seed: seed or random generator forwarded to `kde_p.resample`.

    Returns:
        The mean of log p(x) - log q(x) over the drawn points, as a float.
        Being a sampled estimate it can be slightly negative when P and Q
        are very close.
    """
    draws = kde_p.resample(size, seed=seed)  # shape: (n_dims, size)
    return float(np.mean(kde_p.logpdf(draws) - kde_q.logpdf(draws)))


kl = _monte_carlo_kl(kde_population, kde_labelled, num_samples, rng)

print("Estimated KL Divergence:", kl)

# Optionally, calculate reverse KL divergence
kl_reverse = _monte_carlo_kl(kde_labelled, kde_population, num_samples, rng)
print("Estimated Reverse KL Divergence:", kl_reverse)

# Duplicates in labelled data


In [None]:
# Reload the full labelled dataset and report (total rows, fully duplicated rows).
data = pd.read_csv(filepath_or_buffer="../../data/labelled_data.csv", low_memory=False)

duplicates_count = data.duplicated().sum()
data_count = len(data)

data_count, duplicates_count