In [2]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
# Let pandas render up to 300 rows before truncating a displayed frame.
pd.options.display.max_rows = 300

##### `calculate_query_similarity`

In [3]:
def calculate_query_similarity(
    migrated_queries_path="../Vector Embedding/json_results/migrated_query_data_with_embedding.json",
    new_queries_path="../Vector Embedding/json_results/new_query_data_with_embedding.json",
):
    """Load both embedded query sets and build their pairwise similarity report.

    Parameters
    ----------
    migrated_queries_path : str or path-like, optional
        JSON file handed to ``load_query_data`` for the migrated queries.
        Defaults to the location this notebook has always read from.
    new_queries_path : str or path-like, optional
        JSON file handed to ``load_query_data`` for the new queries.
        Defaults to the location this notebook has always read from.

    Returns
    -------
    The result of ``calculate_similarity(migrated_queries, new_queries)``.
    """
    # Migrated queries are loaded first, then the new ones, as before.
    migrated_queries = load_query_data(migrated_queries_path)
    new_queries = load_query_data(new_queries_path)

    return calculate_similarity(migrated_queries, new_queries)

In [9]:
# Build the similarity report for the default input files.
# NOTE: load_query_data and calculate_similarity are defined in cells further
# down this notebook, so those cells must have been run before this one.
res = calculate_query_similarity()

TypeError: string indices must be integers, not 'str'

In [7]:
# Write the report to an Excel workbook, leaving out the DataFrame index.
# NOTE(review): assumes the ./Results directory already exists — confirm.
res.to_excel('./Results/similarity_report_embedding.xlsx', index=False)

##### `load_query_data`

In [4]:
def load_query_data(PATH):
    """Read the JSON file at ``PATH`` and return the decoded object.

    Parameters
    ----------
    PATH : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    The Python object produced by ``json.load`` for the file's contents.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (which is what a bare open() would use).
    with open(PATH, encoding="utf-8") as f:
        query_meta_data = json.load(f)

    return query_meta_data

##### `calculate_similarity`

In [None]:
def _embedding_row(query_value):
    """Return the query's embedding as a ``(1, n)`` array, or ``None`` if it has none."""
    embedding = query_value.get("embedding")
    if embedding is None:
        return None

    row = np.array(embedding).reshape(1, -1)
    # A zero-width row cannot be scored, so treat it like a missing embedding.
    return row if row.size else None


def calculate_similarity(migrated_queries, new_queries):
    """Compare every new query with every migrated query.

    Parameters
    ----------
    migrated_queries : dict
        Maps a migrated query name to a dict that is read through the keys
        ``"embedding"``, ``"columns_cleansed"`` and ``"tables_cleansed"``.
    new_queries : dict
        Same layout, for the new queries.

    Returns
    -------
    pandas.DataFrame
        One row per (new, migrated) pair with the columns
        ``new_report_name``, ``migrated_report_name``, ``similarity``,
        ``matching_columns`` and ``matching_tables``. Rows are sorted by new
        report name ascending, then by similarity descending. ``similarity``
        is NaN when either query has no usable embedding; the column/table
        overlap is still reported for such pairs.
    """
    # Every migrated embedding is compared against every new query, so build
    # each row vector once up front instead of once per pair.
    migrated_rows = {
        name: _embedding_row(value) for name, value in migrated_queries.items()
    }

    similarities = []

    for new_query_name, new_query_value in new_queries.items():
        new_row = _embedding_row(new_query_value)

        for migrated_query_name, migrated_query_value in migrated_queries.items():
            migrated_row = migrated_rows[migrated_query_name]

            if new_row is None or migrated_row is None:
                # No embedding on one side: the similarity is undefined, so
                # record NaN rather than passing an empty vector downstream.
                similarity = np.nan
            else:
                similarity = cosine_similarity(new_row, migrated_row).flatten()[0]

            similarities.append(
                [
                    new_query_name,
                    migrated_query_name,
                    similarity,
                    # intersect1d returns the sorted, de-duplicated common values.
                    np.intersect1d(
                        new_query_value.get("columns_cleansed", []),
                        migrated_query_value.get("columns_cleansed", []),
                    ),
                    np.intersect1d(
                        new_query_value.get("tables_cleansed", []),
                        migrated_query_value.get("tables_cleansed", []),
                    ),
                ]
            )

    return pd.DataFrame(
        data=similarities,
        columns=[
            "new_report_name",
            "migrated_report_name",
            "similarity",
            "matching_columns",
            "matching_tables"
        ]
    ).sort_values(by=["new_report_name", "similarity"], ascending=[True, False])