In [8]:
import sqlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import glob
import pandas as pd
pd.set_option('max_colwidth', 400)
import re

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
def calculate_query_similarity_with_tf_idf():

    # migrated_queries = read_in_excel_files("../Query Processing/1_migrated_excel_queries/*.xlsx")
    migrated_queries = read_in_excel_files("../Query Processing/2_new_excel_queries/*.xlsx")
    migrated_queries_preprocessed = preprocess_query_df(migrated_queries, "new")

    new_queries = read_in_excel_files("../Query Processing/2_new_excel_queries/*.xlsx")
    new_queries_preprocessed = preprocess_query_df(new_queries, "new1")

    return calculate_query_similarity(migrated_queries_preprocessed, new_queries_preprocessed)

In [22]:
res = calculate_query_similarity_with_tf_idf()
res.to_excel("./Results/similarity_report_tf_idf.xlsx", index=False)

2025-04-01 14:41:46,829 - INFO - Successfully read 1 excel file(s) from ../Query Processing/2_new_excel_queries/*.xlsx.
2025-04-01 14:41:46,830 - INFO - Started preprocess_query_df for new queries.
2025-04-01 14:42:40,029 - INFO - Preprocessing of new queries successful.
2025-04-01 14:42:40,089 - INFO - Successfully read 1 excel file(s) from ../Query Processing/2_new_excel_queries/*.xlsx.
2025-04-01 14:42:40,089 - INFO - Started preprocess_query_df for new queries.
2025-04-01 14:43:34,692 - INFO - Preprocessing of new queries successful.


##### `read_in_excel_files`

In [3]:
def read_in_excel_files(PATH):
    try:
        excel_data = glob.glob(PATH)

        dataframes = [pd.read_excel(data, engine="openpyxl") for data in excel_data]
        query_df = pd.concat(dataframes, ignore_index=True)

        logging.info(f"Successfully read {len(excel_data)} excel file(s) from {PATH}.")

    except Exception as e:
        logging.error(f"Error in reading excel files: {e}")

    return query_df

##### `preprocess_query_df`

In [4]:
def preprocess_query_df(query_df, QUERY_IDENTIFIER):
    logging.info(f"Started {preprocess_query_df.__name__} for {QUERY_IDENTIFIER} queries.")

    query_df = query_df.dropna(subset=["SQL"]).fillna("")

    query_df["SQL"] = (query_df["SQL"]
                       .str.replace('ê', 'e').str.replace('é', 'e').str.replace('è', 'e').str.replace('à', 'a').str.replace('ç', 'c')
                       .str.replace('ô', 'o').str.replace('û', 'u').str.replace('ù', 'u').str.replace('î', 'i').str.replace('ï', 'i')
                       .str.replace('â', 'a').str.replace('ä', 'a').str.replace('ö', 'o').str.replace('ü', 'u').str.replace('ÿ', 'y')
                       .str.replace('ñ', 'n').str.replace('É', 'E').str.replace('È', 'E').str.replace('À', 'A').str.replace('Ç', 'C')
                       .str.replace('Ô', 'O').str.replace('Û', 'U').str.replace('Ù', 'U').str.replace('Î', 'I').str.replace('Ï', 'I')
                       .str.replace('Â', 'A').str.replace('Ä', 'A').str.replace('Ö', 'O').str.replace('Ü', 'U').str.replace('Ÿ', 'Y')
                       .str.replace('Ñ', 'N')
    )

    with_pattern = r'WITH\s+"\w+"\s+AS\s*\(.*?\)\s*SELECT'
    pattern_2 = r'"[^"]*"\.'
    pattern_3 = r'\w+\.'

    for index, query in enumerate(query_df["SQL"]):
        formatted_query = query.upper()
        formatted_query = sqlparse.format(formatted_query, reindent=True, keyword_case='upper', strip_comments=True).strip()
        formatted_query = formatted_query.replace("'", '"').replace("\n", " ")
        formatted_query = re.sub(with_pattern, 'SELECT', formatted_query, flags=re.DOTALL)
        formatted_query = re.sub(pattern_2, '', formatted_query)
        formatted_query = re.sub(pattern_3, '', formatted_query)
        formatted_query = " ".join(formatted_query.split())
        formatted_query = re.sub(r'"(\w+)" AS "\1"', r'"\1"', formatted_query)
        formatted_query = re.sub(r'("\w+")\s+"\w+"', r'\1', formatted_query)

        query_df.at[index, 'SQL'] = formatted_query

    logging.info(f"Preprocessing of {QUERY_IDENTIFIER} queries successful.")

    return query_df

##### `calculate_query_similarity`

In [None]:
# def calculate_query_similarity(migrated_queries_preprocessed, new_queries_preprocessed):

#     vectorizer = TfidfVectorizer()
#     tfidf_migrated = vectorizer.fit_transform(migrated_queries_preprocessed['SQL'])
#     tfidf_new = vectorizer.transform(new_queries_preprocessed['SQL'])

#     similarity_matrix = cosine_similarity(tfidf_new, tfidf_migrated)

#     similarity = []

#     for i in range(len(new_queries_preprocessed)):
#         for j in range(len(migrated_queries_preprocessed)):
#             similarity.append({
#                 'new_report_name': new_queries_preprocessed['Product Name'][i],
#                 'migrated_report_name': migrated_queries_preprocessed['Report Name'][j],
#                 'similarity': similarity_matrix[i][j],
#                 'new_report_sql': new_queries_preprocessed['SQL'][i],
#                 'migrated_report_sql': migrated_queries_preprocessed['SQL'][j],
#             })

#     similarity_df = pd.DataFrame(similarity).drop_duplicates(subset=["new_report_name", "migrated_report_name"]).sort_values(by=['new_report_name', 'similarity'], ascending=[True, False])
#     similarity_df["row_number"] = similarity_df.groupby("new_report_name").cumcount() + 1

#     return similarity_df[similarity_df["row_number"] < 4]
def calculate_query_similarity(migrated_queries_preprocessed, new_queries_preprocessed):

    vectorizer = TfidfVectorizer()
    tfidf_migrated = vectorizer.fit_transform(migrated_queries_preprocessed['SQL'])
    tfidf_new = vectorizer.transform(new_queries_preprocessed['SQL'])

    similarity_matrix = cosine_similarity(tfidf_new, tfidf_migrated)

    similarity = []

    for i in range(len(new_queries_preprocessed)):
        for j in range(len(migrated_queries_preprocessed)):
            similarity.append({
                'new_report_name': str(new_queries_preprocessed['Product Name'][i]).strip(),
                'new_report_name_1': str(migrated_queries_preprocessed['Product Name'][j]).strip(),
                'similarity': similarity_matrix[i][j],
                'new_report_sql': str(new_queries_preprocessed['SQL'][i]).strip(),
                'new_report_sql_1': str(migrated_queries_preprocessed['SQL'][j]).strip(),
            })

    similarity = [entry for entry in similarity if entry['new_report_name'] != entry['new_report_name_1']]

    similarity_df = pd.DataFrame(similarity).drop_duplicates(subset=["new_report_name", "new_report_name_1"]).sort_values(by=['new_report_name', 'similarity'], ascending=[True, False])
    similarity_df["row_number"] = similarity_df.groupby("new_report_name").cumcount() + 1

    return similarity_df[similarity_df["row_number"] < 4]
