In [1]:
import sqlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import glob
import pandas as pd
pd.set_option('max_colwidth', 400)
import re

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:
def calculate_query_similarity_with_tf_idf(
    migrated_path="../Query Processing/1_migrated_excel_queries/*.xlsx",
    new_path="../Query Processing/2_new_excel_queries/*.xlsx",
):
    """
    Runs the whole pipeline: load both query sets from Excel, preprocess them, and score their similarity.

    Parameters:
    migrated_path (str): Glob pattern of the Excel files holding the migrated queries.
    new_path (str): Glob pattern of the Excel files holding the new queries.

    Returns:
    DataFrame: The result of `calculate_query_similarity` for the two preprocessed query sets.
    """
    # The second argument of preprocess_query_df is only a label used in its log
    # messages, so it has to name the data set that is actually being processed.
    migrated_queries = read_in_excel_files(migrated_path)
    migrated_queries_preprocessed = preprocess_query_df(migrated_queries, "migrated")

    new_queries = read_in_excel_files(new_path)
    new_queries_preprocessed = preprocess_query_df(new_queries, "new")

    return calculate_query_similarity(migrated_queries_preprocessed, new_queries_preprocessed)

In [6]:
# Run the full pipeline (read -> preprocess -> similarity) and keep the result in `res`.
res = calculate_query_similarity_with_tf_idf()
# index=False keeps the DataFrame index out of the exported sheet.
# NOTE: pandas does not create missing folders, so ./Results must already exist.
res.to_excel("./Results/similarity_report_tf_idf.xlsx", index=False)

2025-04-04 09:21:44,343 - INFO - Successfully read 1 excel file(s) from ../Query Processing/2_new_excel_queries/*.xlsx.
2025-04-04 09:21:44,344 - INFO - Started preprocess_query_df for new queries.


KeyboardInterrupt: 

##### `read_in_excel_files`

In [3]:
def read_in_excel_files(PATH):
    """
    Reads every Excel workbook matching a glob pattern into one DataFrame.

    Parameters:
    PATH (str): Glob pattern selecting the workbooks to load, e.g. "folder/*.xlsx".

    Returns:
    DataFrame: The rows of all matched workbooks concatenated, with a fresh 0..n-1 index.

    Raises:
    FileNotFoundError: If PATH matches no file.
    Exception: Any error raised while reading or concatenating is logged and re-raised.
    """
    # Sorted so the row order of the result does not depend on directory listing order.
    excel_files = sorted(glob.glob(PATH))

    # pd.concat([]) would raise an opaque "No objects to concatenate"; fail with a clear message instead.
    if not excel_files:
        message = f"No excel files found for pattern {PATH}."
        logging.error(f"Error in reading excel files: {message}")
        raise FileNotFoundError(message)

    try:
        dataframes = [pd.read_excel(excel_file, engine="openpyxl") for excel_file in excel_files]
        query_df = pd.concat(dataframes, ignore_index=True)
    except Exception as e:
        logging.error(f"Error in reading excel files: {e}")
        # Re-raise: there is no DataFrame to return, and continuing would only
        # hide the real cause behind an UnboundLocalError.
        raise

    logging.info(f"Successfully read {len(excel_files)} excel file(s) from {PATH}.")

    return query_df

##### `preprocess_query_df`

In [None]:
def preprocess_query_df(query_df, QUERY_IDENTIFIER):
    """
    Preprocesses a DataFrame of SQL queries by cleaning and formatting the SQL strings.

    Parameters:
    query_df (DataFrame): A DataFrame containing SQL queries with a column named 'SQL'.
    QUERY_IDENTIFIER (str): A string identifier for the type of queries being processed, used for logging purposes.

    Returns:
    DataFrame: A new DataFrame (the input is not modified) with cleaned 'SQL' values and a
    contiguous 0..n-1 index.

    The method performs the following steps, in this order:
    1. Logs the start of the preprocessing.
    2. Drops rows with missing 'SQL' values, fills other missing values with an empty string
       and renumbers the remaining rows.
    3. Replaces accented characters in the 'SQL' column with their ASCII equivalents.
    4. For every query:
        - Upper-cases the text.
        - Lets sqlparse strip comments, upper-case keywords and reindent.
        - Replaces single quotes with double quotes and newlines with spaces.
        - Removes a leading 'WITH "name" AS (...)' clause, keeping the following SELECT.
        - Strips quoted and unquoted qualifiers (everything of the form `"x".` or `x.`).
        - Collapses runs of whitespace to single spaces.
        - Reduces '"X" AS "X"' to '"X"' and drops a quoted alias following a quoted name.
    5. Logs the end of the preprocessing.
    """
    logging.info(f"Started {preprocess_query_df.__name__} for {QUERY_IDENTIFIER} queries.")

    # One translation table instead of a chain of per-character replacements.
    accent_table = str.maketrans({
        'ê': 'e', 'é': 'e', 'è': 'e', 'à': 'a', 'ç': 'c',
        'ô': 'o', 'û': 'u', 'ù': 'u', 'î': 'i', 'ï': 'i',
        'â': 'a', 'ä': 'a', 'ö': 'o', 'ü': 'u', 'ÿ': 'y',
        'ñ': 'n', 'É': 'E', 'È': 'E', 'À': 'A', 'Ç': 'C',
        'Ô': 'O', 'Û': 'U', 'Ù': 'U', 'Î': 'I', 'Ï': 'I',
        'Â': 'A', 'Ä': 'A', 'Ö': 'O', 'Ü': 'U', 'Ÿ': 'Y',
        'Ñ': 'N',
        'Ê': 'E',  # upper-case counterpart of 'ê', for symmetry with É/È
    })

    # Compiled once; they are applied to every query.
    with_clause = re.compile(r'WITH\s+"\w+"\s+AS\s*\(.*?\)\s*SELECT', flags=re.DOTALL)
    quoted_qualifier = re.compile(r'"[^"]*"\.')
    # NOTE: this matches any word followed by a dot, so it also removes the integer
    # part of decimal literals (1.5 -> 5).
    bare_qualifier = re.compile(r'\w+\.')
    self_alias = re.compile(r'"(\w+)" AS "\1"')
    quoted_alias = re.compile(r'("\w+")\s+"\w+"')

    def _normalize_query(query):
        # Cleans one SQL string; the order of the steps matters (quotes are unified
        # before the quote-based patterns run, whitespace is collapsed before aliases).
        formatted_query = query.translate(accent_table).upper()
        formatted_query = sqlparse.format(formatted_query, reindent=True, keyword_case='upper', strip_comments=True).strip()
        formatted_query = formatted_query.replace("'", '"').replace("\n", " ")
        formatted_query = with_clause.sub('SELECT', formatted_query)
        formatted_query = quoted_qualifier.sub('', formatted_query)
        formatted_query = bare_qualifier.sub('', formatted_query)
        formatted_query = " ".join(formatted_query.split())
        formatted_query = self_alias.sub(r'"\1"', formatted_query)
        formatted_query = quoted_alias.sub(r'\1', formatted_query)
        return formatted_query

    # reset_index is essential: dropna leaves gaps in the index, and rows must be
    # addressable by their position 0..n-1 afterwards.
    query_df = query_df.dropna(subset=["SQL"]).fillna("").reset_index(drop=True)

    # .map writes each result back to the row it came from, independent of index labels.
    query_df["SQL"] = query_df["SQL"].astype(str).map(_normalize_query)

    logging.info(f"Preprocessing of {QUERY_IDENTIFIER} queries successful.")

    return query_df

##### `calculate_query_similarity`

In [None]:
def calculate_query_similarity(migrated_queries_preprocessed, new_queries_preprocessed, top_n=3):
    """
    Calculates the similarity between new and migrated SQL queries using TF-IDF vectorization and cosine similarity.

    Parameters:
    migrated_queries_preprocessed (DataFrame): A DataFrame containing preprocessed migrated SQL queries with columns 'SQL' and 'Product Name'.
    new_queries_preprocessed (DataFrame): A DataFrame containing preprocessed new SQL queries with columns 'SQL' and 'Product Name'.
    top_n (int): How many of the best-scoring migrated queries to keep per new query. Defaults to 3.

    Returns:
    DataFrame: A DataFrame containing the top `top_n` most similar migrated queries for each new query, with columns:
        - 'new_report_name': Name of the new report.
        - 'migrated_report_name': Name of the migrated report.
        - 'similarity': Similarity score between the new and migrated report.
        - 'new_report_sql': SQL of the new report.
        - 'migrated_report_sql': SQL of the migrated report.
        - 'row_number': Rank of the similarity score for each new report (1 = most similar).
    The frame is empty (same columns) when either input is empty or no pair is left after filtering.

    The method performs the following steps:
    1. Vectorizes the SQL queries using TF-IDF.
    2. Computes the cosine similarity between new and migrated queries.
    3. Builds one record per (new, migrated) pair and filters out pairs with identical report names.
    4. Drops duplicate name pairs, sorts by score and keeps the `top_n` best migrated queries per new query.
    """
    report_columns = ['new_report_name', 'migrated_report_name', 'similarity',
                      'new_report_sql', 'migrated_report_sql', 'row_number']

    # Iterate the columns instead of looking rows up by label, so the result does
    # not depend on the inputs carrying a 0..n-1 index.
    new_names = [str(name).strip() for name in new_queries_preprocessed['Product Name']]
    new_sql = [str(sql).strip() for sql in new_queries_preprocessed['SQL']]
    migrated_names = [str(name).strip() for name in migrated_queries_preprocessed['Product Name']]
    migrated_sql = [str(sql).strip() for sql in migrated_queries_preprocessed['SQL']]

    if not new_sql or not migrated_sql:
        return pd.DataFrame(columns=report_columns)

    # One vocabulary fitted on both query sets so the two matrices share a feature space.
    # NOTE(review): scikit-learn defaults are used for the vectorizer - confirm that the
    # default tokenisation suits these queries.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(new_sql + migrated_sql)
    tfidf_new = vectorizer.transform(new_sql)
    tfidf_migrated = vectorizer.transform(migrated_sql)

    # Rows follow the new queries, columns the migrated queries.
    similarity_matrix = cosine_similarity(tfidf_new, tfidf_migrated)

    similarity = [
        {
            'new_report_name': new_name,
            'migrated_report_name': migrated_name,
            'similarity': similarity_matrix[i][j],
            'new_report_sql': new_sql[i],
            'migrated_report_sql': migrated_sql[j],
        }
        for i, new_name in enumerate(new_names)
        for j, migrated_name in enumerate(migrated_names)
        # A report is never matched against a report with the same name.
        if new_name != migrated_name
    ]

    if not similarity:
        return pd.DataFrame(columns=report_columns)

    similarity_df = (
        pd.DataFrame(similarity)
        .drop_duplicates(subset=["new_report_name", "migrated_report_name"])
        .sort_values(by=['new_report_name', 'similarity'], ascending=[True, False])
    )
    # After the sort, the running count within each new report is its rank.
    similarity_df["row_number"] = similarity_df.groupby("new_report_name").cumcount() + 1

    return similarity_df[similarity_df["row_number"] <= top_n]