In [23]:
import sqlite3
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
import spacy
import pandas as pd

# Keyword Extractor
class KeywordExtractor:
    """Pull candidate keyword phrases out of a natural-language query.

    Keywords are the noun chunks that the spaCy pipeline finds in the query.
    """

    def __init__(self, method="sentence_transformer"):
        """Load the spaCy pipeline and the sentence-transformer model.

        Args:
            method: Backend name; only ``"sentence_transformer"`` is accepted.

        Raises:
            ValueError: If ``method`` is any other value.
        """
        self.method = method
        # spaCy supplies the noun-chunk parser used by extract_keywords().
        self.nlp = spacy.load("en_core_web_sm")
        if method != "sentence_transformer":
            raise ValueError("Supported method: 'sentence_transformer'.")
        # NOTE(review): no method of this class reads self.tokenizer after
        # it is assigned here -- confirm whether external callers need it.
        self.tokenizer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def extract_keywords(self, query):
        """Return the text of each noun chunk in ``query``.

        Args:
            query: The text to parse.

        Returns:
            A list of strings, in the order the parser yields the chunks.
        """
        return [chunk_span.text for chunk_span in self.nlp(query).noun_chunks]

# Embedding Handler
class EmbeddingHandler:
    """Encode text with a sentence-transformer model and score similarity."""

    def __init__(self, method="sentence_transformer"):
        """Load the embedding model.

        Args:
            method: Backend name; only ``"sentence_transformer"`` is accepted.

        Raises:
            ValueError: If ``method`` is any other value.
        """
        self.method = method
        if method != "sentence_transformer":
            raise ValueError("Supported method: 'sentence_transformer'.")
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def get_embedding(self, text):
        """Return whatever the model's ``encode`` produces for ``text``."""
        encoded = self.model.encode(text)
        return encoded

    def calculate_similarity(self, emb1, emb2, metric="cosine", **kwargs):
        """Score two items so that a larger value means "more similar".

        Args:
            emb1: First embedding vector (ignored by ``"levenshtein"``).
            emb2: Second embedding vector (ignored by ``"levenshtein"``).
            metric: ``"cosine"``, ``"euclidean"``, ``"manhattan"`` or
                ``"levenshtein"``.
            **kwargs: ``str1`` and ``str2`` -- the raw strings compared by the
                ``"levenshtein"`` metric; both default to ``""``.

        Returns:
            Cosine similarity; the negated L2 or L1 distance (so never
            positive); or ``1 - edit_distance / max_len`` for
            ``"levenshtein"``, which is ``0`` when either string is empty.

        Raises:
            ValueError: If ``metric`` is not one of the four names above.
        """
        if metric == "cosine":
            return cosine_similarity([emb1], [emb2])[0][0]

        if metric in ("euclidean", "manhattan"):
            delta = np.array(emb1) - np.array(emb2)
            if metric == "euclidean":
                return -np.linalg.norm(delta)
            return -np.sum(np.abs(delta))

        if metric == "levenshtein":
            first = kwargs.get("str1", "")
            second = kwargs.get("str2", "")
            if first and second:
                longest = max(len(first), len(second))
                return 1 - levenshtein_distance(first, second) / longest
            # An empty side is treated as "no similarity" rather than an error.
            return 0

        raise ValueError("Unsupported similarity metric.")

# Schema Extractor
class SchemaExtractor:
    """Read table and column names from a SQLite database file."""

    def __init__(self, db_file):
        """Remember the path of the database to inspect.

        Args:
            db_file: Path handed to ``sqlite3.connect``.
        """
        self.db_file = db_file

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier."""
        # Embedded double quotes are escaped by doubling them.
        return '"' + name.replace('"', '""') + '"'

    def extract_schema(self):
        """Return a mapping of table name to its list of column names.

        Returns:
            ``{table_name: [column_name, ...]}`` for every row of
            ``sqlite_master`` whose type is ``'table'``; columns are in the
            order ``PRAGMA table_info`` reports them.
        """
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            table_names = [row[0] for row in cursor.fetchall()]

            schema = {}
            for table_name in table_names:
                # PRAGMA arguments cannot be bound as parameters, so the name
                # is quoted instead; this keeps reserved words and names with
                # spaces from producing a syntax error.
                quoted = self._quote_identifier(table_name)
                cursor.execute(f"PRAGMA table_info({quoted});")
                # Index 1 of each table_info row is the column name.
                schema[table_name] = [col[1] for col in cursor.fetchall()]
            return schema
        finally:
            # Close the connection even when one of the queries raises.
            conn.close()

# RAG Pipeline
class RAGPipeline:
    """Match keywords from a user question to columns of a SQLite database.

    The pipeline extracts noun-phrase keywords from the question, scores every
    ``table.column`` name against each keyword, keeps the columns whose score
    reaches the threshold, and adds the primary keys of the tables involved.
    """

    def __init__(self, db_file, user_query, method="sentence_transformer", similarity_metric="cosine", similarity_threshold=0.45):
        """Build the helper objects and read the database schema.

        Args:
            db_file: Path of the SQLite database.
            user_query: The question to analyse; may be reassigned before
                each call to :meth:`run`.
            method: Backend name forwarded to ``EmbeddingHandler`` and
                ``KeywordExtractor``.
            similarity_metric: Metric name forwarded to
                ``EmbeddingHandler.calculate_similarity``.
            similarity_threshold: Minimum score for a column to be kept.
                NOTE: the handler's ``"euclidean"`` and ``"manhattan"``
                metrics return negated distances, so a positive threshold
                never matches them.
        """
        self.db_file = db_file
        self.user_query = user_query
        self.similarity_metric = similarity_metric
        self.similarity_threshold = similarity_threshold
        self.embedding_handler = EmbeddingHandler(method=method)
        self.keyword_extractor = KeywordExtractor(method=method)
        self.schema_extractor = SchemaExtractor(db_file)
        self.schema = self.schema_extractor.extract_schema()

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier."""
        # Embedded double quotes are escaped by doubling them.
        return '"' + name.replace('"', '""') + '"'

    def identify_relevant_columns(self, keywords):
        """Score every schema column against every keyword.

        Args:
            keywords: Iterable of keyword strings.

        Returns:
            A ``(DataFrame, set)`` pair. The frame has one row per keyword
            with the columns ``"Keyword"``, ``"Identified Columns"`` and
            ``"Similarity Scores"``; the set holds every ``table.column``
            name that reached the threshold for at least one keyword.
        """
        results = []
        identified_columns = set()
        # Column embeddings do not depend on the keyword, so they are computed
        # once (on first use) instead of once per keyword.
        column_embeddings = None

        for keyword in keywords:
            if column_embeddings is None:
                column_embeddings = [
                    (f"{table}.{column}", self.embedding_handler.get_embedding(f"{table}.{column}"))
                    for table, cols in self.schema.items()
                    for column in cols
                ]

            keyword_embedding = self.embedding_handler.get_embedding(keyword)
            keyword_results = {"Keyword": keyword, "Identified Columns": [], "Similarity Scores": []}

            for column_name, column_embedding in column_embeddings:
                # str1/str2 feed the "levenshtein" metric, which compares raw
                # strings and would otherwise always score 0; the
                # embedding-based metrics ignore these keyword arguments.
                similarity = self.embedding_handler.calculate_similarity(
                    column_embedding,
                    keyword_embedding,
                    metric=self.similarity_metric,
                    str1=column_name,
                    str2=keyword,
                )
                if similarity >= self.similarity_threshold:
                    identified_columns.add(column_name)
                    keyword_results["Identified Columns"].append(column_name)
                    keyword_results["Similarity Scores"].append(similarity)

            results.append(keyword_results)

        return pd.DataFrame(results), identified_columns

    def extract_tables_from_columns(self, identified_columns):
        """Return the set of table names in front of the first ``.`` of each column."""
        return {column.split('.')[0] for column in identified_columns}

    def extract_primary_keys(self, table_name):
        """Return the primary-key columns of ``table_name``.

        Args:
            table_name: Name of a table in ``self.db_file``.

        Returns:
            A list of ``"table.column"`` strings in table-definition order,
            covering every column of a composite primary key.
        """
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            # PRAGMA arguments cannot be bound as parameters; quoting keeps
            # reserved-word or unusual table names from breaking the statement.
            cursor.execute(f"PRAGMA table_info({self._quote_identifier(table_name)});")
            columns = cursor.fetchall()
        finally:
            # Close the connection even when the query raises.
            conn.close()

        # The last table_info field is the 1-based position of the column in
        # the primary key (0 when it is not part of it), so any positive value
        # marks a key column -- not just the value 1.
        return [f"{table_name}.{col[1]}" for col in columns if col[-1] > 0]

    def extract_relevant_columns_with_keys(self, identified_columns, identified_tables):
        """Return the identified columns plus the primary keys of each table.

        Args:
            identified_columns: Iterable of ``"table.column"`` names.
            identified_tables: Iterable of table names whose keys to add.

        Returns:
            A new set; the inputs are not modified.
        """
        relevant_columns = set(identified_columns)
        for table in identified_tables:
            relevant_columns.update(self.extract_primary_keys(table))
        return relevant_columns

    def run(self):
        """Run every stage for ``self.user_query`` and print progress.

        Returns:
            ``(column_similarity_df, identified_tables, relevant_columns)``:
            the per-keyword score frame, the set of matched table names, and
            the set of matched columns including primary keys.
        """
        print("Extracting keywords...")
        keywords = self.keyword_extractor.extract_keywords(self.user_query)
        print(f"Keywords: {keywords}")

        print("Identifying relevant columns and calculating similarity scores...")
        column_similarity_df, identified_columns = self.identify_relevant_columns(keywords)
        print("Similarity Scores for Columns as Table:")
        print(column_similarity_df.to_string(index=False))

        print("Extracting relevant tables from columns...")
        identified_tables = self.extract_tables_from_columns(identified_columns)

        print("Extracting relevant columns including primary keys...")
        relevant_columns = self.extract_relevant_columns_with_keys(identified_columns, identified_tables)

        print("Relevant Columns Including Primary Keys:")
        print(relevant_columns)

        return column_similarity_df, identified_tables, relevant_columns

# Main Runner
if __name__ == "__main__":
    # Demo configuration: a single question against the sample database.
    DB_FILE = "company_data.db"
    USER_QUERY = "How many colleagues work Onshore vs Offshore?"
    METHOD = "sentence_transformer"  # the only backend the classes above accept
    SIMILARITY_METRIC = "cosine"  # one of "cosine", "euclidean", "manhattan", "levenshtein"
    SIMILARITY_THRESHOLD = 0.5

    # Build the pipeline and run all of its stages once.
    pipeline = RAGPipeline(
        DB_FILE,
        USER_QUERY,
        method=METHOD,
        similarity_metric=SIMILARITY_METRIC,
        similarity_threshold=SIMILARITY_THRESHOLD,
    )
    column_similarity_scores, identified_tables, relevant_columns = pipeline.run()

    # Summarise the outcome: each heading is followed by its set on the next line.
    print("\nFinal Identified Tables:", identified_tables, sep="\n")
    print("\nFinal Relevant Columns Including Primary Keys:", relevant_columns, sep="\n")


Extracting keywords...
Keywords: ['How many colleagues', 'Onshore', 'Offshore']
Identifying relevant columns and calculating similarity scores...
Similarity Scores for Columns as Table:
            Keyword  Identified Columns Similarity Scores
How many colleagues                  []                []
            Onshore [people.is_onshore]       [0.7412474]
           Offshore [people.is_onshore]       [0.5254764]
Extracting relevant tables from columns...
Extracting relevant columns including primary keys...
Relevant Columns Including Primary Keys:
{'people.resource_id', 'people.is_onshore'}

Final Identified Tables:
{'people'}

Final Relevant Columns Including Primary Keys:
{'people.resource_id', 'people.is_onshore'}


In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score

def print_formatted_evaluation(question_no, question, expected_tables, expected_columns, predicted_tables, predicted_columns):
    """Print one question's expected and predicted tables/columns to stdout.

    Args:
        question_no: Identifier shown in the header.
        question: The question text shown in the header.
        expected_tables: Collection of expected table names.
        expected_columns: Collection of expected column names.
        predicted_tables: Collection of predicted table names.
        predicted_columns: Collection of predicted column names.

    Each collection is printed as its items joined by ``", "`` in iteration
    order; an empty or ``None`` collection is printed as ``None``.
    """
    rule = "=" * 50

    def render(items):
        # Falsy collections (empty or None) are shown as the word "None".
        if not items:
            return "None"
        return ", ".join(str(item) for item in items)

    # Header block.
    print("\n" + rule)
    print(f"Evaluating Question {question_no}:")
    print(f"Question: {question}")
    print(rule)

    # The four sections share one layout: heading line, then the rendered items.
    sections = (
        ("\nExpected Tables:", expected_tables),
        ("\nPredicted Tables:", predicted_tables),
        ("\nExpected Columns:", expected_columns),
        ("\nPredicted Columns:", predicted_columns),
    )
    for heading, items in sections:
        print(heading)
        print(render(items))
    print(rule)

def _parse_name_set(cell):
    """Turn one comma-separated cell into a set of stripped, non-empty names."""
    # pandas reads an empty CSV cell as a float NaN; treat any non-string
    # value as "nothing expected" instead of crashing on .split().
    if not isinstance(cell, str):
        return set()
    return {part.strip() for part in cell.split(",") if part.strip()}


def _score_name_sets(expected, predicted):
    """Compare two sets and return the overlap sets plus the four metrics."""
    correct = expected & predicted
    union = expected | predicted
    precision = len(correct) / len(predicted) if predicted else 0
    recall = len(correct) / len(expected) if expected else 0
    f1 = (
        2 * (precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0
    )
    return {
        'correct': correct,
        'missed': expected - predicted,
        'extra': predicted - expected,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        # "Accuracy" here is |correct| / |expected ∪ predicted|.
        'accuracy': len(correct) / len(union) if union else 0,
    }


def _mean(values):
    """Return the arithmetic mean of ``values``, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


def evaluate_rag_pipeline(rag_pipeline, eval_data, question_no=None, n_questions=None):
    """Run the pipeline on labelled questions and score its predictions.

    Args:
        rag_pipeline: Object with a writable ``user_query`` attribute and a
            ``run()`` method returning ``(frame, tables, columns)``. Its
            ``user_query`` is overwritten for every evaluated question.
        eval_data: DataFrame with the columns ``question_no``,
            ``user_question``, ``tables`` and ``columns``; the last two hold
            comma-separated names.
        question_no: If given, evaluate only rows with this ``question_no``.
        n_questions: Otherwise, if given, evaluate only the first
            ``n_questions`` rows. Ignored when ``question_no`` is set.

    Returns:
        A DataFrame with one row per evaluated question holding the correct,
        missed and extra tables/columns and the precision, recall, F1 and
        accuracy for each. Per-question details and the averaged scores are
        also printed to stdout.
    """
    # question_no takes precedence over n_questions.
    if question_no is not None:
        eval_data = eval_data[eval_data['question_no'] == question_no]
    elif n_questions is not None:
        eval_data = eval_data.head(n_questions)

    evaluation_results = []

    for _, row in eval_data.iterrows():
        question = row['user_question']
        expected_tables = _parse_name_set(row['tables'])
        expected_columns = _parse_name_set(row['columns'])

        # Run the RAG pipeline to get predicted tables and columns.
        rag_pipeline.user_query = question
        _, predicted_tables, predicted_columns = rag_pipeline.run()
        # Coerce to sets so the set arithmetic below works for any iterable.
        predicted_tables = set(predicted_tables)
        predicted_columns = set(predicted_columns)

        table_scores = _score_name_sets(expected_tables, predicted_tables)
        column_scores = _score_name_sets(expected_columns, predicted_columns)

        print_formatted_evaluation(
            question_no=row['question_no'],
            question=question,
            expected_tables=expected_tables,
            expected_columns=expected_columns,
            predicted_tables=predicted_tables,
            predicted_columns=predicted_columns
        )

        evaluation_results.append({
            'question_no': row['question_no'],
            'user_question': question,
            'correct_tables': list(table_scores['correct']),
            'missed_tables': list(table_scores['missed']),
            'extra_tables': list(table_scores['extra']),
            'correct_columns': list(column_scores['correct']),
            'missed_columns': list(column_scores['missed']),
            'extra_columns': list(column_scores['extra']),
            'table_precision': table_scores['precision'],
            'table_recall': table_scores['recall'],
            'table_f1': table_scores['f1'],
            'table_accuracy': table_scores['accuracy'],
            'column_precision': column_scores['precision'],
            'column_recall': column_scores['recall'],
            'column_f1': column_scores['f1'],
            'column_accuracy': column_scores['accuracy'],
        })

    # Averages are taken over the evaluated questions (0 when there are none).
    print("\nFinal Scores:")
    summary = (
        ("Table F1 Score", 'table_f1'),
        ("Table Precision", 'table_precision'),
        ("Table Recall", 'table_recall'),
        ("Table Accuracy", 'table_accuracy'),
        ("Column F1 Score", 'column_f1'),
        ("Column Precision", 'column_precision'),
        ("Column Recall", 'column_recall'),
        ("Column Accuracy", 'column_accuracy'),
    )
    for label, key in summary:
        average = _mean([result[key] for result in evaluation_results])
        print(f"Average {label}: {average:.4f}")

    return pd.DataFrame(evaluation_results)

# Evaluation runner: builds its own RAGPipeline and scores it against the labelled questions
if __name__ == "__main__":
    # Pipeline configuration; the similarity threshold is left at the
    # RAGPipeline default because none is passed below.
    DB_FILE = "company_data.db"
    METHOD = "sentence_transformer"  # the only backend the classes above accept
    SIMILARITY_METRIC = "cosine"  # one of "cosine", "euclidean", "manhattan", "levenshtein"

    # Labelled questions with their expected tables and columns.
    eval_data_path = "processed_sql_rag_eval.csv"
    eval_data = pd.read_csv(eval_data_path)

    # The query starts empty; evaluate_rag_pipeline sets it for every question.
    pipeline = RAGPipeline(
        DB_FILE,
        user_query="",
        method=METHOD,
        similarity_metric=SIMILARITY_METRIC,
    )

    # Score the first 45 rows of the evaluation set.
    evaluation_df = evaluate_rag_pipeline(
        rag_pipeline=pipeline,
        eval_data=eval_data,
        n_questions=45,
    )

    # Heading line followed by the full per-question table.
    print("RAG Pipeline Evaluation Results:", evaluation_df.to_string(index=False), sep="\n")


Extracting keywords...
Keywords: ['How many colleagues', 'ART', 'Q3', '- ART']
Identifying relevant columns and calculating similarity scores...
Similarity Scores for Columns as Table:
            Keyword                                                                                                                 Identified Columns                                                                           Similarity Scores
How many colleagues                                                                                                           [people.colleague_layer]                                                                                [0.45413017]
                ART [art.resource_id, art.full_name, art.group, art.start_date, art.course_name, art.art_due_date, art.final_art_status, art.run_date] [0.55793834, 0.7506598, 0.7629296, 0.48986623, 0.6215074, 0.59318125, 0.6344775, 0.5255929]
                 Q3                                                                   

In [14]:
# Print the entire schema
from pprint import pprint

def print_schema(db_file):
    """Pretty-print the table-to-columns mapping of the database at ``db_file``."""
    pprint(SchemaExtractor(db_file).extract_schema())

# Dump the schema of the sample database under a heading.
DB_FILE = "company_data.db"
print("Database Schema:")
print_schema(db_file=DB_FILE)

Database Schema:
{'art': ['resource_id',
         'full_name',
         'group',
         'start_date',
         'compliance_program',
         'course_name',
         'course_completion_date',
         'art_due_date',
         'course_completion_status',
         'final_art_status',
         'wd_file_date',
         'run_date'],
 'holiday_balance': ['colleague_id',
                     'report_date',
                     'carried_forward',
                     'accrued',
                     'entitlement',
                     'booked',
                     'taken',
                     'booked_not_yet_taken',
                     'unbooked',
                     'untaken',
                     'using_workday',
                     'hours_not_using_workday',
                     'Unbooked_ex_nuw'],
 'holiday_details': ['colleague_id',
                     'hours',
                     'date',
                     'entered_on',
                     'report_date',
                     '

In [27]:
import sqlite3

def extract_table_and_column_descriptions(db_file):
    """Describe every column of every table in a SQLite database.

    Args:
        db_file: Path handed to ``sqlite3.connect``.

    Returns:
        ``{table_name: [description, ...]}`` where each description is a dict
        with the keys ``"column_name"``, ``"data_type"``, ``"not_null"``
        (a bool) and ``"default_value"``, taken from ``PRAGMA table_info``.
    """
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = [row[0] for row in cursor.fetchall()]

        descriptions = {}
        for table_name in table_names:
            # PRAGMA arguments cannot be bound as parameters, so the name is
            # double-quoted (embedded quotes doubled) to survive reserved
            # words and unusual characters.
            quoted = '"' + table_name.replace('"', '""') + '"'
            cursor.execute(f"PRAGMA table_info({quoted});")
            # table_info fields used: 1 = name, 2 = type, 3 = notnull, 4 = default.
            descriptions[table_name] = [
                {"column_name": col[1], "data_type": col[2], "not_null": bool(col[3]), "default_value": col[4]}
                for col in cursor.fetchall()
            ]
        return descriptions
    finally:
        # Close the connection even when one of the queries raises.
        conn.close()

# Example usage
if __name__ == "__main__":
    # Print a per-table listing of every column and its declared attributes.
    db_file = "company_data.db"
    table_descriptions = extract_table_and_column_descriptions(db_file)
    for table, columns in table_descriptions.items():
        print(f"Table: {table}")
        for column in columns:
            # One line per column; the literal is split only for readability.
            print(
                f"  Column: {column['column_name']}, Type: {column['data_type']}, "
                f"Not Null: {column['not_null']}, Default: {column['default_value']}"
            )


Table: people
  Column: resource_id, Type: TEXT, Not Null: False, Default: None
  Column: full_name, Type: TEXT, Not Null: False, Default: None
  Column: email, Type: TEXT, Not Null: False, Default: None
  Column: first_name, Type: TEXT, Not Null: False, Default: None
  Column: last_name, Type: TEXT, Not Null: False, Default: None
  Column: direct_line_manager_id, Type: TEXT, Not Null: False, Default: None
  Column: direct_line_manager_name, Type: TEXT, Not Null: False, Default: None
  Column: line_manager_email, Type: TEXT, Not Null: False, Default: None
  Column: location_name, Type: TEXT, Not Null: False, Default: None
  Column: data_source, Type: TEXT, Not Null: False, Default: None
  Column: hub_location, Type: TEXT, Not Null: False, Default: None
  Column: is_onshore, Type: TEXT, Not Null: False, Default: None
  Column: fte, Type: REAL, Not Null: False, Default: None
  Column: scheduled_working_hours, Type: REAL, Not Null: False, Default: None
  Column: grade, Type: TEXT, Not Nul