In [8]:
import sqlite3
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from Levenshtein import distance as levenshtein_distance
from rank_bm25 import BM25Okapi
import spacy
import pandas as pd
import networkx as nx

# Keyword Extractor
class KeywordExtractor:
    """Extract candidate keywords (noun phrases) from a natural-language query.

    Parameters
    ----------
    method : str
        Extraction backend. Only ``"sentence_transformer"`` is accepted.

    Raises
    ------
    ValueError
        If ``method`` is not ``"sentence_transformer"``.
    """

    def __init__(self, method="sentence_transformer"):
        # Validate first so an unsupported method fails before any model is loaded.
        if method != "sentence_transformer":
            raise ValueError("Supported method: 'sentence_transformer'.")
        self.method = method
        self.nlp = spacy.load("en_core_web_sm")  # spaCy pipeline used for noun-chunk extraction
        # Kept as a public attribute for backward compatibility; extract_keywords
        # itself only relies on the spaCy pipeline above.
        self.tokenizer = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def extract_keywords(self, query):
        """Return the text of every noun chunk spaCy finds in ``query``.

        Parameters
        ----------
        query : str
            The user question to analyse.

        Returns
        -------
        list of str
            Noun-phrase texts in the order spaCy yields them; an empty list
            when ``query`` is ``None``, empty, or whitespace only.
        """
        # Nothing to parse: skip the spaCy call instead of failing on None.
        if not query or not query.strip():
            return []
        doc = self.nlp(query)
        return [chunk.text for chunk in doc.noun_chunks]

# Embedding Handler
class EmbeddingHandler:
    """Encode text with a sentence-transformer model and score pairs of items.

    Parameters
    ----------
    method : str
        Embedding backend. Only ``"sentence_transformer"`` is accepted; any
        other value raises ``ValueError``.
    """

    def __init__(self, method="sentence_transformer"):
        self.method = method
        if method != "sentence_transformer":
            raise ValueError("Supported method: 'sentence_transformer'.")
        self.model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    def get_embedding(self, text):
        """Return the model's encoding of ``text``."""
        encoded = self.model.encode(text)
        return encoded

    def calculate_similarity(self, emb1, emb2, metric="cosine", **kwargs):
        """Score two items with the requested metric (higher means more alike).

        Parameters
        ----------
        emb1, emb2
            Embedding vectors; used by ``"cosine"``, ``"euclidean"`` and
            ``"manhattan"`` only.
        metric : str
            One of ``"cosine"``, ``"euclidean"``, ``"manhattan"``,
            ``"jaccard"`` or ``"levenshtein"``.
        **kwargs
            ``set1``/``set2`` (iterables) for ``"jaccard"``;
            ``str1``/``str2`` (strings) for ``"levenshtein"``.

        Returns
        -------
        number
            Cosine similarity; the negated L2 or L1 distance (so never
            positive); the Jaccard index; or ``1 - distance / max_len`` for
            Levenshtein. Jaccard and Levenshtein return 0 when either input
            is empty or missing.

        Raises
        ------
        ValueError
            If ``metric`` is not one of the supported names.
        """
        if metric == "cosine":
            pairwise = cosine_similarity([emb1], [emb2])
            return pairwise[0][0]

        if metric == "euclidean":
            delta = np.array(emb1) - np.array(emb2)
            return -np.linalg.norm(delta)

        if metric == "manhattan":
            delta = np.array(emb1) - np.array(emb2)
            return -np.sum(np.abs(delta))

        if metric == "jaccard":
            left = set(kwargs.get("set1", []))
            right = set(kwargs.get("set2", []))
            if not (left and right):
                # An empty side makes the comparison meaningless; score it 0.
                return 0
            shared = left.intersection(right)
            combined = left.union(right)
            return len(shared) / len(combined)

        if metric == "levenshtein":
            first = kwargs.get("str1", "")
            second = kwargs.get("str2", "")
            if not (first and second):
                # An empty side makes the comparison meaningless; score it 0.
                return 0
            longest = max(len(first), len(second))
            return 1 - levenshtein_distance(first, second) / longest

        raise ValueError("Unsupported similarity metric.")

# Schema Extractor
class SchemaExtractor:
    """Read table and column names from a SQLite database file.

    Parameters
    ----------
    db_file : str
        Path handed to ``sqlite3.connect``.
    """

    def __init__(self, db_file):
        self.db_file = db_file

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
        return '"' + name.replace('"', '""') + '"'

    def extract_schema(self):
        """Return a mapping of table name to its list of column names.

        Every row of ``sqlite_master`` with ``type='table'`` is included, and
        the columns are listed in the order ``PRAGMA table_info`` reports them.

        Returns
        -------
        dict of str to list of str
        """
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            table_names = [row[0] for row in cursor.fetchall()]

            schema = {}
            for table_name in table_names:
                # PRAGMA arguments cannot be bound as parameters, so the name is
                # quoted instead; this keeps tables whose names contain spaces
                # or reserved words from producing a syntax error.
                cursor.execute(f"PRAGMA table_info({self._quote_identifier(table_name)});")
                schema[table_name] = [col[1] for col in cursor.fetchall()]
            return schema
        finally:
            # Release the connection even when one of the queries raises.
            conn.close()

# Connections
class Connections:
    """Look up primary-key and foreign-key metadata for tables in a SQLite file.

    Parameters
    ----------
    db_file : str
        Path handed to ``sqlite3.connect``.
    """

    def __init__(self, db_file):
        self.db_file = db_file

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQL identifier (embedded quotes doubled)."""
        return '"' + name.replace('"', '""') + '"'

    def extract_keys(self, table_name):
        """Return the primary-key columns and foreign keys of ``table_name``.

        Parameters
        ----------
        table_name : str
            Name of the table to inspect.

        Returns
        -------
        tuple
            ``(primary_keys, foreign_keys)``. ``primary_keys`` is a list of
            column names in table-column order. ``foreign_keys`` is a list of
            dicts with the keys ``"from_column"``, ``"to_table"`` and
            ``"to_column"``.
        """
        quoted = self._quote_identifier(table_name)
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()

            # table_info rows are (cid, name, type, notnull, dflt_value, pk).
            # pk is 0 for non-key columns and the 1-based position inside the
            # key otherwise, so a composite key has members with pk 2, 3, ...
            cursor.execute(f"PRAGMA table_info({quoted});")
            primary_keys = [col[1] for col in cursor.fetchall() if col[5] > 0]

            # foreign_key_list rows are (id, seq, table, from, to, ...).
            cursor.execute(f"PRAGMA foreign_key_list({quoted});")
            foreign_keys = [
                {
                    "from_column": key[3],
                    "to_table": key[2],
                    "to_column": key[4],
                }
                for key in cursor.fetchall()
            ]
            return primary_keys, foreign_keys
        finally:
            # Release the connection even when one of the queries raises.
            conn.close()

# RAG Pipeline
class RAGPipeline:
    """Match keywords from a user query against a SQLite schema.

    Construction extracts the schema of ``db_file`` once; ``run`` then pulls
    keywords out of ``user_query`` and reports which table names and
    ``table.column`` names score at or above ``similarity_threshold``.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database whose schema is searched.
    user_query : str
        Natural-language question to analyse.
    method : str
        Passed to both ``EmbeddingHandler`` and ``KeywordExtractor``.
    similarity_metric : str
        Metric name forwarded to ``EmbeddingHandler.calculate_similarity``.
    similarity_threshold : float
        Minimum score a candidate needs to be reported. The ``"euclidean"``
        and ``"manhattan"`` metrics return negated distances, so they only
        produce matches when this threshold is zero or negative.
    """

    def __init__(self, db_file, user_query, method="sentence_transformer", similarity_metric="cosine", similarity_threshold=0.4):
        self.db_file = db_file
        self.user_query = user_query
        self.similarity_metric = similarity_metric
        self.similarity_threshold = similarity_threshold
        self.embedding_handler = EmbeddingHandler(method=method)
        self.keyword_extractor = KeywordExtractor(method=method)
        self.schema_extractor = SchemaExtractor(db_file)
        self.schema = self.schema_extractor.extract_schema()
        self.connections = Connections(db_file)

    @staticmethod
    def _tokenize(text):
        """Split ``text`` into a set of lowercase word tokens for the Jaccard metric."""
        # Treat '.' and '_' as separators so "people.full_name" yields
        # {"people", "full", "name"} and can overlap with plain-English keywords.
        return set(text.lower().replace(".", " ").replace("_", " ").split())

    def _match_keywords(self, keywords, candidates, result_column):
        """Score every candidate name against every keyword.

        Returns a DataFrame with one row per keyword and the columns
        ``"Keyword"``, ``result_column`` and ``"Similarity Scores"``; matches
        are ordered by descending score.
        """
        # Candidate embeddings do not depend on the keyword, so each one is
        # computed at most once per call (lazily: none when there are no keywords).
        candidate_embeddings = {}
        results = []
        for keyword in keywords:
            keyword_embedding = self.embedding_handler.get_embedding(keyword)
            keyword_tokens = self._tokenize(keyword)
            scored = []
            for name in candidates:
                if name not in candidate_embeddings:
                    candidate_embeddings[name] = self.embedding_handler.get_embedding(name)
                similarity = self.embedding_handler.calculate_similarity(
                    candidate_embeddings[name], keyword_embedding, metric=self.similarity_metric,
                    str1=name, str2=keyword,  # Used by the Levenshtein metric
                    set1=self._tokenize(name), set2=keyword_tokens,  # Used by the Jaccard metric
                )
                if similarity >= self.similarity_threshold:
                    scored.append((name, similarity))
            # Best match first.
            scored.sort(key=lambda pair: pair[1], reverse=True)
            results.append({
                "Keyword": keyword,
                result_column: [name for name, _ in scored],
                "Similarity Scores": [score for _, score in scored]
            })
        # Explicit columns keep the frame's layout stable when there are no keywords.
        return pd.DataFrame(results, columns=["Keyword", result_column, "Similarity Scores"])

    def identify_relevant_tables(self, keywords):
        """
        Identify tables whose name is similar enough to each keyword.

        Returns a DataFrame with the columns "Keyword", "Identified Tables"
        and "Similarity Scores"; tables are sorted by descending score.
        """
        return self._match_keywords(keywords, list(self.schema), "Identified Tables")

    def identify_relevant_columns(self, keywords):
        """
        Identify relevant columns in the database schema based on the extracted keywords,
        independently of the relevance of the parent table.

        Each column is compared under its qualified name "table.column".
        Returns a DataFrame with the columns "Keyword", "Identified Columns"
        and "Similarity Scores"; columns are sorted by descending score.
        """
        qualified_names = [
            f"{table}.{column}"
            for table, cols in self.schema.items()
            for column in cols
        ]
        return self._match_keywords(keywords, qualified_names, "Identified Columns")

    def run(self):
        """Extract keywords from the user query, match them, print and return both result tables.

        Returns
        -------
        tuple of pandas.DataFrame
            ``(table_similarity_df, column_similarity_df)``.
        """
        print("Extracting keywords...")
        keywords = self.keyword_extractor.extract_keywords(self.user_query)
        print(f"Keywords: {keywords}")

        print("Identifying relevant tables and calculating similarity scores...")
        table_similarity_df = self.identify_relevant_tables(keywords)

        print("Similarity Scores and Keys as Table:")
        print(table_similarity_df.to_string(index=False))

        print("Identifying relevant columns and calculating similarity scores...")
        column_similarity_df = self.identify_relevant_columns(keywords)

        print("Similarity Scores for Columns as Table:")
        print(column_similarity_df.to_string(index=False))

        return table_similarity_df, column_similarity_df

# Main Runner
if __name__ == "__main__":
    # --- Run configuration -------------------------------------------------
    DB_FILE = "company_data.db"  # SQLite database whose schema is searched
    USER_QUERY = "on average how many people are managed by Grade F managers?"
    METHOD = "sentence_transformer"  # Sentence-level embeddings
    # One of: "cosine", "euclidean", "manhattan", "jaccard", "levenshtein"
    SIMILARITY_METRIC = "cosine"

    # --- Build the pipeline and execute it ---------------------------------
    pipeline = RAGPipeline(
        db_file=DB_FILE,
        user_query=USER_QUERY,
        method=METHOD,
        similarity_metric=SIMILARITY_METRIC,
    )
    # run() prints its progress and returns the table-level and column-level results.
    table_similarity_scores, column_similarity_scores = pipeline.run()


Extracting keywords...
Keywords: ['how many people', 'Grade F managers']
Identifying relevant tables and calculating similarity scores...
Similarity Scores and Keys as Table:
         Keyword Identified Tables Similarity Scores
 how many people          [people]      [0.59598446]
Grade F managers                []                []
Identifying relevant columns and calculating similarity scores...
Similarity Scores for Columns as Table:
         Keyword                              Identified Columns                   Similarity Scores
 how many people [people.full_name, people.fte, people.division] [0.40601167, 0.4373518, 0.42937395]
Grade F managers                     [people.line_manager_grade]                        [0.52981186]


In [9]:
# Print the entire schema
from pprint import pprint

def print_schema(db_file):
    """Pretty-print the table-to-columns mapping extracted from ``db_file``.

    Parameters
    ----------
    db_file : str
        Path of the SQLite database, passed to ``SchemaExtractor``.
    """
    pprint(SchemaExtractor(db_file).extract_schema())

# Example usage: dump the schema of "company_data.db".
# DB_FILE is assigned again here, so this cell sets the path itself.
DB_FILE = "company_data.db"
print("Database Schema:")
print_schema(DB_FILE)

Database Schema:
{'art': ['resource_id',
         'full_name',
         'group',
         'start_date',
         'compliance_program',
         'course_name',
         'course_completion_date',
         'art_due_date',
         'course_completion_status',
         'final_art_status',
         'wd_file_date',
         'run_date'],
 'holiday_balance': ['colleague_id',
                     'report_date',
                     'carried_forward',
                     'accrued',
                     'entitlement',
                     'booked',
                     'taken',
                     'booked_not_yet_taken',
                     'unbooked',
                     'untaken',
                     'using_workday',
                     'hours_not_using_workday',
                     'Unbooked_ex_nuw'],
 'holiday_details': ['colleague_id',
                     'hours',
                     'date',
                     'entered_on',
                     'report_date',
                     '

In [1]:
# Report whether OPENAI_API_KEY is present in the environment.
import os

# os.getenv returns None when the variable is not set.
api_key = os.getenv("OPENAI_API_KEY")

# An unset or empty value is falsy and reported as missing; the key itself is never printed.
print(
    "OpenAI API key is registered."
    if api_key
    else "OpenAI API key is NOT registered. Please set it correctly."
)


OpenAI API key is registered.


In [None]:
people,"people.direct_line_manager_id, people.resource_id, people.grade"