In [2]:
import sqlite3
import torch
import numpy as np
from transformers import RobertaTokenizer, RobertaModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import networkx as nx
#another one
# Keyword Extractor
class KeywordExtractor:
    """Turn a natural-language query into a list of candidate keywords.

    The constructor loads the ``roberta-base`` tokenizer and model and keeps
    them on ``self.tokenizer`` / ``self.model``. The current extraction logic
    is a plain whitespace split and does not consult the model; the
    attributes are kept so that an embedding-based extractor can be dropped
    in later without changing how the class is constructed.
    """

    def __init__(self, method="roberta"):
        """Load the backing model.

        Args:
            method: Name of the extraction backend. Only ``"roberta"`` is
                accepted.

        Raises:
            ValueError: If ``method`` is anything other than ``"roberta"``.
        """
        # Guard first so an unsupported method fails before any model loading.
        if method != "roberta":
            raise ValueError("Only 'roberta' method is currently supported.")
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        self.model = RobertaModel.from_pretrained("roberta-base")

    def extract_keywords(self, query):
        """Return the whitespace-separated tokens of ``query``.

        Punctuation stays attached to its token (``"2024?"`` is returned
        as-is). An empty or all-whitespace query yields an empty list.

        Args:
            query: The user query string.

        Returns:
            list[str]: The tokens of ``query`` in their original order.
        """
        # The previous version ran a full model forward pass here and threw
        # the result away. It cost time on every call and, because the
        # tokenizer call did not truncate, could presumably fail on queries
        # longer than the model's maximum sequence length. The returned
        # keywords never depended on it.
        return query.split()  # Simple split for now (replace with improved logic)


# Embedding Handler
class EmbeddingHandler:
    """Produce text embeddings with ``roberta-base`` and compare them."""

    def __init__(self):
        """Load the ``roberta-base`` tokenizer and model."""
        checkpoint = "roberta-base"
        self.tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
        self.model = RobertaModel.from_pretrained(checkpoint)

    def get_embedding(self, text):
        """Embed ``text`` as a single NumPy vector.

        The text is tokenized (with truncation enabled), passed through the
        model without gradient tracking, and the model's
        ``last_hidden_state`` is averaged over dim 1 (presumably the token
        axis) before singleton dimensions are squeezed away.

        Args:
            text: The string to embed.

        Returns:
            numpy.ndarray: The pooled embedding.
        """
        encoded = self.tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            hidden_states = self.model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()

    def calculate_similarity(self, emb1, emb2):
        """Return the cosine similarity between two embedding vectors.

        Args:
            emb1: First embedding vector.
            emb2: Second embedding vector.

        Returns:
            The single entry of the 1x1 similarity matrix computed by
            ``cosine_similarity``.
        """
        # cosine_similarity works on 2-D inputs, so wrap each vector in a list.
        similarity_matrix = cosine_similarity([emb1], [emb2])
        return similarity_matrix[0][0]


# Schema Extractor
class SchemaExtractor:
    """Read table and column names out of a SQLite database file."""

    def __init__(self, db_file):
        """Remember which database file to inspect.

        Args:
            db_file: Path handed to ``sqlite3.connect``.
        """
        self.db_file = db_file

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQLite identifier."""
        # Embedded double quotes are escaped by doubling them.
        return '"' + name.replace('"', '""') + '"'

    def extract_schema(self):
        """Map every table listed in ``sqlite_master`` to its column names.

        Returns:
            dict[str, list[str]]: ``{table_name: [column_name, ...]}`` with
            columns in the order reported by ``PRAGMA table_info``.

        Raises:
            sqlite3.Error: Propagated from the underlying queries; the
                connection is closed before the error reaches the caller.
        """
        conn = sqlite3.connect(self.db_file)
        # try/finally so the connection is released even when a query fails.
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            table_names = [row[0] for row in cursor.fetchall()]

            schema = {}
            for table_name in table_names:
                # PRAGMA arguments cannot be bound as parameters, so the name
                # is quoted instead; unquoted names break on spaces and
                # reserved words.
                quoted = self._quote_identifier(table_name)
                cursor.execute(f"PRAGMA table_info({quoted});")
                # Field 1 of each table_info row is the column name.
                schema[table_name] = [col[1] for col in cursor.fetchall()]
            return schema
        finally:
            conn.close()


# Connections
class Connections:
    """Look up primary-key and foreign-key metadata in a SQLite database."""

    def __init__(self, db_file):
        """Remember which database file to inspect.

        Args:
            db_file: Path handed to ``sqlite3.connect``.
        """
        self.db_file = db_file

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a double-quoted SQLite identifier."""
        # Embedded double quotes are escaped by doubling them.
        return '"' + name.replace('"', '""') + '"'

    def extract_keys(self, table_name):
        """Return the primary-key columns and foreign keys of one table.

        Args:
            table_name: Name of the table to inspect.

        Returns:
            tuple[list[str], list[dict]]: ``(primary_keys, foreign_keys)``.
            ``primary_keys`` lists the key columns ordered by their position
            inside the key. Each foreign key is a dict with the entries
            ``"from_column"``, ``"to_table"`` and ``"to_column"``.

        Raises:
            sqlite3.Error: Propagated from the underlying queries; the
                connection is closed before the error reaches the caller.
        """
        # PRAGMA arguments cannot be bound as parameters, so the name is
        # quoted; this keeps names with spaces or reserved words working.
        quoted = self._quote_identifier(table_name)

        conn = sqlite3.connect(self.db_file)
        # try/finally so the connection is released even when a query fails.
        try:
            cursor = conn.cursor()

            # table_info rows are (cid, name, type, notnull, dflt_value, pk).
            # pk is 0 for non-key columns and otherwise the 1-based position
            # of the column within the primary key, so the members of a
            # composite key carry 1, 2, 3, ... A test for "== 1" would miss
            # every member after the first.
            cursor.execute(f"PRAGMA table_info({quoted});")
            key_columns = sorted(
                (col[5], col[1]) for col in cursor.fetchall() if col[5] > 0
            )
            primary_keys = [name for _, name in key_columns]

            # foreign_key_list rows are
            # (id, seq, table, from, to, on_update, on_delete, match).
            cursor.execute(f"PRAGMA foreign_key_list({quoted});")
            foreign_keys = [
                {
                    "from_column": key[3],
                    "to_table": key[2],
                    "to_column": key[4],
                }
                for key in cursor.fetchall()
            ]
            return primary_keys, foreign_keys
        finally:
            conn.close()


# RAG Pipeline
class RAGPipeline:
    """Match the keywords of a user query against a SQLite schema.

    The pipeline extracts keywords from the query, scores every table name
    and every ``table.column`` name against each keyword by embedding
    similarity, and assembles the matches plus the database's foreign-key
    links into a graph.

    NOTE(review): in the sample run recorded with this code every
    keyword/table pair scored above 0.95, so the default threshold of 0.4
    filtered nothing there. Confirm the threshold against real scores.
    """

    def __init__(self, db_file, user_query, similarity_threshold=0.4):
        """Build the helper objects and read the schema.

        Args:
            db_file: Path of the SQLite database to inspect.
            user_query: Natural-language question to analyse.
            similarity_threshold: Minimum similarity for a table or column
                to count as a match for a keyword.
        """
        self.db_file = db_file
        self.user_query = user_query
        self.similarity_threshold = similarity_threshold
        self.embedding_handler = EmbeddingHandler()
        self.keyword_extractor = KeywordExtractor()
        self.schema_extractor = SchemaExtractor(db_file)
        self.schema = self.schema_extractor.extract_schema()
        self.connections = Connections(db_file)

    def _match_keywords(self, keywords, candidates, result_column):
        """Score each keyword against each candidate name.

        Shared implementation of the table and column matching.

        Args:
            keywords: Keywords to look up.
            candidates: Names (tables or ``table.column`` strings) to score.
            result_column: Header of the DataFrame column holding the matches.

        Returns:
            pandas.DataFrame: One row per keyword with the columns
            ``"Keyword"``, ``result_column`` and ``"Similarity Scores"``.
        """
        keywords = list(keywords)

        # Candidate embeddings do not depend on the keyword, so compute each
        # one once up front instead of once per keyword. Skip the work when
        # there is nothing to compare against.
        candidate_embeddings = (
            [(name, self.embedding_handler.get_embedding(name)) for name in candidates]
            if keywords
            else []
        )

        results = []
        for keyword in keywords:
            keyword_embedding = self.embedding_handler.get_embedding(keyword)
            matches = []
            similarities = []
            for name, candidate_embedding in candidate_embeddings:
                similarity = self.embedding_handler.calculate_similarity(
                    candidate_embedding, keyword_embedding
                )
                if similarity >= self.similarity_threshold:
                    matches.append(name)
                    similarities.append(similarity)
            results.append({
                "Keyword": keyword,
                result_column: matches,
                "Similarity Scores": similarities,
            })

        # Explicit columns keep the frame well-formed when `results` is empty.
        return pd.DataFrame(
            results, columns=["Keyword", result_column, "Similarity Scores"]
        )

    def identify_relevant_tables(self, keywords):
        """Find, for every keyword, the tables whose name is similar enough.

        Args:
            keywords: Keywords to look up.

        Returns:
            pandas.DataFrame: Columns ``"Keyword"``, ``"Identified Tables"``
            and ``"Similarity Scores"``; the two list columns are aligned.
        """
        return self._match_keywords(keywords, list(self.schema), "Identified Tables")

    def identify_relevant_columns(self, keywords):
        """Find, for every keyword, the columns whose name is similar enough.

        Columns are scored and reported under their qualified
        ``table.column`` name.

        Args:
            keywords: Keywords to look up.

        Returns:
            pandas.DataFrame: Columns ``"Keyword"``, ``"Identified Columns"``
            and ``"Similarity Scores"``; the two list columns are aligned.
        """
        qualified_names = [
            f"{table}.{column}"
            for table, cols in self.schema.items()
            for column in cols
        ]
        return self._match_keywords(keywords, qualified_names, "Identified Columns")

    def build_subgraph(self, table_similarity_df, column_similarity_df):
        """Assemble matched tables, matched columns and foreign keys into a graph.

        Table nodes carry ``type="table"`` and column nodes ``type="column"``.
        Each matched column is linked to its table, and every foreign key
        found for any table in the schema links the two tables, with the
        participating columns stored on the edge. The nodes and edges are
        printed before the graph is returned.

        Args:
            table_similarity_df: Result of ``identify_relevant_tables``.
            column_similarity_df: Result of ``identify_relevant_columns``.

        Returns:
            networkx.Graph: The assembled graph.
        """
        graph = nx.Graph()

        # Add nodes for tables
        for _, row in table_similarity_df.iterrows():
            for table in row["Identified Tables"]:
                graph.add_node(table, type="table")

        # Add nodes for columns and edges to their tables
        for _, row in column_similarity_df.iterrows():
            for column in row["Identified Columns"]:
                table_name = column.split('.')[0]
                # Register the owning table explicitly: a node created
                # implicitly by add_edge would have no "type" attribute.
                graph.add_node(table_name, type="table")
                graph.add_node(column, type="column")
                graph.add_edge(table_name, column)

        # Add edges between tables based on foreign keys
        for table in self.schema:
            _, foreign_keys = self.connections.extract_keys(table)
            for fk in foreign_keys:
                # Tag both endpoints, for the same reason as above.
                graph.add_node(table, type="table")
                graph.add_node(fk["to_table"], type="table")
                graph.add_edge(table, fk["to_table"],
                               from_column=fk["from_column"],
                               to_column=fk["to_column"])

        print("Subgraph Nodes:")
        print(graph.nodes(data=True))

        print("Subgraph Edges:")
        print(graph.edges(data=True))

        return graph

    def run(self):
        """Execute the whole pipeline for ``self.user_query``.

        Progress and intermediate tables are printed along the way.

        Returns:
            tuple: ``(table_similarity_df, column_similarity_df, subgraph)``.
        """
        print("Extracting keywords...")
        keywords = self.keyword_extractor.extract_keywords(self.user_query)
        print(f"Keywords: {keywords}")

        print("Identifying relevant tables and calculating similarity scores...")
        table_similarity_df = self.identify_relevant_tables(keywords)

        print("Similarity Scores and Keys as Table:")
        print(table_similarity_df.to_string(index=False))

        print("Identifying relevant columns and calculating similarity scores...")
        column_similarity_df = self.identify_relevant_columns(keywords)

        print("Similarity Scores for Columns as Table:")
        print(column_similarity_df.to_string(index=False))

        print("Building subgraph of relevant tables and columns...")
        subgraph = self.build_subgraph(table_similarity_df, column_similarity_df)

        return table_similarity_df, column_similarity_df, subgraph

#random comment
# Main Runner
if __name__ == "__main__":
    # Database to inspect and the question to analyse.
    DB_FILE = "company_data.db"
    USER_QUERY = "How many people have booked at least 50 hours before 01/05/2024?"

    # Build the pipeline (loads the models and reads the schema), then run it.
    pipeline = RAGPipeline(db_file=DB_FILE, user_query=USER_QUERY)
    (
        table_similarity_scores,
        column_similarity_scores,
        subgraph,
    ) = pipeline.run()


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracting keywords...
Keywords: ['How', 'many', 'people', 'have', 'booked', 'at', 'least', '50', 'hours', 'before', '01/05/2024?']
Identifying relevant tables and calculating similarity scores...
Similarity Scores and Keys as Table:
    Keyword                               Identified Tables                                Similarity Scores
        How [people, art, holiday_balance, holiday_details]    [0.98557395, 0.9840101, 0.962958, 0.95748514]
       many [people, art, holiday_balance, holiday_details]    [0.9954591, 0.9922095, 0.96482253, 0.9569932]
     people [people, art, holiday_balance, holiday_details]    [1.0000001, 0.99318254, 0.9658507, 0.9580249]
       have [people, art, holiday_balance, holiday_details]   [0.99313766, 0.9910682, 0.9624611, 0.95439327]
     booked [people, art, holiday_balance, holiday_details]  [0.97246534, 0.9716304, 0.96912444, 0.96429884]
         at [people, art, holiday_balance, holiday_details]      [0.9893686, 0.9891387, 0.956673, 0.9503047]
   