In [1]:
import sys
import os
from pathlib import Path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from src.config.config_loader import ConfigLoader
from src.config.factory import ServiceFactory

  from tqdm.autonotebook import tqdm, trange
  functions.register_function("flatten", flatten)


In [2]:
config_path = os.path.join(project_root, 'config.yaml')
config = ConfigLoader.load_config(config_path)
print(config)

AppConfig(database=DatabaseConfig(type='postgres', host='localhost', port=5432, database='postgres', user='postgres', password='admin', schema='video_games', warehouse=None, account=None, role=None), llm=LLMConfig(type='ollama', api_key=None, model='llama3.1', base_url='http://localhost:11434', language=<SupportedLanguage.ENGLISH: 'english'>), prompt=PromptConfig(include_sample_data=True, max_sample_rows=3), vector_store=VectorStoreConfig(enabled=True, type='qdrant', collection_name='video_games_store', path='./data/video_games_store', url=None, embedding=EmbeddingConfig(type='huggingface', model_name='sentence-transformers/multi-qa-MiniLM-L6-cos-v1', api_key=None), api_key=None, batch_size=100), debug=True)


In [3]:
db = ServiceFactory.create_db_connector(config.database)
if not db.connect():
    raise RuntimeError("Failed to connect to database")

In [4]:
metadata_retriever = ServiceFactory.create_metadata_retriever(config.database, db)

In [5]:
metadata = metadata_retriever.get_all_tables_metadata()
metadata

{'genre': TableMetadata(name='genre', columns=[{'name': 'id', 'type': 'INTEGER', 'nullable': False}, {'name': 'genre_name', 'type': 'VARCHAR(50)', 'nullable': True}], primary_keys=['id'], foreign_keys=[], row_count=12),
 'publisher': TableMetadata(name='publisher', columns=[{'name': 'id', 'type': 'INTEGER', 'nullable': False}, {'name': 'publisher_name', 'type': 'VARCHAR(100)', 'nullable': True}], primary_keys=['id'], foreign_keys=[], row_count=577),
 'platform': TableMetadata(name='platform', columns=[{'name': 'id', 'type': 'INTEGER', 'nullable': False}, {'name': 'platform_name', 'type': 'VARCHAR(50)', 'nullable': True}], primary_keys=['id'], foreign_keys=[], row_count=31),
 'region': TableMetadata(name='region', columns=[{'name': 'id', 'type': 'INTEGER', 'nullable': False}, {'name': 'region_name', 'type': 'VARCHAR(50)', 'nullable': True}], primary_keys=['id'], foreign_keys=[], row_count=4),
 'game': TableMetadata(name='game', columns=[{'name': 'id', 'type': 'INTEGER', 'nullable': Fals

In [6]:
for table_name, table_info in metadata.items():
    print(f"\nTabella: {table_name}")
    print("Colonne:")
    for col in table_info.columns:
        print(f"  - {col}")
    print(f"Primary keys: {table_info.primary_keys}")
    print(f"Foreign keys: {table_info.foreign_keys}")
    print(f"Row count: {table_info.row_count}")


Tabella: genre
Colonne:
  - {'name': 'id', 'type': 'INTEGER', 'nullable': False}
  - {'name': 'genre_name', 'type': 'VARCHAR(50)', 'nullable': True}
Primary keys: ['id']
Foreign keys: []
Row count: 12

Tabella: publisher
Colonne:
  - {'name': 'id', 'type': 'INTEGER', 'nullable': False}
  - {'name': 'publisher_name', 'type': 'VARCHAR(100)', 'nullable': True}
Primary keys: ['id']
Foreign keys: []
Row count: 577

Tabella: platform
Colonne:
  - {'name': 'id', 'type': 'INTEGER', 'nullable': False}
  - {'name': 'platform_name', 'type': 'VARCHAR(50)', 'nullable': True}
Primary keys: ['id']
Foreign keys: []
Row count: 31

Tabella: region
Colonne:
  - {'name': 'id', 'type': 'INTEGER', 'nullable': False}
  - {'name': 'region_name', 'type': 'VARCHAR(50)', 'nullable': True}
Primary keys: ['id']
Foreign keys: []
Row count: 4

Tabella: game
Colonne:
  - {'name': 'id', 'type': 'INTEGER', 'nullable': False}
  - {'name': 'genre_id', 'type': 'INTEGER', 'nullable': True}
  - {'name': 'game_name', 'type'

In [None]:
from typing import Dict, Any, List
import re
from openai import OpenAI
import os

In [14]:
def generate_table_description(client: OpenAI, table_info) -> str:
    """Genera descrizione della tabella usando OpenAI"""
    
    # COSTRUZIONE DEL PROMPT
    
    # colonne
    columns_info = [
        f"- {col['name']} ({col['type']}) {'NOT NULL' if not col['nullable'] else ''}"
        for col in table_info.columns
    ]
    
    # fks
    foreign_keys_info = []
    for fk in table_info.foreign_keys:
        from_cols = ', '.join(fk['constrained_columns'])
        to_table = fk['referred_table']
        to_cols = ', '.join(fk['referred_columns'])
        foreign_keys_info.append(f"- {from_cols} -> {to_table}({to_cols})")

    # template
    prompt = f"""Analyze this database table and provide a concise description of its purpose and content.

Table: {table_info.name}
Number of records: {table_info.row_count}

Columns:
{chr(10).join(columns_info)}

Primary Keys: {', '.join(table_info.primary_keys)}

Foreign Keys:
{chr(10).join(foreign_keys_info) if foreign_keys_info else 'No foreign keys'}

Provide a clear and concise description in 2-3 sentences."""

    # invia il prompt e ricevi la descrizione
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a database expert providing concise table descriptions."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.2,
    )
    
    return response.choices[0].message.content.strip()

In [13]:
def extract_keywords(table_info) -> List[str]:
    """Estrae keywords dai metadati della tabella"""
    keywords = set()
    
    # dal nome della tabella
    table_words = split_camel_case(table_info.name)
    keywords.update(table_words)
    
    # dai nomi delle colonne
    for col in table_info.columns:
        col_words = split_camel_case(col['name'])
        keywords.update(col_words)
        
    # dalle tabelle correlate
    for fk in table_info.foreign_keys:
        keywords.add(fk['referred_table'])
    
    # rimuovi parole comuni e converti in minuscolo
    common_words = {'id', 'code', 'type', 'name', 'date', 'created', 'modified', 'status'}
    keywords = {word.lower() for word in keywords if word.lower() not in common_words}
    
    return sorted(list(keywords))

def split_camel_case(s: str) -> List[str]:
    """Divide una stringa in camel case o snake case nelle sue parole componenti"""
    # Gestisce snake_case
    words = s.split('_')
    result = []
    
    for word in words:
        # Gestisce CamelCase
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', word)
        result.extend(match.group(0) for match in matches)
    
    return [w for w in result if w]  # Rimuove stringhe vuote

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Any
from src.config.models.metadata import T

@dataclass
class EnhancedTableMetadata:
    base_metadata: TableMetadata
    description: str
    keywords: List[str]


def enhance_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """Arricchisce i metadati esistenti con descrizione e keywords"""
    
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    
    # per ogni tabella nel dizionario dei metadati, arrichiamo con descrizione e keywords
    for table_name, table_info in metadata.items():
        
        description = generate_table_description(client, table_info)
        keywords = extract_keywords(table_info)
        
        # aggiungi i nuovi campi al dizionario esistente
        table_info['description'] = description
        table_info['keywords'] = keywords
        
    return metadata

In [16]:
enhanced_metadata = enhance_metadata(metadata)

TypeError: 'TableMetadata' object does not support item assignment