## Create SQL table

In [24]:
import os

import pandas as pd
from google.cloud.sql.connector import Connector
import sqlalchemy
# Kept ahead of the explicit sqlalchemy names so those still win over anything the star import brings in.
from utils_sql import *
from sqlalchemy import text, Table, Column, String, Integer, Float, MetaData, PrimaryKeyConstraint
import pyodbc

# Database table names, grouped by asset type: (benchmarks table, metrics table).
img_benchmark, img_metrics = "image_nvai_benchmarks", "image_nvai_metrics"
vid_benchmarks, vid_metrics = "video_nvai_benchmarks", "video_nvai_metrics"

In [25]:
# Root folder holding the benchmark CSV exports. The default is the original
# machine-specific location; set the BENCHMARK_PATH environment variable to
# run the notebook from another machine without editing this cell.
BENCHMARK_PATH = os.environ.get(
    "BENCHMARK_PATH",
    "/Users/irinakw/Library/CloudStorage/GoogleDrive-i.white@neuronsinc.com/Shared drives/HQ - R&D/Benchmark Documents/",
)

# os.path.join tolerates a root given with or without a trailing slash.
data_img_benchmarks = pd.read_csv(
    os.path.join(BENCHMARK_PATH, 'Master Sheets', 'Benchmark CSVs', 'insights_image_level_newmetrics.csv')
)
data_img_metrics = pd.read_csv(os.path.join(BENCHMARK_PATH, 'eng_mem_images_total_metrics.csv'))

data_vid_benchmarks = pd.read_csv(
    os.path.join(BENCHMARK_PATH, 'Master Sheets', 'Benchmark CSVs', 'insights_video_level_newmetrics.csv')
)
data_vid_metrics = pd.read_csv(os.path.join(BENCHMARK_PATH, 'eng_mem_videos_total_metrics.csv'))

In [31]:
def infer_sqlalchemy_types(df):
    """Map every column of `df` to a SQLAlchemy column type based on its dtype.

    The dtype is classified with the pandas dtype predicates rather than by an
    exact dtype-name lookup, so narrower or nullable numeric dtypes (int32,
    float32, Int64, boolean, ...) are typed as numbers instead of silently
    falling back to String.

    Args:
        df: DataFrame whose columns should be described.

    Returns:
        dict mapping column name -> SQLAlchemy type class:
        Integer for boolean and integer columns (booleans are stored as 0/1),
        Float for floating-point columns, and String for everything else
        (text, datetimes, and any unrecognised dtype).
    """
    column_types = {}
    for col, dtype in df.dtypes.items():
        # Booleans are checked together with integers: both are stored as Integer.
        if pd.api.types.is_bool_dtype(dtype) or pd.api.types.is_integer_dtype(dtype):
            column_types[col] = Integer
        elif pd.api.types.is_float_dtype(dtype):
            column_types[col] = Float
        else:
            # Default to String (objects, datetimes, anything unrecognised).
            column_types[col] = String
    return column_types

def list_all_tables(conn):
    """Return the rows naming every table in the `public` schema.

    `conn` must provide `execute()`; the result rows are returned exactly as
    `fetchall()` yields them (one row per table).
    """
    statement = text(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public';"
    )
    rows = conn.execute(statement)
    return rows.fetchall()

In [30]:
# SQLAlchemy engine for PostgreSQL via the pg8000 driver. The URL carries no
# host or credentials: every connection is produced by `get_connection`.
engine = sqlalchemy.create_engine("postgresql+pg8000://", creator=get_connection)

In [33]:
# Describe the table from the dtypes of the video benchmark CSV.
metadata = MetaData()
column_types = infer_sqlalchemy_types(data_vid_benchmarks)

# NOTE(review): this rebinds `vid_benchmarks` (a table-name string earlier in
# the notebook) to a Table object, and the table is created under the name
# "data_vid_benchmarks" rather than that string — confirm both are intended.
vid_benchmarks = Table(
    "data_vid_benchmarks",
    metadata,
    *(Column(name, sa_type) for name, sa_type in column_types.items()),
    # Composite primary key: one row per combination of these columns.
    PrimaryKeyConstraint(
        "industry_category",
        "industry_subcategory",
        "usecase_category",
        "usecase_subcategory",
        "platform",
        "device",
        "context",
        "metric",
        "time",
        "type",
    ),
)

# Create the tables registered on `metadata`, then list what the database holds.
metadata.create_all(engine)
inspector = sqlalchemy.inspect(engine)
tables = inspector.get_table_names()
print("Tables in the database:", tables)

Tables in the database: ['data_vid_benchmarks', 'image_nvai_benchmarks']


In [43]:
# ODBC settings for the PostgreSQL server reached on 127.0.0.1:5432.
DB_CONFIG = {
    "DRIVER": "PostgreSQL Unicode",
    "SERVER": "127.0.0.1",
    "PORT": "5432",
    "DATABASE": "assets-experiment",
    "UID": "i.white@neuronsinc.com",
    "PWD": "",
}

# Create a connection and a cursor
conn_string = ";".join([f"{key}={value}" for key, value in DB_CONFIG.items()])
# Echo the settings with a non-empty password masked, so the secret never ends
# up in the saved notebook output.
print(";".join(f"{key}={'***' if key == 'PWD' and value else value}" for key, value in DB_CONFIG.items()))
conn = pyodbc.connect(conn_string)
cursor = conn.cursor()
print("Connection successful!")

query = """
SELECT table_name
FROM information_schema.tables
WHERE table_schema = 'public';
"""

cursor.execute(query)
tables = [row[0] for row in cursor.fetchall()]
# A real loop name is used instead of `_`, which IPython reserves for the last
# cell output.
for table_name in tables:
    print(table_name)

DRIVER=PostgreSQL Unicode;SERVER=127.0.0.1;PORT=5432;DATABASE=assets-experiment;UID=i.white@neuronsinc.com;PWD=
Connection successful!


In [44]:
# Bind the SQLAlchemy connection to its own name: reusing `conn` here would
# replace the notebook-level pyodbc connection with a connection that is
# already closed once this block exits.
with engine.connect() as sa_conn:
    # Use `text` to create an executable SQL statement
    query = text("""
        SELECT table_name
        FROM information_schema.tables
        WHERE table_schema = 'public';
    """)
    result = sa_conn.execute(query)
    print("Tables in the database:")
    for row in result:
        print(row[0])

Tables in the database:
data_vid_benchmarks
image_nvai_benchmarks


In [37]:
def count_rows_in_table(engine, table_name):
    """Print the total number of rows stored in `table_name`.

    Opens a short-lived connection from `engine`, runs a COUNT(*) against the
    table and prints the result; nothing is returned.
    """
    count_stmt = text(f"""
        SELECT COUNT(*) AS row_count
        FROM {table_name}
    """)
    with engine.connect() as db:
        # scalar() yields the single COUNT(*) value of the one-row result.
        total = db.execute(count_stmt).scalar()
        print(f"Number of all rows in the table '{table_name}': {total}")

# Rows in the image benchmark CSV versus rows already stored in its table.
print(f"Number of rows in the file img_benchmarks: {len(data_img_benchmarks)}")
count_rows_in_table(engine=engine, table_name=img_benchmark)

Number of rows in the file img_benchmarks: 87320
Number of all rows in the table 'image_nvai_benchmarks': 31600


In [35]:
def update_table(df, table_name, db_cursor=None, close_connection=True):
    """Insert the rows of `df` that are not yet present in `table_name`.

    A row counts as present when a row with identical values in every column
    of `df` already exists in the table. Missing values (NaN/NaT/None) are
    normalised to None first, so they compare equal to SQL NULLs and are sent
    to the driver as NULLs. `df` itself is never modified.

    Args:
        df: DataFrame whose columns match the columns of the target table.
        table_name: Name of the table to top up.
        db_cursor: DB-API cursor to run the statements on ("?" placeholders,
            exposing `.connection` as pyodbc cursors do). Defaults to the
            notebook-level `cursor`.
        close_connection: When True (the default, matching the historical
            behaviour) the cursor's connection is closed after a successful
            run. Pass False to keep it open for further calls.

    Raises:
        Any driver error is re-raised after the open transaction has been
        rolled back; the connection is left open in that case.
    """
    cur = cursor if db_cursor is None else db_cursor
    # Commit/rollback on the connection that owns the cursor rather than on the
    # global name `conn`, which other cells may have rebound.
    db_conn = cur.connection

    print(f"Number of rows in the CSV file: {len(df)}")

    columns = df.columns.tolist()
    column_list = ", ".join(columns)

    # Plain Python tuples: astype(object) unboxes numpy scalars, and missing
    # values become None so they match NULLs coming back from the database.
    csv_rows = [
        tuple(None if pd.isna(value) else value for value in record)
        for record in df.astype(object).itertuples(index=False, name=None)
    ]

    try:
        cur.execute(f"SELECT {column_list} FROM {table_name}")
        existing_rows = {tuple(row) for row in cur.fetchall()}

        missing_rows = [row for row in csv_rows if row not in existing_rows]
        print(f"Number of missing rows to add: {len(missing_rows)}")

        if missing_rows:
            placeholders = ", ".join(["?"] * len(columns))
            # Naming the columns keeps the insert correct even if the table's
            # column order differs from the DataFrame's.
            insert_sql = f"INSERT INTO {table_name} ({column_list}) VALUES ({placeholders})"
            cur.executemany(insert_sql, missing_rows)
            db_conn.commit()
        else:
            print("No new rows to add.")
    except Exception:
        # Leave no half-finished transaction behind on the shared connection.
        db_conn.rollback()
        raise

    if close_connection:
        db_conn.close()


In [36]:
# Add the image benchmark rows from the CSV that the table does not hold yet.
update_table(df=data_img_benchmarks, table_name=img_benchmark)


Number of rows in the CSV file: 87320


ProgrammingError: ('42P01', '[42P01] ERROR: relation "image_nvai_benchmarks" does not exist;\nError while executing the query (1) (SQLExecDirectW)')

### IMAGES

In [None]:
# Filter values for the image benchmark query.
# Only DESIRED_USECASE is passed to the call below.
DESIRED_INDUSTRY = 'all'
DESIRED_SUBINDUSTRY = 'all'
DESIRED_USECASE = 'digital_ads'
DESIRED_SUBUSECASE = 'out_of_home_ads'
DESIRED_PLATFORM = []

query_img_benchmarks(engine, usecase_category=DESIRED_USECASE)
