In [1]:
from sqlanalyzer import column_parser
import pandas as pd

## given db metadata

In [2]:
# Column metadata for a single table: one row per column name. The scalar
# 'db_table' value is broadcast by pandas against the list of column names.
db_fields = pd.DataFrame(dict(
    db_table='sfdc.accounts',
    all_columns=[
        'account_health_c', 'account_health_flag_c', 'account_health_last_touch_c',
        'account_notes_c', 'account_owner_c', 'account_owner_id_c',
        'account_segment_c', 'account_source', 'account_start_date_c',
        'account_tier_c', 'add_company_tags_single_c', 'annual_revenue',
        'billing_city', 'billing_country', 'billing_postal_code',
        'billing_state', 'billing_street', 'churned_date_c',
        'created_by_id', 'created_date', 'crunchbase_funding_c',
        'csm_c', 'customer_tier_c', 'domain_c',
        'dscorgpkg_lead_source_c', 'dscorgpkg_naics_codes_c', 'dscorgpkg_sic_codes_c',
        'finance_arr_c', 'github_issue_ticket_c', 'health_update_c',
        'id', 'industry', 'industry_group_c',
        'industry_sector_c', 'initial_deal_arr_c', 'initial_deal_date_c',
        'is_deleted', 'last_activity_date', 'last_modified_date',
        'lfbn_account_domain_c', 'lost_opportunities_c', 'lost_renewals_c',
        'mapbox_username_c', 'naics_code_c', 'name',
        'netsuite_conn_channel_tier_c', 'next_renewal_date_c', 'number_of_employees',
        'number_of_mapbox_users_c', 'open_opportunities_c', 'open_renewals_c',
        'owner_id', 'owner_role_c', 'parent_id',
        'partner_status_c', 'partner_type_c', 'primary_contact_c',
        'primary_use_case_c', 'rating', 'record_type_id',
        'region_c', 'renewal_manager_c', 'sb_pf_company_c',
        'sdr_c', 'segmentation_c', 'shipping_city',
        'shipping_country', 'shipping_postal_code', 'shipping_state',
        'shipping_street', 'sic', 'solution_engineer_c',
        'sub_industry_c', 'sub_region_c', 'support_engineer_c',
        'type', 'vertical_c', 'vertical_formula_c',
        'won_opportunities_c', 'x18_digit_account_id_c', 'zendesk_result_c',
        'zendesk_zendesk_organization_c', 'zendesk_zendesk_organization_id_c',
        'zisf_zoominfo_industry_c',
        'dt',
    ],
))
db_fields

Unnamed: 0,db_table,all_columns
0,sfdc.accounts,account_health_c
1,sfdc.accounts,account_health_flag_c
2,sfdc.accounts,account_health_last_touch_c
3,sfdc.accounts,account_notes_c
4,sfdc.accounts,account_owner_c
...,...,...
80,sfdc.accounts,zendesk_result_c
81,sfdc.accounts,zendesk_zendesk_organization_c
82,sfdc.accounts,zendesk_zendesk_organization_id_c
83,sfdc.accounts,zisf_zoominfo_industry_c


## given query

In [3]:
# Sample input for the parser: one CTE over sfdc.accounts followed by a main
# SELECT with a scalar subquery over sfdc.opportunities.
# '{run_date}' stays a literal placeholder: this is a plain string (not an
# f-string) and nothing in this cell formats it.
# NOTE(review): the main SELECT filters on `dt`, but the CTE only projects
# account_id and account_name -- presumably acceptable for a parsing demo;
# confirm before running this SQL against a real database.
query = """WITH opportunity_to_name AS (
                SELECT  -- make sure there is only one name per id
                id AS account_id, name AS account_name FROM sfdc.accounts sfdc_accounts
                WHERE dt = '{run_date}' GROUP BY id, name) SELECT * FROM opportunity_to_name 
                where dt > (select MAX(datetime) from sfdc.opportunities as sfdc_oppty)
"""

## transformation 1: format query

In [4]:
# Build the parser from the raw query text. The same `formatter` instance is
# used by every transformation cell that follows.
formatter = column_parser.Parser(query)

In [5]:
# Normalise the query layout. In the recorded output the result is re-indented,
# keywords are upper-cased and the inline `--` comment is gone.
formatted = formatter.format_query(query)
# print() so the line breaks render; a bare expression would show the string
# repr with escaped '\n' instead.
print(formatted)

WITH opportunity_to_name AS
  (SELECT id AS account_id,
          name AS account_name
   FROM sfdc.accounts sfdc_accounts
   WHERE dt = '{run_date}'
   GROUP BY id,
            name)
SELECT *
FROM opportunity_to_name
WHERE dt >
    (SELECT MAX(datetime)
     FROM sfdc.opportunities AS sfdc_oppty)


## transformation 2: separate CTEs

In [6]:
# Separate the formatted query into its parts. In the recorded output this is a
# dict with one entry per CTE name plus a 'main' entry for the final SELECT.
cte_query = formatter.parse_cte(formatted)
cte_query

{'main': 'SELECT *\nFROM opportunity_to_name\nWHERE dt >\n    (SELECT MAX(datetime)\n     FROM sfdc.opportunities AS sfdc_oppty)',
 'opportunity_to_name': "WITH opportunity_to_name AS\n  (SELECT id AS account_id,\n          name AS account_name\n   FROM sfdc.accounts sfdc_accounts\n   WHERE dt = '{run_date}'\n   GROUP BY id,\n            name)\n"}

## transformation 3: match table aliases

In [7]:
# get_table_names is given the formatted query as a list of lines, hence the
# split on '\n'. In the recorded output the result maps each alias to its table
# name; the CTE name, which has no alias, maps to itself.
table_alias_mapping = formatter.get_table_names(formatted.split('\n'))
table_alias_mapping

{'sfdc_accounts': 'sfdc.accounts',
 'opportunity_to_name': 'opportunity_to_name',
 'sfdc_oppty': 'sfdc.opportunities'}

## transformation 4: find columns

In [8]:
# Resolve the columns used in the query against the table metadata in `db_fields`.
# In the recorded output this is a list of dicts with 'database_name',
# 'table_name' and 'column_name' keys.
# NOTE(review): this call passes the raw `query`, whereas the two previous steps
# pass `formatted` -- confirm which form match_queried_fields expects.
formatter.match_queried_fields(query, db_fields)

[{'database_name': 'sfdc', 'table_name': 'accounts', 'column_name': 'name'},
 {'database_name': 'sfdc', 'table_name': 'accounts', 'column_name': 'dt'},
 {'database_name': 'sfdc', 'table_name': 'accounts', 'column_name': 'id'}]

## bonus transformation: upload other query metadata (such as timestamp, user)