In [131]:
import json

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_rows', 500)

In [127]:
# NOTE: load_query_data, extract_tables_and_columns, count_tables and
# count_columns are defined in cells further down this notebook. On a fresh
# kernel those definition cells must be executed before this one.
MIGRATED_QUERY_PATH = "../Query Processing/4_json_results/migrated_query_data.json"
migrated_query_data = load_query_data(MIGRATED_QUERY_PATH)

# Keep the raw name lists under their own names so they stay inspectable and
# `tables` / `columns` only ever refer to the count DataFrames.
table_names, column_names = extract_tables_and_columns(migrated_query_data)

tables = count_tables(table_names)
columns = count_columns(column_names)

In [133]:
# Show the ten most frequent table tokens; count_tables already returns the
# frame sorted by 'Count' in descending order.
tables.head(10)

Unnamed: 0,Table,Count
25,art_sup,50
139,divcode,36
75,bo_receipt_v,31
77,bo_receiptline_v,31
35,article,29
13,art_no,29
335,supplier,25
334,sup_no,25
258,partrev,23
156,g08t1,23


In [135]:
# Show the ten most frequent column tokens; count_columns already returns the
# frame sorted by 'Count' in descending order.
columns.head(10)

Unnamed: 0,Columns,Count
28,art_no,109
532,sup_no,98
442,partno,67
505,shp_no,60
443,partrev,59
469,rct_no,54
500,shortl62,53
340,item_type,53
84,bu_code_cre_shp,47
106,bu_type_cre_shp,47


##### `load_query_data`

In [2]:
def load_query_data(PATH):
    """Read a JSON file and return its decoded content.

    Parameters
    ----------
    PATH : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file (the caller in this
        notebook iterates it with ``.items()``, i.e. treats it as a dict).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform locale, which can mis-decode non-ASCII characters.
    with open(PATH, encoding="utf-8") as f:
        return json.load(f)

##### `extract_tables_and_columns`

In [48]:
def extract_tables_and_columns(migrated_query_data):
    """Flatten the cleansed table and column names of every query.

    Parameters
    ----------
    migrated_query_data : dict
        Mapping of query key -> per-query metadata. Each metadata entry is
        read through its 'tables_cleansed' and 'columns_cleansed' items.

    Returns
    -------
    tuple of (list, list)
        ``(tables, columns)``: all names found, duplicates kept, in the
        iteration order of ``migrated_query_data``.

    Notes
    -----
    Extraction is best-effort. A query whose metadata is missing one of the
    two fields, or holds something that cannot be subscripted/iterated
    (e.g. ``None``), is skipped for that field only. The two fields are
    handled independently, so a problem with 'tables_cleansed' no longer
    discards the same query's valid 'columns_cleansed'.
    """
    tables = []
    columns = []

    for query_value in migrated_query_data.values():
        for field, collected in (('tables_cleansed', tables),
                                 ('columns_cleansed', columns)):
            try:
                collected.extend(query_value[field])
            except (KeyError, TypeError):
                # KeyError: the field is absent from this query.
                # TypeError: the entry is not subscriptable or the field
                # value is not iterable. Skip just this field.
                continue

    return tables, columns


##### `count_tables` and `count_columns`

In [110]:
def count_tables(tables):
    """Count how often each table token occurs and rank the tokens.

    Parameters
    ----------
    tables : iterable of str
        Table names, one entry per occurrence.

    Returns
    -------
    pandas.DataFrame
        Columns 'Table' (token) and 'Count' (total occurrences), sorted by
        'Count' in descending order. The index is not reset after sorting.
        An input with ``len() == 0`` yields an empty frame with the same
        two columns.

    Raises
    ------
    ValueError
        Raised by scikit-learn when the input is non-empty but produces no
        tokens at all.

    Notes
    -----
    NOTE(review): counting goes through ``CountVectorizer`` with default
    settings, which per the scikit-learn documentation lowercases the text
    and keeps only tokens of two or more word characters. A name containing
    separators such as '.' or spaces is therefore counted as several
    tokens, and single-character names are dropped. Confirm this is the
    intended behaviour for table identifiers.
    """
    # CountVectorizer raises an "empty vocabulary" ValueError on empty
    # input; return an empty, correctly typed result instead.
    if hasattr(tables, '__len__') and len(tables) == 0:
        return pd.DataFrame({
            'Table': pd.Series(dtype=object),
            'Count': pd.Series(dtype='int64'),
        })

    vectorizer = CountVectorizer()
    token_matrix = vectorizer.fit_transform(tables)

    table_tokens = vectorizer.get_feature_names_out()
    # Sum the sparse matrix directly; converting it to a dense
    # (entries x vocabulary) array first only wastes memory.
    counts = np.asarray(token_matrix.sum(axis=0)).ravel()

    return pd.DataFrame({'Table': table_tokens, 'Count': counts}).sort_values(by='Count', ascending=False)

In [111]:
def count_columns(columns):
    """Count how often each column token occurs and rank the tokens.

    Parameters
    ----------
    columns : iterable of str
        Column names, one entry per occurrence.

    Returns
    -------
    pandas.DataFrame
        Columns 'Columns' (token) and 'Count' (total occurrences), sorted by
        'Count' in descending order. The index is not reset after sorting.
        An input with ``len() == 0`` yields an empty frame with the same
        two columns.

    Raises
    ------
    ValueError
        Raised by scikit-learn when the input is non-empty but produces no
        tokens at all.

    Notes
    -----
    NOTE(review): counting goes through ``CountVectorizer`` with default
    settings, which per the scikit-learn documentation lowercases the text
    and keeps only tokens of two or more word characters. A name containing
    separators such as '.' or spaces is therefore counted as several
    tokens, and single-character names are dropped. Confirm this is the
    intended behaviour for column identifiers.
    """
    # CountVectorizer raises an "empty vocabulary" ValueError on empty
    # input; return an empty, correctly typed result instead.
    if hasattr(columns, '__len__') and len(columns) == 0:
        return pd.DataFrame({
            'Columns': pd.Series(dtype=object),
            'Count': pd.Series(dtype='int64'),
        })

    vectorizer = CountVectorizer()
    token_matrix = vectorizer.fit_transform(columns)

    column_tokens = vectorizer.get_feature_names_out()
    # Sum the sparse matrix directly; converting it to a dense
    # (entries x vocabulary) array first only wastes memory.
    counts = np.asarray(token_matrix.sum(axis=0)).ravel()

    return pd.DataFrame({'Columns': column_tokens, 'Count': counts}).sort_values(by='Count', ascending=False)