In [1]:
import os
import pandas as pd
import numpy as np
import copy
# import hyperloglog
from datasketch import HyperLogLog

# Points
For now i am assuming that primary key is only a single attribute

In [2]:
class IND:
    def __init__(self, dependent, reference):
        self.dependent = dependent
        self.reference = reference

In [3]:
class Attribute:
    def __init__(self, table_name, attribute_name, values):
        self.table_name = table_name
        self.attribute_name = attribute_name
        self.values = values
        self.fullName=f"{self.table_name}.{self.attribute_name}"

        self.uniquness = self.estUniqueness()
        self.cardinality=1
        self.value_length = 1/max(1, max([len(x) for x in values]) - 8)
        self.position = 0
        self.suffix = self.check_suffix()

        self.pkScore = 0
        self.pkScore += self.uniquness
        self.pkScore += self.cardinality
        self.pkScore += self.value_length
        self.pkScore += self.position
        self.pkScore += self.suffix

    def estUniqueness(self):
        hll = HyperLogLog()
        total = 0
        
        for value in self.values:
            hll.update(str(value).encode('utf8'))
            total +=1
        # print(f"{hll.count()=}")
        
        return hll.count() / total
    
    def check_suffix(self, suffix_list=["key", 'id', 'nr', 'no']):
        for suffix in suffix_list:
            if suffix in self.attribute_name:
                return 1
            else:
                return 0


In [4]:
def load_csv_files(directory_path):
    attributes = {}

    csv_files = [f for f in os.listdir(directory_path)]

    print(f"Found {len(csv_files)} \n CSV files: {csv_files}")    

    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)
        table_name = os.path.splitext(filename)[0]

        df = pd.read_csv(file_path)
        df = df
        print(f"Processing {filename}: {df.shape[0]} rows, {df.shape[1]} columns")

        for i, column in enumerate(df.columns):
            non_null_values = df[column].astype(str).dropna().tolist()
            if non_null_values:
                attr = Attribute(table_name, column, non_null_values)
                attr.position = 1/(i+1)
                attributes[f"{table_name}.{column}"] = attr
                print(f"Added attribute: {attr.table_name}.{attr.attribute_name} Total Values: {len(attr.values)}")
                print(attr.position)

    return attributes
    
attributes = load_csv_files("/home/haseeb/Desktop/EKAI/ERD_automation/Dataset/train/northwind-db")            
            

Found 11 
 CSV files: ['employee_territories.csv', 'products.csv', 'orders.csv', 'customers.csv', 'territories.csv', 'orders_details.csv', 'suppliers.csv', 'employees.csv', 'categories.csv', 'shippers.csv', 'regions.csv']
Processing employee_territories.csv: 49 rows, 2 columns
Added attribute: employee_territories.employeeid Total Values: 49
1.0
Added attribute: employee_territories.territoryid Total Values: 49
0.5
Processing products.csv: 77 rows, 10 columns
Added attribute: products.productid Total Values: 77
1.0
Added attribute: products.productname Total Values: 77
0.5
Added attribute: products.supplierid Total Values: 77
0.3333333333333333
Added attribute: products.categoryid Total Values: 77
0.25
Added attribute: products.quantityperunit Total Values: 77
0.2
Added attribute: products.unitprice Total Values: 77
0.16666666666666666
Added attribute: products.unitsinstock Total Values: 77
0.14285714285714285
Added attribute: products.unitsonorder Total Values: 77
0.125
Added attribut

In [5]:
pk_table = {} # {table name: (column name , score)}
for key, value in attributes.items():
    exists = pk_table.get(key.split(".")[0])
    if not exists:
        pk_table[key.split(".")[0]] = (value.table_name + "." + value.attribute_name, value.pkScore)
    else:
       if exists[1] < value.pkScore:
        pk_table[key.split(".")[0]] = (value.table_name + "." + value.attribute_name, value.pkScore)
    # print(key + "->" + str(value.pkScore))
print(pk_table)

{'employee_territories': ('employee_territories.territoryid', np.float64(2.9852926422598527)), 'products': ('products.productid', np.float64(3.0264452376922204)), 'orders': ('orders.orderid', np.float64(2.985186227498581)), 'customers': ('customers.customerid', np.float64(3.0222949111208153)), 'territories': ('territories.territoryid', np.float64(2.9567150551527552)), 'orders_details': ('orders_details.orderid', np.float64(2.3794452755562983)), 'suppliers': ('suppliers.supplierid', np.float64(2.9838825129959528)), 'employees': ('employees.employeeid', np.float64(3.0180012900006483)), 'categories': ('categories.categoryid', np.float64(3.0159583460665687)), 'shippers': ('shippers.shipperid', np.float64(3.0119051356721522)), 'regions': ('regions.regionid', np.float64(3.0078948459609034))}


In [6]:
def read_IND(file_path):
    '''
        fills up the dependent and reference arrays of attributes
    '''
    inds = []
    with open(file_path, "r") as f:
        for line in f:
            vars = line.strip().split("=")
            inds.append(IND(attributes[vars[0]], attributes[vars[1]]))
    return inds
inds = read_IND("/home/haseeb/Desktop/EKAI/ERD_automation/codes/inclusionDependencyWithSpider/spider_results/northwind.txt")

In [7]:
print(inds[0].dependent.fullName)

employee_territories.employeeid


In [8]:
print(len(inds))

111


# If reference attribute isn't a primary key, remove the IND

In [None]:
pruned_inds = []
for ind in inds:
    # print(ind.dependent.fullName + "=" + ind.reference.fullName)
    is_pk = False
    for table_name, pk in pk_table.items():
        if pk[0].split(".")[1] == ind.reference.attribute_name:
            is_pk=True
            break
    if is_pk:
        pruned_inds.append(ind)

print(len(pruned_inds))

categoryid
employee_territories.territoryid
categoryid
products.productid
categoryid
orders.orderid
categoryid
customers.customerid
categoryid
territories.territoryid
categoryid
orders_details.orderid
categoryid
suppliers.supplierid
categoryid
employees.employeeid
categoryid
categories.categoryid
categoryid
shippers.shipperid
categoryid
regions.regionid
categoryid
employee_territories.territoryid
categoryid
products.productid
categoryid
orders.orderid
categoryid
customers.customerid
categoryid
territories.territoryid
categoryid
orders_details.orderid
categoryid
suppliers.supplierid
categoryid
employees.employeeid
categoryid
categories.categoryid
categoryid
shippers.shipperid
categoryid
regions.regionid
categoryid
employee_territories.territoryid
categoryid
products.productid
categoryid
orders.orderid
categoryid
customers.customerid
categoryid
territories.territoryid
categoryid
orders_details.orderid
categoryid
suppliers.supplierid
categoryid
employees.employeeid
categoryid
categories.c

In [10]:
for attribute_name, attribute in attributes.items():
    print(f"{attribute_name=}")
    new_reference = []
    for reference in attribute.reference:
        print(reference.table_name + "." + reference.attribute_name)
        is_pk = False
        for table_name, pk in pk_table.items():
            if pk[0] == reference.attribute_name:
                is_pk=True
                # break
        if is_pk:
            new_reference.append(reference)
        else:
            reference.dependent = []
    attribute.reference = new_reference
    # break
        

attribute_name='employee_territories.employeeid'


AttributeError: 'Attribute' object has no attribute 'reference'