In [4]:
import os
import pandas as pd
import numpy as np
import copy
# import hyperloglog
from datasketch import HyperLogLog

# Points
For now i am assuming that primary key is only a single attribute

In [5]:
class Attribute:
    def __init__(self, table_name, attribute_name, values):
        self.table_name = table_name
        self.attribute_name = attribute_name
        self.values = values

        self.uniquness = self.estUniqueness()
        self.cardinality=1
        self.value_length = 1/max(1, max([len(x) for x in values]) - 8)
        self.position = 0
        self.suffix = self.check_suffix()
        
        self.dependent = [] # It's dependent attributes
        self.reference = [] # Attributes that it references from 

        self.pkScore = 0
        self.pkScore += self.uniquness
        self.pkScore += self.cardinality
        self.pkScore += self.value_length
        self.pkScore += self.position
        self.pkScore += self.suffix

    def estUniqueness(self):
        hll = HyperLogLog()
        total = 0
        
        for value in self.values:
            hll.update(str(value).encode('utf8'))
            total +=1
        # print(f"{hll.count()=}")
        
        return hll.count() / total
    
    def check_suffix(self, suffix_list=["key", 'id', 'nr', 'no']):
        for suffix in suffix_list:
            if suffix in self.attribute_name:
                return 1
            else:
                return 0


In [6]:
def load_csv_files(directory_path):
    attributes = {}

    csv_files = [f for f in os.listdir(directory_path)]

    print(f"Found {len(csv_files)} \n CSV files: {csv_files}")    

    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)
        table_name = os.path.splitext(filename)[0]

        df = pd.read_csv(file_path)
        df = df
        print(f"Processing {filename}: {df.shape[0]} rows, {df.shape[1]} columns")

        for i, column in enumerate(df.columns):
            non_null_values = df[column].astype(str).dropna().tolist()
            if non_null_values:
                attr = Attribute(table_name, column, non_null_values)
                attr.position = 1/(i+1)
                attributes[f"{table_name}.{column}"] = attr
                print(f"Added attribute: {attr.table_name}.{attr.attribute_name} Total Values: {len(attr.values)}")
                print(attr.position)

    return attributes
    
attributes = load_csv_files("/home/haseeb/Desktop/EKAI/ERD_automation/Dataset/train/northwind-db")            
            

Found 11 
 CSV files: ['employee_territories.csv', 'products.csv', 'orders.csv', 'customers.csv', 'territories.csv', 'orders_details.csv', 'suppliers.csv', 'employees.csv', 'categories.csv', 'shippers.csv', 'regions.csv']
Processing employee_territories.csv: 49 rows, 2 columns
Added attribute: employee_territories.employeeid Total Values: 49
1.0
Added attribute: employee_territories.territoryid Total Values: 49
0.5
Processing products.csv: 77 rows, 10 columns
Added attribute: products.productid Total Values: 77
1.0
Added attribute: products.productname Total Values: 77
0.5
Added attribute: products.supplierid Total Values: 77
0.3333333333333333
Added attribute: products.categoryid Total Values: 77
0.25
Added attribute: products.quantityperunit Total Values: 77
0.2
Added attribute: products.unitprice Total Values: 77
0.16666666666666666
Added attribute: products.unitsinstock Total Values: 77
0.14285714285714285
Added attribute: products.unitsonorder Total Values: 77
0.125
Added attribut

In [7]:
def read_IND(file_path):
    '''
        fills up the dependent and reference array of attributes
    '''
    with open(file_path, "r") as f:
        for line in f:
            vars = line.strip().split("=")
            attributes[vars[0]].references = attributes[vars[1]]
            attributes[vars[1]].dependent = attributes[vars[0]]
read_IND("/home/haseeb/Desktop/EKAI/ERD_automation/codes/inclusionDependencyWithSpider/spider_results/northwind-db.txt")

In [11]:
pk_table = {} # {table name: (column name , score)}
for key, value in attributes.items():
    exists = pk_table.get(key.split(".")[0])
    if not exists:
        pk_table[key.split(".")[0]] = (value.attribute_name, value.pkScore)
    else:
       if exists[1] < value.pkScore:
        pk_table[key.split(".")[0]] = (value.attribute_name, value.pkScore)
    # print(key + "->" + str(value.pkScore))
print(pk_table)

{'employee_territories': ('territoryid', np.float64(2.9852926422598527)), 'products': ('productid', np.float64(3.0264452376922204)), 'orders': ('orderid', np.float64(2.985186227498581)), 'customers': ('customerid', np.float64(3.0222949111208153)), 'territories': ('territoryid', np.float64(2.9567150551527552)), 'orders_details': ('orderid', np.float64(2.3794452755562983)), 'suppliers': ('supplierid', np.float64(2.9838825129959528)), 'employees': ('employeeid', np.float64(3.0180012900006483)), 'categories': ('categoryid', np.float64(3.0159583460665687)), 'shippers': ('shipperid', np.float64(3.0119051356721522)), 'regions': ('regionid', np.float64(3.0078948459609034))}
