In [1]:
import os
import pandas as pd
import numpy as np
import copy
from fuzzywuzzy import fuzz
import timeit



In [None]:


def create_attribute(table_name, column_name, values):
    """Build the attribute record describing one column of one table.

    Args:
        table_name: Name of the table the column belongs to.
        column_name: Name of the column.
        values: Iterable of raw cell values. Entries for which ``pd.notna``
            is false are dropped; the rest are converted with ``str``.

    Returns:
        dict with the keys ``table_name``, ``column_name``, ``values`` (NumPy
        array of the stringified values in sorted order), ``unique_values``
        (set of the distinct strings), ``full_name``
        (``"<table_name>.<column_name>"``) and ``uniqueness`` (distinct count
        divided by total count, or ``0.0`` when no value survives filtering).
    """
    values = np.array(sorted(str(v) for v in values if pd.notna(v)))
    # `values` already contains only non-null strings, so the distinct set can
    # be taken directly instead of filtering and sorting a second time.
    unique_values = set(values.tolist())
    print(f"{len(values)=}")
    print(f"{len(unique_values)=}")
    # Guard the ratio: an all-null column would otherwise divide by zero.
    uniqueness = len(unique_values) / len(values) if len(values) else 0.0
    return {
        'table_name': table_name,
        'column_name': column_name,
        'values': values,
        'unique_values': unique_values,
        'full_name': f"{table_name}.{column_name}",
        "uniqueness": uniqueness,
    }

def load_csv_files(directory_path):
    """Load every CSV file in a directory and build one attribute per column.

    Args:
        directory_path: Directory to scan (not recursive). Only regular files
            whose name ends in ``.csv`` (case-insensitive) are read.

    Returns:
        list of attribute dicts as produced by ``create_attribute``, one for
        each column that has at least one non-null value. The table name of an
        attribute is the file name without its extension. Files are processed
        in sorted name order.
    """
    attributes = []

    # os.listdir also returns sub-directories and unrelated files, in arbitrary
    # order; keep only CSV files and sort them so the result is reproducible.
    csv_files = sorted(
        f for f in os.listdir(directory_path)
        if f.lower().endswith(".csv")
        and os.path.isfile(os.path.join(directory_path, f))
    )

    print(f"Found {len(csv_files)} \n CSV files: {csv_files}")

    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)
        table_name = os.path.splitext(filename)[0]

        df = pd.read_csv(file_path)
        print(f"Processing {filename}: {df.shape[0]} rows, {df.shape[1]} columns")

        for column in df.columns:
            non_null_values = df[column].dropna().tolist()
            if not non_null_values:
                # Entirely empty column: nothing to describe.
                continue
            attr = create_attribute(table_name, column, non_null_values)
            if len(attr['values']) != 0:
                attributes.append(attr)
                # Report only attributes that were actually kept, with both counts.
                print(
                    f"Added attribute: {attr['full_name']} "
                    f"({len(attr['values'])} values, {len(attr['unique_values'])} unique)"
                )

    return attributes
    
# Dataset location. Set the ERD_DATASET_DIR environment variable to point at a
# different directory instead of editing this cell; the default is the path
# this notebook was originally run against.
DATASET_DIR = os.environ.get(
    "ERD_DATASET_DIR",
    "/home/haseeb/Desktop/EKAI/ERD_automation/Dataset/train/menagerie-db",
)
attributes = load_csv_files(DATASET_DIR)
            

Time with native pandas functions implementation: 0.03882544800035248


In [None]:
# Independent deep copy of the loaded attributes, so changes made through
# `attributes1` do not affect `attributes`.
attributes1 = copy.deepcopy(attributes)

In [None]:
def levenshtein_distance(s1: str, s2: str) -> float:
    """Return the fuzzy partial-match score of two strings divided by 100.

    Despite the name, this is not a Levenshtein edit distance: the value is
    ``fuzz.partial_ratio(s1, s2) / 100``, a similarity where a larger number
    means a closer match. The name is kept because other cells call it.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        The ``partial_ratio`` score scaled by 1/100 as a float. fuzzywuzzy
        documents the raw score as an integer from 0 to 100, which would put
        this value in the range 0.0 to 1.0.
    """
    return fuzz.partial_ratio(s1, s2) / 100


In [None]:
def create_attribute_pairs(attributes, gemma):
    """Build a feature record for every ordered pair of attributes from different tables.

    Args:
        attributes: Sequence of attribute dicts providing the keys
            ``table_name``, ``column_name``, ``unique_values`` (a set) and
            ``uniqueness``.
        gemma: Threshold; a dependent whose ``uniqueness`` is strictly greater
            than this value is flagged with 1, otherwise 0.

    Returns:
        list of dicts, one per ordered (reference, dependent) pair whose table
        names differ, with the keys ``reference``, ``dependent``,
        ``reference_uniqueness``, ``coverage`` (fraction of the reference's
        unique values that also occur in the dependent, ``0.0`` when the
        reference has no unique values), ``dist`` (result of
        ``levenshtein_distance`` on the two column names) and
        ``dependent_uniqueness`` (the 0/1 flag described above).
    """
    pairs = []
    for reference in attributes:
        reference_unique = reference["unique_values"]
        for dependent in attributes:
            # Columns of the same table (including the attribute itself) are never paired.
            if reference['table_name'] == dependent['table_name']:
                continue
            shared_count = len(reference_unique & dependent["unique_values"])
            # Guard the ratio: an empty reference set would otherwise divide by zero.
            coverage = shared_count / len(reference_unique) if reference_unique else 0.0
            pairs.append({
                "reference": reference,
                "dependent": dependent,
                "reference_uniqueness": reference["uniqueness"],
                "coverage": coverage,
                "dist": levenshtein_distance(reference["column_name"], dependent["column_name"]),
                "dependent_uniqueness": 1 if dependent["uniqueness"] > gemma else 0,
            })
    return pairs
# NOTE(review): create_attribute computes uniqueness as
# len(unique_values) / len(values), which cannot exceed 1. With gemma=1 the
# strict `>` comparison in create_attribute_pairs is therefore never true and
# dependent_uniqueness is 0 for every pair — confirm the intended threshold.
pairs = create_attribute_pairs(attributes1, 1)

In [None]:
# Print the second generated pair (index 1); raises IndexError when fewer than
# two pairs were produced.
print(pairs[1])