In [61]:
import os
import pandas as pd
import numpy as np
import copy

In [62]:


def create_attribute(table_name, column_name, values):
    values = np.array(sorted(str(v) for v in values if pd.notna(v)))
    unique_values = set(sorted(str(v) for v in values if pd.notna(v)))
    print(f"{len(values)=}")
    print(f"{len(unique_values)=}")
    return {
        'table_name': table_name,
        'column_name': column_name,
        'values': values,
        'unique_values': unique_values,
        'full_name': f"{table_name}.{column_name}",
        "uniqueness" : len(unique_values) / len(values)
    }

def load_csv_files(directory_path):
    attributes = []

    csv_files = [f for f in os.listdir(directory_path)]

    print(f"Found {len(csv_files)} \n CSV files: {csv_files}")

    for filename in csv_files:
        file_path = os.path.join(directory_path, filename)
        table_name = os.path.splitext(filename)[0]

        df = pd.read_csv(file_path)
        print(f"Processing {filename}: {df.shape[0]} rows, {df.shape[1]} columns")

        for column in df.columns:
            non_null_values = df[column].dropna().tolist()
            if non_null_values:
                attr = create_attribute(table_name, column, non_null_values)
                if len(attr['values']) !=0:
                    attributes.append(attr)
                print(f"Added attribute: {attr['full_name']} ({len(attr['values'])} unique values)")

    return attributes
    
attributes = load_csv_files("/home/haseeb/Desktop/EKAI/ERD_automation/Dataset/train/menagerie-db")            
            

Found 2 
 CSV files: ['event.csv', 'pet.csv']
Processing event.csv: 10 rows, 4 columns
len(values)=10
len(unique_values)=8
Added attribute: event.name (10 unique values)
len(values)=10
len(unique_values)=9
Added attribute: event.date (10 unique values)
len(values)=10
len(unique_values)=4
Added attribute: event.type (10 unique values)
len(values)=8
len(unique_values)=8
Added attribute: event.remark  (8 unique values)
Processing pet.csv: 8 rows, 6 columns
len(values)=8
len(unique_values)=8
Added attribute: pet.lexicon (8 unique values)
len(values)=8
len(unique_values)=4
Added attribute: pet.owner (8 unique values)
len(values)=8
len(unique_values)=4
Added attribute: pet.species (8 unique values)
len(values)=8
len(unique_values)=3
Added attribute: pet.sex (8 unique values)
len(values)=8
len(unique_values)=8
Added attribute: pet.birth (8 unique values)
len(values)=8
len(unique_values)=2
Added attribute: pet.death (8 unique values)


In [63]:
attributes1 = copy.deepcopy(attributes)

In [64]:
def levenshtein_distance(s1: str, s2: str) -> int:
    len_s1, len_s2 = len(s1), len(s2)

    # Create a 2D matrix to store distances
    dp = [[0] * (len_s2 + 1) for _ in range(len_s1 + 1)]

    # Initialize base cases
    for i in range(len_s1 + 1):
        dp[i][0] = i
    for j in range(len_s2 + 1):
        dp[0][j] = j

    # Fill in the matrix
    for i in range(1, len_s1 + 1):
        for j in range(1, len_s2 + 1):
            if s1[i - 1] == s2[j - 1]:
                cost = 0
            else:
                cost = 1

            dp[i][j] = min(
                dp[i - 1][j] + 1,      # Deletion
                dp[i][j - 1] + 1,      # Insertion
                dp[i - 1][j - 1] + cost  # Substitution
            )

    return dp[len_s1][len_s2]


In [None]:
def create_attribute_pairs(attributes, gemma, ):
    pairs = []
    for i, reference in enumerate(attributes):
        for j, dependent in enumerate(attributes):   
            if reference['table_name'] == dependent['table_name']:
                continue
            coverage = len(reference["unique_values"] & dependent["unique_values"]) / len(reference["unique_values"])
            pairs.append({
                "reference" : reference,
                "dependent" : dependent,
                "reference_uniqueness" : reference["uniqueness"],
                "coverage" : coverage,
                "dist" : levenshtein_distance(reference["column_name"], dependent["column_name"]),
                "dependent_uniqueness" :  1 if dependent["uniqueness"]>gemma else 0
            })
    return pairs
pairs = create_attribute_pairs(attributes1, 1)

In [67]:
print(pairs[1])

{'reference': {'table_name': 'event', 'column_name': 'name', 'values': array(['Bowser', 'Buffy', 'Buffy', 'Chirpy', 'Claws', 'Fang', 'Fang',
       'Fluffy', 'Slim', 'Whistler'], dtype='<U8'), 'unique_values': {'Bowser', 'Slim', 'Fang', 'Fluffy', 'Buffy', 'Chirpy', 'Whistler', 'Claws'}, 'full_name': 'event.name', 'uniqueness': 0.8}, 'dependent': {'table_name': 'pet', 'column_name': 'owner', 'values': array(['Benny', 'Benny', 'Diane', 'Gwen', 'Gwen', 'Gwen', 'Harold',
       'Harold'], dtype='<U6'), 'unique_values': {'Diane', 'Benny', 'Harold', 'Gwen'}, 'full_name': 'pet.owner', 'uniqueness': 0.5}, 'reference_uniqueness': 0.8, 'coverage': 0.0, 'dist': 4, 'dependent_uniqueness': 0}
