In [None]:
# import sys
# import importlib

In [None]:
# importlib.reload(sys.modules['flattableanalysis'])
# importlib.reload(sys.modules['flattableanalysis.flat_table_analysis'])
# importlib.reload(sys.modules['flattableanalysis.utils']);

In [1]:
import numpy as np
import pandas as pd
from numpy.random import default_rng; rng = default_rng(42)
import math
import random
import seaborn as sns
from IPython.display import clear_output
import itertools as it
import more_itertools as mit
import datetime

from flattableanalysis.flat_table_analysis import FlatTableAnalysis
from flattableanalysis.utils import wrap_text, powerset, cut_strings, make_unique

# Test utils

In [2]:
# The test expects a 10-character string wrapped at width 5 to become two lines.
wrapped = wrap_text('0123456789', 5)
assert wrapped == '01234\n56789'

In [3]:
# The test expects powerset(10, 2) to equal the number of 1- and 2-element
# combinations that can be drawn from 10 items.
expected_count = sum(math.comb(10, size) for size in (1, 2))
assert powerset(10, 2) == expected_count

In [4]:
# The test expects cut_strings to raise ValueError for ['a', 'a']
# (presumably because duplicate strings are rejected - confirm in utils).
try:
    cut_strings(['a', 'a'])
except ValueError:
    pass
else:
    # Reaching this branch means no exception was raised at all.
    raise Exception("cut_strings(['a', 'a']) did not raise ValueError")

In [5]:
# The test expects cut_strings to raise TypeError when given non-string items.
try:
    cut_strings([1, 2])
except TypeError:
    pass
else:
    # Reaching this branch means no exception was raised at all.
    raise Exception("cut_strings([1, 2]) did not raise TypeError")

In [6]:
# NOTE(review): the meaning of the second argument (0) is not visible here;
# the test only pins the expected result for this input.
shortened = cut_strings(['abc', 'abcde'], 0)
assert shortened == ['abc', 'abcd']

In [7]:
# Repeated values are expected to get a running "_<n>" suffix, while values
# that occur only once are expected as their plain string form.
deduplicated = make_unique([1, 2, 3, 1, 2, 2])
assert deduplicated.values.tolist() == ['1_0', '2_0', '3', '1_1', '2_1', '2_2']

# Test flat_table_analysis

In [9]:
# Base table reused by the tests below: the seaborn titanic dataset with
# duplicate rows removed and a fresh 0..n-1 index.
titanic_raw = sns.load_dataset("titanic", cache=True)
table = titanic_raw.drop_duplicates().reset_index(drop=True)

### init class

In [8]:
# Smoke test: constructing FlatTableAnalysis must not raise on any of the
# datasets that seaborn ships.
dataset_names = sns.get_dataset_names()
for dataset_name in dataset_names:
    dataset = sns.load_dataset(dataset_name, cache=True)
    FlatTableAnalysis(dataset)
# Drop whatever the constructor printed along the way.
clear_output()

### candidate key test

In [10]:
# Four synthetic columns whose combined values are unique per row by
# construction: the rows are sampled without replacement from a Cartesian product.
grid = list(it.product(range(10), range(10, 20), range(20, 30), range(30, 40)))
synthetic = pd.DataFrame(random.sample(grid, k=len(table)))

# Attach them to the titanic table, then shuffle the rows ...
df = pd.concat([table, synthetic], axis=1).sample(frac=1)

# ... and the column order.
col_order = list(range(df.shape[1]))
random.shuffle(col_order)
df = df.iloc[:, col_order]

fta = FlatTableAnalysis(df)
keys_report = fta.get_candidate_keys(4)
is_full_key = keys_report['uniques_frac'] == 1
candidate_keys = set(keys_report.loc[is_full_key, 'col_names'])

clear_output()
# The synthetic columns are looked up under their string names '0'..'3'.
assert {'0', '1', '2', '3'} in {frozenset(key) for key in candidate_keys}

### speed test

In [11]:
# Performance smoke test: building the analysis and searching candidate keys of
# up to 2 columns on a 50_000 x 100 integer table must finish within TIME_LIMIT.
TIME_LIMIT = datetime.timedelta(minutes=1, seconds=30)
start_time = datetime.datetime.now()

df = pd.DataFrame(rng.integers(0, 100_000, size=(50_000, 100)))
fta = FlatTableAnalysis(df)
fta.get_candidate_keys(2)  # result is not needed, only the run time matters

clear_output()
elapsed = datetime.datetime.now() - start_time
if elapsed > TIME_LIMIT:
    # Report the measured time so that a failure can be diagnosed from the traceback.
    raise Exception(f'speed test took {elapsed}, limit is {TIME_LIMIT}')

### get determinant test

In [12]:
# Build len(table)//2 distinct (0, 1, 2) key rows, each paired with a random
# 'target' value, so that the three key columns together determine 'target'.
df_1 = pd.DataFrame(random.sample(list(it.product(range(10), range(10, 20), range(20, 30))),
                                  k=len(table)//2))
df_2 = pd.DataFrame(rng.integers(1, 100, size=len(table)//2), columns=['target'])
df_3 = pd.concat([df_1, df_2], axis=1)

# Resample with replacement up to len(table) rows; repeated rows keep the
# key -> target mapping intact.
df_3 = df_3.sample(len(table), replace=True).reset_index(drop=True)

df = pd.concat([table, df_3], axis=1)

# Shuffle the rows, then the column order.
df = df.sample(frac=1)

idxs = list(range(df.shape[1]))
random.shuffle(idxs)
df = df.iloc[:, idxs]

fta = FlatTableAnalysis(df)
answer = (
    fta
    .get_cols_determinants(target='target', max_cols=3)
    .loc[lambda res: res['frac'] == 1]
    .loc[:, 'col_names']
)
# Exactly one column set is expected to reach frac == 1. Comparing a list of
# sets (instead of unpacking the Series into set(...)) makes zero or several
# matches fail as a readable assertion rather than as a TypeError.
found = [set(cols) for cols in answer]
assert found == [{'0', '1', '2'}], f'unexpected full determinants: {found}'
clear_output()

### set relation test

In [13]:
# Same construction as in the determinant test: distinct (0, 1, 2) key rows,
# each paired with one random 'target' value.
n_distinct = len(table) // 2
key_rows = random.sample(
    list(it.product(range(10), range(10, 20), range(20, 30))),
    k=n_distinct,
)
target_col = pd.DataFrame(rng.integers(1, 100, size=n_distinct), columns=['target'])
lookup = pd.concat([pd.DataFrame(key_rows), target_col], axis=1)

# Resample with replacement to the length of the titanic table.
lookup = lookup.sample(len(table), replace=True).reset_index(drop=True)

# Join with titanic, shuffle the rows, then shuffle the column order.
df = pd.concat([table, lookup], axis=1).sample(frac=1)
col_order = list(range(df.shape[1]))
random.shuffle(col_order)
df = df.iloc[:, col_order]

fta = FlatTableAnalysis(df)
# The printed summary should contain:
# nodes: left fd -> 1.00000
# edges: left fd -> 1.00000
fta.show_set_relation(['0', '1', '2'], 'target')

remove_constant_columns:  []
remove_all_unique_columns:  []
remove_one_one_relations:  ['embarked', 'class', 'survived']
    found these sets of one-one relations, keep only 1st item from each:  [['embark_town', 'embarked'], ['pclass', 'class'], ['alive', 'survived']]
left unique 345, right unique 95, edges 345 (0.01053%)
nodes: left fd -> 1.00000, right fd -> 0.10526
edges: left fd -> 1.00000, right fd -> 0.02899


# Sandbox

In [None]:
# Tiny table with constant columns (0 and 4), an all-unique column (1) and
# columns that mirror each other (2, 3 and 5).
raw_rows = """
1 1 1 1 b 4
1 2 2 2 b 5
1 3 2 2 b 5
"""
test = pd.DataFrame(list(map(str.split, raw_rows.strip().splitlines())))
display(test)
FlatTableAnalysis(test)

In [None]:
# Single object-dtype column whose first cell is replaced by a bare object()
# instance, so the column mixes an arbitrary Python object with integers.
test = pd.DataFrame([[1], [2], [2]]).astype({0: object})
test.iloc[0, 0] = object()
display(test)
fta = FlatTableAnalysis(test)  # removed a stray trailing "z" that was a syntax error
fta

In [None]:
# Degenerate input: every cell holds the same value.
raw_rows = """
1 1 1
1 1 1
1 1 1
"""
test = pd.DataFrame(list(map(str.split, raw_rows.strip().splitlines())))
display(test)
FlatTableAnalysis(test)

In [None]:
# Degenerate input: a 2x3 float frame in which every value is overwritten with NaN.
seed_values = [[1, 2, 3], [4, 5, 6]]
test = pd.DataFrame(seed_values).astype(float)
test[:] = np.nan
display(test)
FlatTableAnalysis(test)

In [None]:
# Degenerate input: a 2x3 object frame in which every value is overwritten with None.
seed_values = [[1, 2, 3], [4, 5, 6]]
test = pd.DataFrame(seed_values).astype(object)
test[:] = None
display(test)
fta = FlatTableAnalysis(test)
fta

In [None]:
# Candidate-key search on a small table whose third column has missing
# values in its first two rows.
raw_rows = """
1 2 4
1 3 4
1 3 5
1 3 6
1 3 7
"""
test = pd.DataFrame(list(map(str.split, raw_rows.strip().splitlines())))
test.iloc[[0, 1], 2] = np.nan
display(test)
fta = FlatTableAnalysis(test)
# Show the frame as stored on the analysis object next to the raw input.
display(fta.df)
fta.get_candidate_keys()

In [None]:
# Small three-column table used to eyeball the graph from show_fd_graph.
raw_rows = """
1 2 1
1 3 2
2 4 3
2 4 5
2 4 5
2 6 6
"""
test = pd.DataFrame(list(map(str.split, raw_rows.strip().splitlines())))
display(test)
fta = FlatTableAnalysis(test)
display(fta.df)
# Only the first item of the returned value is displayed.
fd_graph_result = fta.show_fd_graph()
fd_graph_result[0]

In [None]:
# Two-column table mixing one-to-one, one-to-many and many-to-one pairs,
# inspected with show_set_relation called without arguments.
raw_rows = """
1 1
2 2
3 3
3 4
4 5
5 5
6 7
7 7
7 9
"""
test = pd.DataFrame(list(map(str.split, raw_rows.strip().splitlines())))
display(test)
fta = FlatTableAnalysis(test)
display(fta.df)
fta.show_set_relation()

In [None]:
# Four-column table for trying get_cols_determinants with positional arguments.
raw_rows = """
1 2 9 1
1 2 1 2
1 3 2 1
1 4 4 1
2 5 3 1
2 5 3 2
"""
test = pd.DataFrame(list(map(str.split, raw_rows.strip().splitlines())))
display(test)
fta = FlatTableAnalysis(test)
display(fta.df)
# NOTE(review): the tests above pass target= and max_cols= by keyword; the
# positional order used here is assumed to match - confirm against the signature.
fta.get_cols_determinants(2, 2)