## Sample cleaning scripts

This notebook contains several sample usages of the error detection API.

In [1]:
import errorAPI
import time

# Ask the API for the dataset names it knows about; the result is kept in
# `list_of_datasets` and reused by a later cell.
list_of_datasets = errorAPI.Dataset.list_datasets()
print("Available datasets:")
# display() gives the notebook's rich rendering of the returned object.
display(list_of_datasets)
print("Amount of datasets:",len(list_of_datasets))

Available datasets:


['beers',
 'company',
 'eeg',
 'flights',
 'hospital',
 'kdd',
 'movie',
 'movies',
 'rayyan',
 'restaurant',
 'restaurants',
 'salaries',
 'salaries_small',
 'tax',
 'toy',
 'university',
 'uscensus']

Amount of datasets: 17


### List the available tools and load a sample dataset

In [2]:
# Tool factory; later cells call creator.createTool(...) to build tools by name.
creator = errorAPI.ToolCreator()
# NOTE(review): the recorded output suggests list_tools() also prints the
# tool names itself — confirm in errorAPI.
tools = creator.list_tools()

# Load the "beers" sample dataset and print two basic facts about it.
dataset_dictionary = {"name": "beers"}
d = errorAPI.Dataset(dataset_dictionary)
print("Data quality:", d.get_data_quality())
print("Num rows:", len(d.dataframe))

Available tools:
['ActiveClean', 'dBoost', 'FAHES', 'FDchecker', 'ForbiddenItemSets', 'KATARA', 'Raha', 'Regex']
Data quality: 0.835458317615994
Num rows: 2410


In [3]:
# Plain-text, single-line print of the dataset names obtained in the first cell.
print(list_of_datasets)

['beers', 'company', 'eeg', 'flights', 'hospital', 'kdd', 'movie', 'movies', 'rayyan', 'restaurant', 'restaurants', 'salaries', 'salaries_small', 'tax', 'toy', 'university', 'uscensus']


### Default tool configs
Run every tool with its default configuration on the loaded dataset and show each tool's help output.

In [4]:
# Disabled by default: change the guard to True to run every tool in `tools`
# with an empty configuration on dataset `d`, printing its help and scores.
if False:
    for tool_name in tools:
        print("-="*10)
        tool = creator.createTool(tool_name, [])
        tool.help()
        results = tool.run(d)
        try:
            errorAPI.Dataset.print_scores(d, results)
        except Exception as exc:
            # Scoring stays best-effort (the loop moves on to the next tool),
            # but the failure is reported instead of being silently dropped.
            # Catching Exception rather than using a bare `except` also lets
            # KeyboardInterrupt stop the loop.
            print("Could not score {}: {!r}".format(tool_name, exc))
        print()

### Example tool configs
Run the example configurations that each selected tool provides and report the best-scoring one.

In [5]:
# Names of the tools whose example configurations should be tried. Left empty,
# so the search below is a no-op by default; earlier candidates were
# "dBoost", "FAHES", "Raha" and "ForbiddenItemSets".
run_examples_from_tools = []

# Best result seen so far, ranked by the third value returned by print_scores.
best_tool, best_config, max_f1 = "", {}, 0

for tool_name in run_examples_from_tools:
    # A tool built with an empty config is used only to read its example configs.
    for config in creator.createTool(tool_name, []).example_configurations:
        tool = creator.createTool(tool_name, config)
        results = tool.run(d)

        cprec, crec, cf1, prec, rec, f1 = errorAPI.Dataset.print_scores(d, results)

        # Strictly-greater comparison: the first configuration reaching a score wins ties.
        if cf1 > max_f1:
            best_tool, best_config, max_f1 = tool_name, config, cf1

print("Max F1:", max_f1)
print("Tool:", best_tool)
print("Best config:", best_config)

Max F1: 0
Tool: 
Best config: {}


### Show sample repairs
Show the differences between the detected errors and the real repaired dataset.

In [6]:
## Print the edited values

# Disabled by default; change the guard to True to run it. Uses `results`
# from an earlier tool run and the dataset `d`.
if False:
    # NOTE(review): presumably this populates d.repaired_dataframe, marking the
    # cells listed in `results` with a placeholder value — confirm in errorAPI.
    d.create_repaired_dataset(results)
    # Boolean row mask: rows in which at least one cell equals the placeholder string.
    my_repaired_index = (d.repaired_dataframe == "JUST A DUMMY VALUE").any(axis=1)

    print("Original:")
    display(d.dataframe[my_repaired_index].head(5))
    
    print("Attempt to detect:")
    display(d.repaired_dataframe[my_repaired_index].head(5))
    
    print("Real cleaned:")
    # Rebuild the repaired frame from d.actual_errors_dictionary (presumably the
    # ground truth — confirm). The mask computed above is reused on purpose, so
    # the same rows appear in all three views; do not reorder these statements.
    d.create_repaired_dataset(d.actual_errors_dictionary)
    display(d.repaired_dataframe[my_repaired_index].head(5))



### Try out single tools

In [22]:
# Replace the working dataset `d` with "uscensus" for the single-tool run below.
dataset_dictionary = {"name": "uscensus"}
d = errorAPI.Dataset(dataset_dictionary)

In [None]:
##### Try out
# Run a single tool with a hand-picked configuration on dataset `d`, print its
# help, result count and scores, and report how long the whole run took.
# The `if True:` guard is a manual on/off switch for this cell.
if True:
    tool_name = "ForbiddenItemSets"
    
    config = {
        "Tau": 0.7
    }
    # Alternative configurations kept for quick experimentation:
#     config = {"Params": ["mixture", "3", "0.7"]}
#     config = {"Params": ["histogram", "0.1", "0.1"]}
#     config = {"Params": ["gaussian", "1.5"]}
#     config = {}
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed time; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    tool = creator.createTool(tool_name, config)
    tool.help()
    results = tool.run(d)
    print("Results len:", len(results))
    errorAPI.Dataset.print_scores(d, results)
    print("This took {:.2f} seconds".format(time.perf_counter() - start))


Configuration arguments:
Examples: 


In [None]:
# NOTE(review): found_left_col, found_left_val, found_right_col and
# found_right_val are not assigned in any cell visible in this notebook; on a
# fresh kernel this cell raises NameError unless they are defined elsewhere.
# Looks like leftover debugging — confirm, then define the names or drop the cell.
print(found_left_col, "<=>", found_left_val)
print(found_right_col, "<=>", found_right_val)


In [None]:
# NOTE(review): `cur_line` is not assigned in any cell visible in this notebook;
# on a fresh kernel this raises NameError unless it is defined elsewhere.
# Looks like leftover debugging — confirm, then define it or drop the cell.
cur_line