## Sample cleaning scripts

This notebook contains several sample usages of the error detection API.

In [1]:
import os

import errorAPI

# Ask the API which datasets it can list, then show the names and their count.
list_of_datasets = errorAPI.Dataset.list_datasets()
dataset_count = len(list_of_datasets)

print("Available datasets:")
display(list_of_datasets)
print("Amount of datasets:", dataset_count)

Available datasets:


['beers',
 'company',
 'eeg_major',
 'eeg_minor',
 'eeg_uniform',
 'flights',
 'hospital',
 'kdd_major',
 'kdd_minor',
 'kdd_uniform',
 'movie',
 'movies',
 'rayyan',
 'restaurant',
 'restaurants',
 'tax',
 'toy',
 'university',
 'uscensus_major',
 'uscensus_minor',
 'uscensus_uniform']

Amount of datasets: 21


### List available tools and load a sample dataset

In [2]:
# Tool factory plus whatever `list_tools()` returns; later cells iterate over
# `tools` as tool names.
creator = errorAPI.ToolCreator()
tools = creator.list_tools()

# Load the "flights" dataset and report its quality score and row count.
dataset_dictionary = dict(name="flights")
d = errorAPI.Dataset(dataset_dictionary)

print("Data quality:", d.get_data_quality())
print("Num rows:", len(d.dataframe))

Available tools:
['dBoost', 'FAHES', 'FDchecker', 'ForbiddenItemSets', 'Raha', 'Regex']
Data quality: 0.7041847041847042
Num rows: 2376


### Default tool configs
Run each tool with its default configuration on the loaded dataset and show the tool's help output.

In [4]:
# Run every tool in `tools` with an empty configuration on dataset `d`,
# printing each tool's help output and, where possible, its scores.
# Disabled by default: change the guard to True to execute it.
if False:
    for tool_name in tools:
        print("-="*10)
        tool = creator.createTool(tool_name, [])
        tool.help()
        results = tool.run(d)
        try:
            errorAPI.Dataset.print_scores(d, results)
        except Exception as score_error:
            # Scoring stays best-effort so one failing tool does not stop the
            # loop, but the failure is reported instead of silently dropped.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print("Could not score", tool_name, "-", repr(score_error))
        print()

### Example tool configs
Run more example configurations specified for the different tools

In [5]:
# Names of the tools whose example configurations should be tried.
# An empty list skips the search entirely; to try several tools use e.g.
#     ["dBoost", "FAHES", "Raha", "ForbiddenItemSets"]
run_examples_from_tools = []

# Best configuration seen so far, ranked by the third score from print_scores.
best_tool, best_config, max_f1 = "", {}, 0

for tool_name in run_examples_from_tools:
    # Create the tool once with an empty config just to read its example configs.
    example_configs = creator.createTool(tool_name, []).example_configurations
    for config in example_configs:
        tool = creator.createTool(tool_name, config)
        results = tool.run(d)

        cprec, crec, cf1, prec, rec, f1 = errorAPI.Dataset.print_scores(d, results)

        # Strict comparison: on a tie the earlier configuration is kept.
        if cf1 > max_f1:
            best_tool, best_config, max_f1 = tool_name, config, cf1

print("Max F1:", max_f1)
print("Tool:", best_tool)
print("Best config:", best_config)

Max F1: 0
Tool: 
Best config: {}


### Show sample repairs
Show the differences between the detected errors and the real repaired dataset

In [6]:
# Preview the rows flagged by the detection results next to the ground truth.
# Disabled by default: change the guard to True once `results` holds a tool's output.
if False:
    # Build the repaired frame from the detections, then select every row that
    # contains the literal "JUST A DUMMY VALUE" in at least one column.
    d.create_repaired_dataset(results)
    detected_frame = d.repaired_dataframe
    my_repaired_index = (detected_frame == "JUST A DUMMY VALUE").any(axis=1)

    print("Original:")
    display(d.dataframe[my_repaired_index].head(5))

    print("Attempt to detect:")
    display(detected_frame[my_repaired_index].head(5))

    # Rebuild the repaired frame from the known errors and show the same rows.
    print("Real cleaned:")
    d.create_repaired_dataset(d.actual_errors_dictionary)
    display(d.repaired_dataframe[my_repaired_index].head(5))



### Try out single tools

In [None]:
# Scratch cell: run a single tool with one hand-picked configuration.
# Alternative dBoost parameter sets to swap in:
#     ["histogram", "0.1", "0.1"]
#     ["gaussian", "1.5"]
# Disabled by default: change the guard to True to execute it.
if False:
    tool_name = "dBoost"
    config = dict(Params=["mixture", "3", "0.7"])

    tool = creator.createTool(tool_name, config)
    tool.help()

    results = tool.run(d)
    errorAPI.Dataset.print_scores(d, results)


## Experiment run
This will run all the given example configurations on the datasets specified.
When a single experiment is done, its result is uploaded to the specified SQL schema, in the table 'results'.

In [9]:
# Connection string handed to the experiment.
# Read it from the ERROR_DETECTION_DB_URL environment variable so real
# credentials stay out of the notebook; the fallback is the local default
# that was previously hard-coded here.
sql_string = os.environ.get(
    "ERROR_DETECTION_DB_URL",
    "postgresql://postgres:postgres@localhost:5432/error_detection",
)

# Build an experiment from the tools' example configurations.
experiment = errorAPI.Experiment.create_example_configs(sql_string)

Available tools:
['dBoost', 'FAHES', 'FDchecker', 'ForbiddenItemSets', 'Raha', 'Regex']
Creating dBoost
Creating FAHES
Creating FDchecker
Creating FDchecker
Creating ForbiddenItemSets
Creating Raha
Creating Regex
Creating Regex


In [10]:
# Datasets the experiment is pointed at, in this order.
experiment.datasets = [
    "flights",
    "toy",
    "beers",
    "rayyan",
    "movies",
    "restaurant",
    "uscensus_major",
    "eeg_major",
]

In [11]:
# Start the experiment. The recorded output shows it going dataset by dataset
# and tool configuration by tool configuration, so this can take a long time;
# the saved run was stopped by hand (KeyboardInterrupt).
experiment.run()

Running all experiments
Testing on: flights
Tool: dBoost - {'Params': ['histogram', '0.1', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.5']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.7']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.9']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.5']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.7']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.9']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.5', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.5', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.5', '0.5']}
Creating dBoost
Tool: dBoost - {'Params': ['histo

Tool: ForbiddenItemSets - {'Tau': 0.3}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.4}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.5}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.6}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.7}
Creating ForbiddenItemSets
Tool: Raha - {}
Creating Raha
Tool: Regex - {}
Creating Regex
Testing on: beers
Tool: dBoost - {'Params': ['histogram', '0.1', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.5']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.7']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.9']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.3', '0.5']}
Creating dBoost
Tool: dBoost - {'Param

Tool: FAHES - {'Algo': 1, 'AllMissing': False}
Creating FAHES
Tool: FAHES - {'Algo': 2, 'AllMissing': False}
Creating FAHES
Tool: FAHES - {'Algo': 3, 'AllMissing': False}
Creating FAHES
Tool: FAHES - {'Algo': 4, 'AllMissing': False}
Creating FAHES
Tool: FDchecker - {}
Creating FDchecker
Tool: ForbiddenItemSets - {'Tau': 0.1}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.2}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.3}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.4}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.5}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.6}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.7}
Creating ForbiddenItemSets
Tool: Raha - {}
Creating Raha
Tool: Regex - {}
Creating Regex
Testing on: movies
Tool: dBoost - {'Params': ['histogram', '0.1', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['histogram', '0.1', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': [

Tool: dBoost - {'Params': ['mixture', '2', '0.9']}
Creating dBoost
Tool: dBoost - {'Params': ['mixture', '3', '0.1']}
Creating dBoost
Tool: dBoost - {'Params': ['mixture', '3', '0.3']}
Creating dBoost
Tool: dBoost - {'Params': ['mixture', '3', '0.5']}
Creating dBoost
Tool: dBoost - {'Params': ['mixture', '3', '0.7']}
Creating dBoost
Tool: dBoost - {'Params': ['mixture', '3', '0.9']}
Creating dBoost
Tool: FAHES - {'Algo': 1, 'AllMissing': False}
Creating FAHES
Tool: FAHES - {'Algo': 2, 'AllMissing': False}
Creating FAHES
Tool: FAHES - {'Algo': 3, 'AllMissing': False}
Creating FAHES
Tool: FAHES - {'Algo': 4, 'AllMissing': False}
Creating FAHES
Tool: FDchecker - {}
Creating FDchecker
Tool: ForbiddenItemSets - {'Tau': 0.1}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.2}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.3}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.4}
Creating ForbiddenItemSets
Tool: ForbiddenItemSets - {'Tau': 0.5}
Creatin


KeyboardInterrupt



In [None]:
# Display the experiment's `results_df` attribute
# (presumably the results collected by the run above; confirm in errorAPI).
experiment.results_df