## Sample cleaning scripts

This notebook contains several usage examples of the error detection API.

In [1]:
import os

import errorAPI

# Fetch the dataset names known to the API, show them, then report how many there are.
list_of_datasets = errorAPI.Dataset.list_datasets()
print("Available datasets:")
display(list_of_datasets)
print(f"Amount of datasets: {len(list_of_datasets)}")

Available datasets:


['beers',
 'company',
 'eeg_major',
 'eeg_minor',
 'eeg_uniform',
 'flights',
 'hospital',
 'kdd_major',
 'kdd_minor',
 'kdd_uniform',
 'movie',
 'movies',
 'rayyan',
 'restaurant',
 'restaurants',
 'tax',
 'toy',
 'university',
 'uscensus_major',
 'uscensus_minor',
 'uscensus_uniform']

Amount of datasets: 21


### Listing available tools and load a sample dataset

In [2]:
# Tool factory; later cells call creator.createTool(...) on it.
creator = errorAPI.ToolCreator()
# NOTE(review): the recorded output shows an "Available tools:" listing although
# nothing in this cell prints it, so list_tools() presumably prints as well as
# returns the names — confirm.
tools = creator.list_tools()
# Dataset selector; only the dataset name is supplied.
dataset_dictionary = {
    "name": "flights",
}
# `d` is the dataset object every later cell runs tools against.
d = errorAPI.Dataset(dataset_dictionary)
print("Data quality:", d.get_data_quality())
print("Num rows:", len(d.dataframe))

Available tools:
['dBoost', 'FAHES', 'FDchecker', 'ForbiddenItemSets', 'Raha', 'Regex']
Data quality: 0.7041847041847042
Num rows: 2376


In [3]:
def print_scores(d, results):
    """Print cell-level and row-level scores for `results` and return them.

    `d` must provide `evaluate_data_cleaning(results)` and
    `evaluate_detection_row_wise(results)`; the first three items of each
    return value are treated as precision, recall and F1, in that order.

    Returns a 6-tuple:
    (cell precision, cell recall, cell F1, row precision, row recall, row F1).
    """
    def _report(prefix, triple):
        # Pull out the three metrics, print them on one line, hand them back.
        precision, recall, f_score = triple[0], triple[1], triple[2]
        print(prefix + str(precision) + "\t Recall=" + str(recall) + "\t F1=" + str(f_score))
        return (precision, recall, f_score)

    cell_metrics = _report("Cell Score:\t Precision=", d.evaluate_data_cleaning(results))
    row_metrics = _report("Row Score:\t Precision=", d.evaluate_detection_row_wise(results))

    return cell_metrics + row_metrics

### Default tool configs
Run each tool with its default configuration on the loaded dataset and show its help text.

In [4]:
# Run every available tool with an empty (default) configuration, printing its
# help text and its scores. Disabled by default; change the condition to run it.
if False:
    for tool_name in tools:
        print("-="*10)
        tool = creator.createTool(tool_name, [])
        tool.help()
        results = tool.run(d)
        try:
            print_scores(d, results)
        except Exception as exc:
            # Scoring stays best-effort so one failing tool does not stop the
            # loop, but the failure is reported instead of silently dropped.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt the cell.
            print("Could not score " + str(tool_name) + ": " + repr(exc))
        print()

### Example tool configs
Run more example configurations specified for the different tools

In [5]:
# Tools whose example configurations should be tried. A candidate list is kept
# commented out; with the empty list below the loop body never executes.
# run_examples_from_tools = [
#     "dBoost", 
#     "FAHES", 
#     "Raha",
#     "ForbiddenItemSets",
# ]
run_examples_from_tools = []

# Running best: tool name, its configuration, and the highest cell-level F1 seen.
best_tool = ""
best_config = {}
max_f1 = 0

for tool_name in run_examples_from_tools:
    # A tool created with an empty configuration is used only to read its
    # `example_configurations`; each example is then run as its own tool.
    for config in creator.createTool(tool_name, []).example_configurations:
        tool = creator.createTool(tool_name, config)
        results = tool.run(d)
        
        (cprec, crec, cf1, prec, rec, f1) = print_scores(d, results)
        
        # Rank by cell-level F1 (third value returned by print_scores); the
        # strict '>' keeps the earliest configuration on ties.
        if cf1 > max_f1:
            best_tool = tool_name
            best_config = config
            max_f1 = cf1
#         break

print("Max F1:", max_f1)
print("Tool:", best_tool)
print("Best config:", best_config)

Max F1: 0
Tool: 
Best config: {}


### Show sample repairs
Show how the detected errors differ from the actual repaired dataset.

In [6]:
## Print the edited values

# Disabled by default; change the condition to run it. Needs `results` from a
# previous tool run.
if False:
    d.create_repaired_dataset(results)
    # Boolean row mask: rows in which at least one cell equals the placeholder.
    # NOTE(review): presumably create_repaired_dataset writes the values from
    # `results` into d.repaired_dataframe — confirm.
    my_repaired_index = (d.repaired_dataframe == "JUST A DUMMY VALUE").any(axis=1)

    print("Original:")
    display(d.dataframe[my_repaired_index].head(5))
    
    print("Attempt to detect:")
    display(d.repaired_dataframe[my_repaired_index].head(5))
    
    print("Real cleaned:")
    # Rebuild the repaired frame from d.actual_errors_dictionary (presumably the
    # ground truth — confirm); the mask computed above is reused so the same
    # rows are displayed.
    d.create_repaired_dataset(d.actual_errors_dictionary)
    display(d.repaired_dataframe[my_repaired_index].head(5))



### Try out single tools

In [13]:
##### Try out
# Run one tool with one hand-picked configuration, show its help text and
# print its scores. The `if True:` wrapper is kept on purpose: as the last
# top-level expression, print_scores(...) would otherwise have its returned
# tuple echoed as cell output.
if True:
    tool_name = "dBoost"
    # Alternative configuration that was tried: {"Params": ["mixture", "2", "0.7"]}
    # (previously assigned here and immediately overwritten by the line below).
    config = {"Params": ["histogram", "0.1", "0.1"]}
    tool = creator.createTool(tool_name, config)
    tool.help()
    results = tool.run(d)
    print_scores(d, results)


Creating dBoost
Configuration arguments:
Examples: 
{"Params": ["gaussian","1.5"]}
{"Params": ["histogram", "1.5", "2.0"]}
{"Params": ["mixture", "2", "0.3"]}


>> Finding correlations
>> Building model...


Cell Score:	 Precision=0.15090673575129535	 Recall=0.09471544715447154	 F1=0.1163836163836164
Row Score:	 Precision=0.7950617283950617	 Recall=0.32723577235772355	 F1=0.4636429085673146


>> Finding outliers...
Time 0 1.3634488000006968
Runtime  1.5530548000006092


In [8]:
# Echo the output of the most recent tool run. In the recorded output it is a
# dict keyed by integer pairs (presumably (row, column) — confirm) whose values
# are all the placeholder string.
results

{(0, 1): 'JUST A DUMMY VALUE',
 (1, 1): 'JUST A DUMMY VALUE',
 (2, 1): 'JUST A DUMMY VALUE',
 (3, 1): 'JUST A DUMMY VALUE',
 (4, 1): 'JUST A DUMMY VALUE',
 (5, 1): 'JUST A DUMMY VALUE',
 (5, 5): 'JUST A DUMMY VALUE',
 (6, 0): 'JUST A DUMMY VALUE',
 (6, 1): 'JUST A DUMMY VALUE',
 (7, 0): 'JUST A DUMMY VALUE',
 (7, 1): 'JUST A DUMMY VALUE',
 (8, 0): 'JUST A DUMMY VALUE',
 (8, 1): 'JUST A DUMMY VALUE',
 (9, 0): 'JUST A DUMMY VALUE',
 (9, 1): 'JUST A DUMMY VALUE',
 (10, 1): 'JUST A DUMMY VALUE',
 (11, 1): 'JUST A DUMMY VALUE',
 (12, 1): 'JUST A DUMMY VALUE',
 (13, 1): 'JUST A DUMMY VALUE',
 (14, 1): 'JUST A DUMMY VALUE',
 (15, 1): 'JUST A DUMMY VALUE',
 (16, 0): 'JUST A DUMMY VALUE',
 (16, 1): 'JUST A DUMMY VALUE',
 (17, 0): 'JUST A DUMMY VALUE',
 (17, 1): 'JUST A DUMMY VALUE',
 (18, 0): 'JUST A DUMMY VALUE',
 (18, 1): 'JUST A DUMMY VALUE',
 (19, 0): 'JUST A DUMMY VALUE',
 (19, 1): 'JUST A DUMMY VALUE',
 (20, 1): 'JUST A DUMMY VALUE',
 (21, 1): 'JUST A DUMMY VALUE',
 (22, 1): 'JUST A DUMMY

In [9]:
# NOTE(review): looks like a deliberate stop so that "Run All" does not continue
# into the experiment cells further down — confirm, and consider raising with
# an explanatory message.
raise Exception

Exception: 

In [None]:


Cell Score:	 Precision=0.1483409238776838	 Recall=0.09268292682926829	 F1=0.11408556417312984
Row Score:	 Precision=0.7950617283950617	 Recall=0.32723577235772355	 F1=0.4636429085673146

## Experiment run
This runs all the example configurations on the specified datasets.
When a single experiment finishes, its result is uploaded to the 'results' table in the specified SQL schema.

In [None]:
# Connection string for the database that receives the experiment results.
# It can be overridden through the ERROR_DETECTION_DB_URL environment variable
# so that real credentials never have to be written into the notebook; the
# fallback is the original local connection string.
sql_string = os.environ.get(
    "ERROR_DETECTION_DB_URL",
    'postgresql://postgres:postgres@localhost:5432/error_detection',
)
experiment = errorAPI.Experiment.create_example_configs(sql_string)

In [None]:
# Names of the datasets assigned to the experiment, one per line for easy editing.
experiment.datasets = [
    "flights",
    "toy",
    "beers",
    "rayyan",
    "movies",
    "restaurant",
    "uscensus_major",
    "eeg_major",
]

In [None]:
# Start the experiment. Per this section's description, each finished
# experiment is uploaded to the SQL 'results' table.
experiment.run()

In [None]:
# Display the experiment's `results_df` attribute.
experiment.results_df