Notebook to generate data issues

In [None]:
# 1 missing_columns => one or some of the csv files are missing a required column.
# 2 missing_values => one or more values in a row are nulls.
# 3 unknown_categorical_values => expected value "dinner", "lunch", new value: "drinks"
# 4 unknown_numeric_values => wrong value for a feature, e.g:-1 for tips
# 5 bad_data_type_values => string value in a numerical column and vice versa.
# 6 bad_csv_encoding => having unreadable character in CSV file e.g: à, ù, Ö, Ü.
# 7 bad_delimiter => one or some csv files are seperated by other delimiter e.g: TSV using \t.
# 8 missing_header => one or more header are missing.

In [1]:
import pandas as pd

data2 = {
    "total_bill": [16.99, 10.34, -21.01, 23.68, 24.59],
    "tips": [1.01, None, 3.5, 3.31, 3.61],
    "sex": ["Female", "Male", "Male", "Mäle", "Female"],
    "smoker": [False, False, True, True, True],
    "day": ["Sat", "Sat", "Sun", "Sun", "Mon"],
    "time": ["Dinner", "Lunch", "Lunch", "Dinner", "Drinks"],
    "size": ["two", 3, 3, 200, 4],
}

df_missing_column = pd.read_csv(
    "../../tests/resources/test_folder_1/db_errors_test_folder/test_missing_column.csv"
)
df_error2 = pd.DataFrame(data2)
df_bad_csv = pd.read_csv(
    "../../tests/resources/test_folder_1/db_errors_test_folder/test_bad_csv_errors.csv"
)
print(df_missing_column)
print()
print(df_error2)
print()
print(df_bad_csv)

    tips   sex  smoker day    time  size
0  16.99  1.01  Female  No  Dinner     2
1  10.34  1.66    Male  No  Dinner     3
2  21.01  3.50    Male  No  Dinner     3
3  23.68  3.31    Male  No   Lunch     2
4  24.59  3.61  Female  No   Lunch     4

   total_bill  tips     sex  smoker  day    time size
0       16.99  1.01  Female   False  Sat  Dinner  two
1       10.34   NaN    Male   False  Sat   Lunch    3
2      -21.01  3.50    Male    True  Sun   Lunch    3
3       23.68  3.31    Mäle    True  Sun  Dinner  200
4       24.59  3.61  Female    True  Mon  Drinks    4

  total_bill\ttip\tsex\tsmoker\t\ttime\tsize
0    16.99\t1.01\tFemæle\tNo\tSun\tDinner\t2
1      10.34\t1.66\tMale\tNo\tSun\tDinner\t3
2       21.01\t3.5\tMàle\tNo\tSun\tDinner\t3
3      23.68\t3.31\tMale\tNo\tSun\tDinner\t2
4    24.59\t3.61\tFemale\tNo\tSun\tDinner\t4


In [9]:
import pandas as pd

In [4]:
# set up empty error dict
empty_errors = {
    "missing_header": None,
    "bad_delimiter": None,
    "bad_csv_encoding": None,
    "missing_columns": None,
    "missing_values": None,
    "unknown_categorical_values": None,
    "unknown_numeric_values": None,
    "bad_data_type_values": None,
}

In [7]:
# check if csv file have errors (missing header or bad delimiter)
def check_csv_error(filepath):
    errors = empty_errors
    expected_headers = ["total_bill", "tip", "sex", "smoker", "day", "time", "size"]
    with open(filepath, "r", encoding="utf-8") as file:
        header_line = file.readline().strip()
        data_line = file.readline().strip()
    delimiters = [",", "\t", ";", "|", " "]
    delimiter_counts = {d: header_line.count(d) for d in delimiters}
    detected_delimiter = max(delimiter_counts, key=delimiter_counts.get)
    detected_headers = [h.strip() for h in header_line.split(detected_delimiter)]
    missing_headers = list(set(expected_headers) - set(detected_headers))
    # bad delimiter checker
    if detected_delimiter != ",":
        errors["bad_delimiter"] = detected_delimiter
    # missing column and header checker
    if missing_headers:
        data_columns = data_line.split(detected_delimiter)
        num_columns = max(len(detected_headers), len(data_columns))
        if num_columns < 7:
            errors["missing_columns"] = 7 - num_columns
            # idk if to stop in cases this severe and manually deal with it or just continue to detect errors
        else:
            errors["missing_header"] = missing_headers
    ########################
    # TODO: check encoding
    ########################
    ########################
    # start checking for data problems
    # may separate check errors code here, since delimiter is now detected
    df = pd.read_csv(file, delimiter=detected_delimiter)
    # check how many values are missing
    if df.isnull().values.any():
        amount_missing = df.isnull().sum().sum()
        errors["missing_values"] = amount_missing
    ########
    # unknown_categorical_values seem to need comparison with a standard, TODO
    # https://www.kaggle.com/code/manishkc06/handling-unknown-categories-in-dataset
    ########
    # bad_data_type_values, unknown_numeric_values: check next block
    return errors


print(
    check_csv_error(
        "../../tests/resources/test_folder_1/db_errors_test_folder/test_bad_csv_errors.csv"
    )
)

{'missing_header': ['day'], 'bad_csv_format': '\t', 'bad_csv_encoding': None, 'missing_columns': None, 'missing_values': None, 'unknown_categorical_values': None, 'unknown_numeric_values': None, 'bad_data_type_values': None}


In [31]:
# check for bad_data_type_values, unknown_numeric_values
def check_bad_values(filepath):
    expected_types = {
        "total_bill": float,
        "tip": float,
        "sex": str,
        "smoker": str,
        "day": str,
        "time": str,
        "size": int,
    }

    df = pd.read_csv(filepath)
    bad_values = {}
    negative_values = {}

    for col, expected_type in expected_types.items():
        if expected_type == int:
            # int
            bad_values[col] = df.loc[
                ~df[col].astype(str).str.match(r"^-?\d+$", na=False), col
            ].tolist()
            negative_values[col] = df.loc[df[col] < 0, col].tolist()
        elif expected_type == float:
            # float
            bad_values[col] = df.loc[
                ~df[col].astype(str).str.match(r"^-?\d+(\.\d+)?$", na=False), col
            ].tolist()
            negative_values[col] = df.loc[df[col] < 0, col].tolist()
        elif expected_type == str:
            # str
            bad_values[col] = df.loc[
                df[col].astype(str).str.match(r"^-?\d+(\.\d+)?$", na=False), col
            ].tolist()

    bad_values = {col: vals for col, vals in bad_values.items() if vals}
    negative_values = {col: vals for col, vals in negative_values.items() if vals}

    if bad_values:
        print("Bad values:")
        for col, vals in bad_values.items():
            print(f"  - {col}: {vals}")
    else:
        print("No type errors")
    if negative_values:
        print("Negative values:")
        for col, vals in negative_values.items():
            print(f"  - {col}: {vals}")
    else:
        print("No negative")

In [32]:
check_bad_values(
    "../../tests/resources/test_folder_1/db_errors_test_folder/test_bad_data_type.csv"
)

TypeError: '<' not supported between instances of 'str' and 'int'

In [19]:
# code to test before putting into checkers
# currently: check bad value type
df = pd.read_csv(
    "../../tests/resources/test_folder_1/db_errors_test_folder/test_bad_data_type.csv"
)
print(df)
df.head()
df.loc[~df["size"].str.isdigit(), "size"].tolist()

   total_bill    tip     sex smoker  day    time size
0       16.99   1.01  Female     No    7  Dinner  two
1       10.34   1.66    Male      0  Sun  Dinner    3
2       21.01  three    Male     No  Sun  Dinner    3
3       23.68   3.31       1      1  Sun  Dinner    2
4       24.59   3.61  Female      1  Sun  Dinner    4


['two']