# NASA Defects - 01 - Import & Clean

## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint 

DEBUG = True
SEED = 666

In [2]:
DATASET = "jm1.csv"

import os, sys
COLAB = 'google.colab' in sys.modules
ROOT = "./"

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)


def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Import

In [3]:
df = pd.read_csv(ROOT+"orig/"+DATASET)
print(df.shape)
df.head(5)

(10878, 22)


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
0,447.0,826.0,12.0,157.0,470.0,385.0,113.0,2824.0,210.28,384.45,31079782.27,26.95,8441.0,0.0,1726654.57,80843.08,3021.0,5420.0,609.0,155.0,3442.0,1
1,0.0,211.0,0.0,0.0,128.0,104.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1129.0,1
2,164.0,485.0,10.0,58.0,268.0,219.0,39.0,1588.0,202.98,213.53,9254819.86,14.45,4828.0,0.0,514156.64,43342.31,1730.0,3172.0,407.0,102.0,1824.0,1
3,37.0,29.0,8.0,42.0,19.0,19.0,6.0,133.0,108.14,46.32,232043.52,1.67,685.0,0.02,12891.31,5009.32,295.0,390.0,121.0,38.0,222.0,1
4,11.0,405.0,0.0,17.0,404.0,2.0,1.0,814.0,101.2,206.01,4294926.45,6.95,2033.0,0.0,238607.05,20848.47,813.0,1220.0,811.0,411.0,844.0,1


## Check Values

### Null Value Checks

Here we can see that there are no apparent null values in the dataset.

In [4]:
print("Null values in the dataset: ", df.isnull().sum().sum())

Null values in the dataset:  0


### Check Cases & Features

We are expected to see 10878 cases and 24 features. But looking at the dataset I've found that there are 10878 cases, which is correct, but there are 21 features.

In [5]:
target = "label"
features = list(df.columns)
features.remove(target)
counts_features = [col for col in df.columns if "COUNT" in col or "LOC" in col or "NUM" in col]

print(f"Target: '{target}'")
print(f"Cases: {len(df)}")
print(f"Features: {len(features)}")

Target: 'label'
Cases: 10878
Features: 21


### Implausible Value Checks

Next I need to check if the dataset contains any implausible values. The document states the following:
1. `LOC_TOTAL` is never = 0
2. values are always positive ie >=0
3. count values are always integers

Results:
Based on the results from the following 3 checks, I can see that there are no implausible values in the dataset.

#### 1) `LOC_TOTAL` is never = 0

In [6]:
# Check that `LOC_TOTAL` is always positive and non-zero
check_1 = df[df["LOC_TOTAL"] <= 0]
print(f"Check 1 is {check_1.shape[0] == 0}")

Check 1 is True


#### 2) Values are always positive ie >=0

In [7]:
# Check that all columns are positive or zero
check_2 = True
for col in df.columns:
    if df[col].dtype in [np.float64, np.int64]:
        filter = df[col] < 0
        check_2 = check_2 and (len(df[filter]) == 0)
print(f"Check 2 is {check_2}")

Check 2 is True


#### 3) Count values are always integers

In [8]:
# Check that all count features are integers
check_3 = True
for col in counts_features:
    filter = (df[col] % 1 != 0)
    check_3 = check_3 and (len(df[filter]) == 0)
print(f"Check 3 is {check_3}")

Check 3 is True


#### Def is_implausible
This def checks for a given value and column, if it does not satisfy the above 3 conditions

In [9]:
# Check if a value is implausible for a given feature
def is_implausible(case_value, feature_name):
    if feature_name == "LOC_TOTAL":
        return case_value <= 0
    if feature_name in counts_features:
        return case_value % 1 != 0 or case_value < 0
    return case_value < 0

### Conflicting Values Checks

This def returns a list of filters that can be used to filter the dataset to find conflicting values.

It got a little out of hand because I wanted to print each filter when I tested tweaking the atol and rtol values.
The next cell will do the printing of the values I stuck with.

10. `HALSTEAD_LENGTH` = `NUM_OPERATORS` + `NUM_OPERANDS` 
    
13) `HALSTEAD_VOLUME` = (`NUM_OPERATORS` + `NUM_OPERANDS`) * log2(`NUM_UNIQUE_OPERATORS` + `NUM_UNIQUE_OPERANDS`)
14) `HALSTEAD_LEVEL` = (2 / `NUM_UNIQUE_OPERATORS`) * (`NUM_UNIQUE_OPERANDS` / `NUM_OPERANDS`)
15)  `HALSTEAD_DIFFICULTY` = (`NUM_UNIQUE_OPERATORS` / 2) * (`NUM_OPERANDS` / `NUM_UNIQUE_OPERANDS`)
16)  `HALSTEAD_CONTENT` = `HALSTEAD_VOLUME` / `HALSTEAD_DIFFICULTY`
17)  `HALSTEAD_EFFORT` = `HALSTEAD_VOLUME` * `HALSTEAD_DIFFICULTY`
18)  `HALSTEAD_PROG_TIME` = `HALSTEAD_EFFORT` / 18

In [10]:
def get_integrity_check_filters(df):
    
    # `HALSTEAD_LENGTH` = `NUM_OPERATORS` + `NUM_OPERANDS` 
    filter_10 = {
        "name" : "(10) HALSTEAD_LENGTH",
        "a" : df["HALSTEAD_LENGTH"],
        "b" : df["NUM_OPERATORS"] + df["NUM_OPERANDS"],
        "atol" : 0,
        "rtol" : 0,
        "filter" : None,
        "cols_involved" : ["NUM_OPERATORS", "NUM_OPERANDS", "HALSTEAD_LENGTH"]
    }

    # Checks for 0 values when doing the log calculation for check 13
    calculated_log_paramater = df.apply( lambda row:
        np.log2(row["NUM_UNIQUE_OPERATORS"] + row["NUM_UNIQUE_OPERANDS"]) 
        if row["NUM_UNIQUE_OPERATORS"] + row["NUM_UNIQUE_OPERANDS"] != 0 
        else 0
    , axis=1)

    # `HALSTEAD_VOLUME` = (`NUM_OPERATORS` + `NUM_OPERANDS`) * log2(`NUM_UNIQUE_OPERATORS` + `NUM_UNIQUE_OPERANDS`)
    filter_13 = {
        "name" : "(13) HALSTEAD_VOLUME",
        "a" : df["HALSTEAD_VOLUME"],
        "b" : (df["NUM_OPERATORS"] + df["NUM_OPERANDS"]) * calculated_log_paramater,
        "atol" : 0,
        "rtol" : 0.001,
        "filter" : None,
        "cols_involved" : ["NUM_OPERATORS", "NUM_OPERANDS", "NUM_UNIQUE_OPERATORS", "NUM_UNIQUE_OPERANDS", "HALSTEAD_VOLUME"]
    }

    # Checks for 0 values when doing the division for check 14
    calculated_division_parameter = df.apply( lambda row:
        (2 / row["NUM_UNIQUE_OPERATORS"]) * (row["NUM_UNIQUE_OPERANDS"] / row["NUM_OPERANDS"])
        if row["NUM_UNIQUE_OPERATORS"] != 0 and row["NUM_OPERANDS"] != 0
        else 0
    , axis=1)

    # `HALSTEAD_LEVEL` = (2 / `NUM_UNIQUE_OPERATORS`) * (`NUM_UNIQUE_OPERANDS` / `NUM_OPERANDS`)
    filter_14 = {
        "name" : "(14) HALSTEAD_LEVEL",
        "a" : df["HALSTEAD_LEVEL"],
        "b" : calculated_division_parameter,
        "atol" : 0,
        "rtol" : 0.1,
        "filter" : None,
        "cols_involved" : ["NUM_UNIQUE_OPERATORS", "NUM_OPERANDS", "NUM_UNIQUE_OPERANDS", "HALSTEAD_LEVEL"]
    }

    # Checks for 0 values when doing the division for check 15
    calculated_division_parameter_2 = df.apply( lambda row:
        (row["NUM_OPERANDS"] / row["NUM_UNIQUE_OPERANDS"])
        if row["NUM_UNIQUE_OPERANDS"] != 0
        else 0
    , axis=1)

    # `HALSTEAD_DIFFICULTY` = (`NUM_UNIQUE_OPERATORS` / 2) * (`NUM_OPERANDS` / `NUM_UNIQUE_OPERANDS`)
    filter_15 = {
        "name" : "(15) HALSTEAD_DIFFIC",
        "a" : df["HALSTEAD_DIFFICULTY"],
        "b" : ((df["NUM_UNIQUE_OPERATORS"] / 2) * calculated_division_parameter_2),
        "atol" : 0,
        "rtol" : 0.001,
        "filter" : None,
        "cols_involved" : ["NUM_OPERANDS", "NUM_UNIQUE_OPERANDS", "NUM_UNIQUE_OPERATORS", "HALSTEAD_DIFFICULTY"]
    }

    # checks for 0 values when doing the division for check 16
    calculated_division_parameter_3 = df.apply( lambda row:
        (row["HALSTEAD_VOLUME"] / row["HALSTEAD_DIFFICULTY"])
        if row["HALSTEAD_DIFFICULTY"] != 0
        else 0
    , axis=1)

    # `HALSTEAD_CONTENT` = `HALSTEAD_VOLUME` / `HALSTEAD_DIFFICULTY`
    filter_16 = {
        "name" : "(16) HALSTEAD_CONTENT",
        "a" : df["HALSTEAD_CONTENT"],
        "b" : calculated_division_parameter_3,
        "atol" : 0,
        "rtol" : 0.001,
        "filter" : None,
        "cols_involved" : ["HALSTEAD_VOLUME", "HALSTEAD_DIFFICULTY", "HALSTEAD_CONTENT"]
    }

    # `HALSTEAD_EFFORT` = `HALSTEAD_VOLUME` * `HALSTEAD_DIFFICULTY`
    filter_17 = {
        "name" : "(17) HALSTEAD_EFFORT",
        "a" : df["HALSTEAD_EFFORT"],
        "b" : (df["HALSTEAD_VOLUME"] * df["HALSTEAD_DIFFICULTY"]),
        "atol" : 0,
        "rtol" : 0.01,
        "filter" : None,
        "cols_involved" : ["HALSTEAD_VOLUME", "HALSTEAD_DIFFICULTY", "HALSTEAD_EFFORT"]
    }

    # `HALSTEAD_PROG_TIME` = `HALSTEAD_EFFORT` / 18
    filter_18 = {
        "name" : "(18) HALSTEAD_PROG_TIME",
        "a" : df["HALSTEAD_PROG_TIME"],
        "b" : (df["HALSTEAD_EFFORT"] / 18),
        "atol" : 0,
        "rtol" : 0.01,
        "filter" : None,
        "cols_involved" : ["HALSTEAD_EFFORT", "HALSTEAD_PROG_TIME"]
    }

    filters = [filter_10, filter_13, filter_14, filter_15, filter_16, filter_17, filter_18] 

    for filter in filters:
        filter["filter"] = np.isclose(filter["a"], filter["b"], atol=filter["atol"], rtol=filter["rtol"])

    return filters


### Testing the above filters

In [11]:
def assess_filter(df, filter, failure_rows):
    filter_df = df[~filter]
    filter_df_row_count = len(filter_df)
    new_filter_rows = 0
    for index, row in filter_df.iterrows():
        if index not in failure_rows:
            new_filter_rows += 1
            failure_rows.add(index)
    return filter_df_row_count, new_filter_rows

def TestFilters(df, filters):
    filter_objs = get_integrity_check_filters(df)

    failure_rows = set()

    for filter_obj in filter_objs:
        filter_df_row_count, new_filter_rows = assess_filter(df, filter_obj["filter"], failure_rows)

        print(f"{filter_obj['name']}\n"
        f"filter:\t{filter_df_row_count} rows ({new_filter_rows} new)\n"
        f"atol:\t{filter_obj['atol']}\trtol:\t{filter_obj['rtol']}\n"
        )

    print(f"Total rows with integrity check failures: {len(failure_rows)}")

TestFilters(df, get_integrity_check_filters(df))

(10) HALSTEAD_LENGTH
filter:	139 rows (139 new)
atol:	0	rtol:	0

(13) HALSTEAD_VOLUME
filter:	147 rows (10 new)
atol:	0	rtol:	0.001

(14) HALSTEAD_LEVEL
filter:	933 rows (869 new)
atol:	0	rtol:	0.1

(15) HALSTEAD_DIFFIC
filter:	253 rows (191 new)
atol:	0	rtol:	0.001

(16) HALSTEAD_CONTENT
filter:	211 rows (53 new)
atol:	0	rtol:	0.001

(17) HALSTEAD_EFFORT
filter:	0 rows (0 new)
atol:	0	rtol:	0.01

(18) HALSTEAD_PROG_TIME
filter:	0 rows (0 new)
atol:	0	rtol:	0.01

Total rows with integrity check failures: 1262


## Check data quality

### A - Identical features

Refers to a situation where two or more features contain identical values for all cases.

F1=F2=F3 ∧ F4=F5 =⇒ 3 features are identical so could be deleted.

Expected output: 0
 
Results: 0 - There are no identical features in the dataset.

In [12]:
checkA = 0

cols = df.columns
for i in range(len(cols)-2): # Go through colums 0 to n-2
    for j in range(i+1, len(cols)-1): # Go through columns 1 to n-1
        for k in range(j+1, len(cols)): # Go through columns 2 to n
            if df[cols[i]].equals(df[cols[j]]) and df[cols[j]].equals(df[cols[k]]):
                print(f"Columns {cols[i]}, {cols[j]} and {cols[k]} are identical")
                checkA += 1

print(f"Identical columns: {checkA}")

Identical columns: 0


### B - Constant features

Refers to features that contain the same value for every instance, i.e. add no information

Expected output: 0

Results: 0 - There are no constant features in the dataset.

In [13]:
checkB = 0

for col in df.columns:
    if len(df[col].unique()) == 1:
        print(f"Column {col} has the same value for all rows")
        checkB += 1

print(f"Columns with the same value for all rows: {checkB}")

Columns with the same value for all rows: 0


### C - Features with missing values

Counts the number of features that contain one or more missing observations

F1 has 10 missing values ∧ F3 has 3 missing values =⇒ 2 features contain missing values.

Expected output: 0

Results: 0 - There are no features with missing values in the dataset.

In [14]:
checkC = df.isnull().sum().sum()
for col in df.columns:
    missing_vals = df[col].isnull().sum()
    if missing_vals > 0:
        print(f"Column {col} has {missing_vals} missing values")

print(f"Total missing values: {checkC}")

Total missing values: 0


### D - Features with conflicting values

Counts features that violate some referential integrity constraint

F1 should equal F2+F3 but does not. We cannot say which feature is in error therefore =⇒ 3 problematic features.

Expected output: 9

Results: 11 - There are 11 features with conflicting values in the dataset according to the checks I've made. The checks I have are not the same as the ones in the document, but there are no way of knowing the methods used in the paper to check for conflicting values. This will pop up again in the check J.

In [15]:
conflicting_value_filters = get_integrity_check_filters(df)
conflicting_features = set()

for filter in conflicting_value_filters:
    filter_df = df[~filter["filter"]]
    conflicting_features.update(filter["cols_involved"])

checkD = len(conflicting_features)

print(f"Conflicting features (x{checkD}): {conflicting_features}")

Conflicting features (x11): {'HALSTEAD_DIFFICULTY', 'HALSTEAD_EFFORT', 'NUM_UNIQUE_OPERANDS', 'HALSTEAD_LEVEL', 'HALSTEAD_LENGTH', 'NUM_UNIQUE_OPERATORS', 'HALSTEAD_PROG_TIME', 'HALSTEAD_VOLUME', 'NUM_OPERANDS', 'HALSTEAD_CONTENT', 'NUM_OPERATORS'}


### E - Features with implausible values

Counts features that violate some integrity constraint

F1 should be non-negative but contains 1 or more instances < 0 =⇒ 1 problematic feature

Expected output: 0

Results: 0 - There are no features with implausible values in the dataset.

In [16]:
implausible_values = set()
for col in df.columns:
    for value in df[col]:
        if is_implausible(value, col):
            implausible_values.add((col, value))

checkE = len(implausible_values)

print(f"Implausible values count: {checkE}")

Implausible values count: 0


### F - Total problem features

Count of features impacted by 1 or more of A-E. Since features may contain more than one problem this need not be the sum of A to E.

Expected output: 9

Results: 11 - Because of our check D, we have 11 features with conflicting values in the dataset instead of the expected 9.

In [17]:
# Because check A, B, C, and E are 0 therefor the total count of issues is check D
if (checkA + checkB + checkC + checkE) == 0:
    print(f"Total count of issues: {checkD}")
else:
    print(f"This code needds to be updated to reflect the correct count of issues")

Total count of issues: 11


### G - Identical Cases

Refers to a situation where two or more cases contain identical values for all features including class label.

Expected output: 2628

Results: 1973 - There are 1973 identical cases in the dataset where the class label is also identical.

In [18]:
checkG = len(df[df.duplicated()])   
print(f"Identical cases in the dataset: {checkG}")

Identical cases in the dataset: 1973


### H - Inconsistent cases

As per G but the class labels differ, all other data item values are identical

There are two identical modules M1 and M2 where M1 is labelled as fault free and M2 is labelled as faulty.

Expected output: 889

Results: 2061 - There are 2061 inconsistent cases in the dataset where the class label is different.

In [19]:
temp = df.copy()
temp = temp.drop(columns=[target])
checkH = len(temp[temp.duplicated()])
print(f"Identical feature values in the dataset: {checkH}")

Identical feature values in the dataset: 2061


### I - Cases with missing values

Counts the number of cases that contain one or more missing observations

Expected output: 0

Results: 0 - There are no cases with missing values in the dataset.

In [20]:
# Get all the rows that have a missing value
missing_rows = df[df.isnull().any(axis=1)]
print(f"Rows with missing values: {len(missing_rows)}")

Rows with missing values: 0


### J - Cases with conflicting feature values

Counts cases that contain features (2 or more by definition) that violate some referential integrity constraint. Count each case irrespective of the number of features implicated

As per Column D

Expected output: 1287

Results: 1262 - Like earlier with the features, the checks used in the document are not the same as the ones I have used. Instead of using the "==" operator to check for conflicting values, I have used the np.isclose() function to check for conflicting values. The tolerences were tweaked to get as close to the expected output as possible to give a close enough result of what the researchers might have used.

In [21]:
conflicting_cases = set()
for filter in conflicting_value_filters:
    filter_df = df[~filter["filter"]]
    conflicting_cases.update(filter_df.index)

checkI = len(conflicting_cases)

print(f"Conflicting cases count: {checkI}")

Conflicting cases count: 1262


### K - Cases with implausible values

Counts cases that violate some integrity constraint. Count each case irrespective of the number of features implicated.

As per Column E

Expected output: 0

Results: 0 - There are no cases with implausible values in the dataset.

In [22]:
checkK = 0

for index, row in df.iterrows():
    for col in df.columns:
        if is_implausible(row[col], col):
            checkK += 1
            print(f"Row {index} has an implausible value {row[col]} for feature {col}")

print(f"Implausible case count: {checkK}")

Implausible case count: 0


## Preprocessing

In [28]:
def preprocess(ds, flag=False):
    # Pull out the target feature
    target_feature = ds.iloc[:, -1]
    # Drop the target feature from the dataset
    ds = ds.drop(columns=[target])

    data = None
    M = ds.shape[0]
    N = ds.shape[1] 
    print(f"Dataset shape: {M} x {N}")

    # step 1: remove cases with implausible values
    implausible_cases = set()
    for index, row in ds.iterrows():
        for col in ds.columns:
            if is_implausible(row[col], col):
                implausible_cases.add(index)
    data = ds.drop(index=implausible_cases)
    
    # step 2: remove cases with conflict feature values
    data = ds.drop(index=conflicting_cases)

    if flag:
        # step 3: remove identical cases
        data.drop_duplicates(inplace=True)

        # step 4: remove inconsistent cases
        data_without_target = data.copy()
        duplicate_indices = data_without_target.duplicated()
        data = data[~duplicate_indices]
                
    # step 5: remove cases with missing values
    data = data.dropna()
                
    # step 6: remove constant features
    data = data.loc[:, data.apply(pd.Series.nunique) != 1]
                
    # step 7: remove identical features
    data = data.T.drop_duplicates().T

    print(f"Post processed dataset shape: {data.shape[0]} x {data.shape[1]}")
    print(f"Removed {M - data.shape[0]} rows and {N - data.shape[1]} columns")

    return data

preprocess(df, True)

Dataset shape: 10878 x 21
Post processed dataset shape: 7650 x 21
Removed 3228 rows and 0 columns


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL
1,0.0,211.0,0.0,0.0,128.0,104.0,14.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,1129.0
3,37.0,29.0,8.0,42.0,19.0,19.0,6.0,133.0,108.14,46.32,232043.52,1.67,685.0,0.02,12891.31,5009.32,295.0,390.0,121.0,38.0,222.0
9,0.0,83.0,0.0,0.0,42.0,16.0,27.0,0.0,0.00,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.0,0.0,0.0,186.0
10,18.0,47.0,0.0,10.0,24.0,13.0,1.0,75.0,87.74,30.06,79302.28,0.88,438.0,0.03,4405.68,2637.80,157.0,281.0,47.0,18.0,107.0
12,143.0,67.0,7.0,49.0,34.0,25.0,1.0,589.0,569.78,49.30,1385089.67,9.36,3281.0,0.02,76949.43,28092.72,1522.0,1759.0,355.0,23.0,790.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10873,2.0,7.0,0.0,0.0,4.0,4.0,1.0,13.0,32.93,7.33,1770.86,0.08,52.0,0.14,98.38,241.48,22.0,30.0,15.0,10.0,18.0
10874,2.0,3.0,0.0,0.0,2.0,2.0,1.0,5.0,15.72,8.25,1069.68,0.04,30.0,0.12,59.43,129.66,11.0,19.0,8.0,12.0,9.0
10875,10.0,7.0,0.0,1.0,4.0,2.0,1.0,29.0,19.68,26.40,13716.72,0.17,103.0,0.04,762.04,519.57,44.0,59.0,15.0,18.0,42.0
10876,2.0,1.0,0.0,0.0,1.0,1.0,1.0,6.0,17.44,8.44,1241.57,0.05,36.0,0.12,68.98,147.15,15.0,21.0,8.0,9.0,10.0


### L - Total of data quality problem cases

Count of cases impacted by one or more of I to K that we denote DS0. Since cases may contain more than one problem this need not be the sum of I to K.

In [24]:
QUALITY_L = 0

### M - Total problem cases according to [6]

Count of cases impacted by one or more of G to K denoted DS

In [25]:
QUALITY_M = 0

## Save

### Save the cleaned dataset

In [26]:
df.to_csv(ROOT+"data/"+DATASET, index=False)