# NASA Defects - 01 - Import & Clean

## Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint 

DEBUG = True
SEED = 666

In [2]:
DATASET = "jm1.csv"

import os, sys
COLAB = 'google.colab' in sys.modules
ROOT = "./"

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)


def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Import

In [3]:
df = pd.read_csv(ROOT+"orig/"+DATASET)
print(df.shape)
df.head(5)

(10878, 22)


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
0,447.0,826.0,12.0,157.0,470.0,385.0,113.0,2824.0,210.28,384.45,31079782.27,26.95,8441.0,0.0,1726654.57,80843.08,3021.0,5420.0,609.0,155.0,3442.0,1
1,0.0,211.0,0.0,0.0,128.0,104.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1129.0,1
2,164.0,485.0,10.0,58.0,268.0,219.0,39.0,1588.0,202.98,213.53,9254819.86,14.45,4828.0,0.0,514156.64,43342.31,1730.0,3172.0,407.0,102.0,1824.0,1
3,37.0,29.0,8.0,42.0,19.0,19.0,6.0,133.0,108.14,46.32,232043.52,1.67,685.0,0.02,12891.31,5009.32,295.0,390.0,121.0,38.0,222.0,1
4,11.0,405.0,0.0,17.0,404.0,2.0,1.0,814.0,101.2,206.01,4294926.45,6.95,2033.0,0.0,238607.05,20848.47,813.0,1220.0,811.0,411.0,844.0,1


## Check Values

### Null Value Checks

Here we can see that there are no apparent null values in the dataset.

In [4]:
df.isnull().sum()

LOC_BLANK                0
BRANCH_COUNT             0
LOC_CODE_AND_COMMENT     0
LOC_COMMENTS             0
CYCLOMATIC_COMPLEXITY    0
DESIGN_COMPLEXITY        0
ESSENTIAL_COMPLEXITY     0
LOC_EXECUTABLE           0
HALSTEAD_CONTENT         0
HALSTEAD_DIFFICULTY      0
HALSTEAD_EFFORT          0
HALSTEAD_ERROR_EST       0
HALSTEAD_LENGTH          0
HALSTEAD_LEVEL           0
HALSTEAD_PROG_TIME       0
HALSTEAD_VOLUME          0
NUM_OPERANDS             0
NUM_OPERATORS            0
NUM_UNIQUE_OPERANDS      0
NUM_UNIQUE_OPERATORS     0
LOC_TOTAL                0
label                    0
dtype: int64

### Check Cases & Features

In [5]:
target = "label"
features = list(df.columns)
features.remove(target)
counts_features = [col for col in df.columns if "COUNT" in col or "LOC" in col or "NUM" in col]

print(f"Target: '{target}'")
print(f"Cases: {len(df)}")
print(f"Features: {len(features)}")

Target: 'label'
Cases: 10878
Features: 21


### Implausible Value Checks

Note:
1. `LOC_TOTAL` is never = 0
2. values are always positive ie >=0
3. count values are always integers

In [6]:
# Check that `LOC_TOTAL` is always positive and non-zero
check_1 = df[df["LOC_TOTAL"] <= 0]
print(f"Check 1 is {check_1.shape[0] == 0}")

Check 1 is True


In [7]:
# Check that all columns are positive or zero
check_2 = True
for col in df.columns:
    if df[col].dtype in [np.float64, np.int64]:
        filter = df[col] < 0
        check_2 = check_2 and (len(df[filter]) == 0)
print(f"Check 2 is {check_2}")

Check 2 is True


In [8]:
# Check that all count features are integers
check_3 = True
for col in counts_features:
    filter = (df[col] % 1 != 0)
    check_3 = check_3 and (len(df[filter]) == 0)
print(f"Check 3 is {check_3}")

Check 3 is True


In [9]:
# Check if a value is implausible for a given feature
def is_implausible(case_value, feature_name):
    if feature_name == "LOC_TOTAL":
        return case_value <= 0
    if feature_name in counts_features:
        return case_value % 1 != 0 or case_value < 0
    return case_value < 0

## Check data quality

### A - Identical features

Refers to a situation where two or more features contain identical values for all cases.

F1=F2=F3 ∧ F4=F5 =⇒ 3 features are identical so could be deleted.

Expected output: 0
 
Results: 

In [10]:
identical_column_count = 0
cols = df.columns
for i in range(len(cols)-2): # Go through colums 0 to n-2
    for j in range(i+1, len(cols)-1): # Go through columns 1 to n-1
        for k in range(j+1, len(cols)): # Go through columns 2 to n
            if df[cols[i]].equals(df[cols[j]]) and df[cols[j]].equals(df[cols[k]]):
                print(f"Columns {cols[i]}, {cols[j]} and {cols[k]} are identical")
                identical_column_count += 1

print(f"Identical columns: {identical_column_count}")

Identical columns: 0


### B - Constant features

Refers to features that contain the same value for every instance, i.e. add no information

In [11]:
count = 0
for col in df.columns:
    if len(df[col].unique()) == 1:
        print(f"Column {col} has the same value for all rows")
        count += 1

print(f"Columns with the same value for all rows: {count}")

Columns with the same value for all rows: 0


### C - Features with missing values

Counts the number of features that contain one or more missing observations

F1 has 10 missing values ∧ F3 has 3 missing values =⇒ 2 features contain missing values.

In [12]:
total_missing = df.isnull().sum().sum()
for col in df.columns:
    missing_vals = df[col].isnull().sum()
    if missing_vals > 0:
        print(f"Column {col} has {missing_vals} missing values")

print(f"Total missing values: {total_missing}")

Total missing values: 0


### D - Features with conflicting values

Counts features that violate some referential integrity constraint

F1 should equal F2+F3 but does not. We cannot say which feature is in error therefore =⇒ 3 problematic features.

According to the document the following features should be equal:

10. `HALSTEAD_LENGTH` = `NUM_OPERATORS` + `NUM_OPERANDS` 
    
13) `HALSTEAD_VOLUME` = (`NUM_OPERATORS` + `NUM_OPERANDS`) * log2(`NUM_UNIQUE_OPERATORS` + `NUM_UNIQUE_OPERANDS`)
14) `HALSTEAD_LEVEL` = (2 / `NUM_UNIQUE_OPERATORS`) * (`NUM_UNIQUE_OPERANDS` / `NUM_OPERANDS`)
15)  `HALSTEAD_DIFFICULTY` = (`NUM_UNIQUE_OPERATORS` / 2) * (`NUM_OPERANDS` / `NUM_UNIQUE_OPERANDS`)
16)  `HALSTEAD_CONTENT` = `HALSTEAD_VOLUME` / `HALSTEAD_DIFFICULTY`
17)  `HALSTEAD_EFFORT` = `HALSTEAD_VOLUME` * `HALSTEAD_DIFFICULTY`


In [13]:
integrity_check_filter10 = None
integrity_check_filter13 = None
integrity_check_filter14 = None
integrity_check_filter15 = None
integrity_check_filter16 = None
integrity_check_filter17 = None

def set_conflicting_value_checks(df, rounding=2, truncate=2 , round_or_truncate="round"):
    
    # `HALSTEAD_LENGTH` = `NUM_OPERATORS` + `NUM_OPERANDS` 
    integrity_check_filter10 = (df["HALSTEAD_LENGTH"] == df["NUM_OPERATORS"] + df["NUM_OPERANDS"])
    # `HALSTEAD_VOLUME` = (`NUM_OPERATORS` + `NUM_OPERANDS`) * log2(`NUM_UNIQUE_OPERATORS` + `NUM_UNIQUE_OPERANDS`)
    integrity_check_filter13 = (df["HALSTEAD_VOLUME"] == ((df["NUM_OPERATORS"] + df["NUM_OPERANDS"]) * np.log2(df["NUM_UNIQUE_OPERATORS"] + df["NUM_UNIQUE_OPERANDS"])).round(rounding))
    # `HALSTEAD_LEVEL` = (2 / `NUM_UNIQUE_OPERATORS`) * (`NUM_UNIQUE_OPERANDS` / `NUM_OPERANDS`)
    integrity_check_filter14 = (df["HALSTEAD_LEVEL"] == ((2 / df["NUM_UNIQUE_OPERATORS"]) * (df["NUM_UNIQUE_OPERANDS"] / df["NUM_OPERANDS"])).round(rounding))
    # `HALSTEAD_DIFFICULTY` = (`NUM_UNIQUE_OPERATORS` / 2) * (`NUM_OPERANDS` / `NUM_UNIQUE_OPERANDS`)
    integrity_check_filter15 = (df["HALSTEAD_DIFFICULTY"] == ((df["NUM_UNIQUE_OPERATORS"] / 2) * (df["NUM_OPERANDS"] / df["NUM_UNIQUE_OPERANDS"])).round(rounding))
    # `HALSTEAD_CONTENT` = `HALSTEAD_VOLUME` / `HALSTEAD_DIFFICULTY`
    integrity_check_filter16 = (df["HALSTEAD_CONTENT"] == (df["HALSTEAD_VOLUME"] / df["HALSTEAD_DIFFICULTY"]).round(rounding))
    # `HALSTEAD_EFFORT` = `HALSTEAD_VOLUME` * `HALSTEAD_DIFFICULTY`
    integrity_check_filter17 = (df["HALSTEAD_EFFORT"] == (df["HALSTEAD_VOLUME"] * df["HALSTEAD_DIFFICULTY"]).round(rounding))


TypeError: bad operand type for unary ~: 'NoneType'

### E - Features with implausible values

Counts features that violate some integrity constraint

F1 should be non-negative but contains 1 or more instances < 0 =⇒ 1 problematic feature

In [None]:
def filter_quality_E(df):
    implausible_values = {}
    for col in df.columns:
        implausible_values[col] = df[df[col].apply(lambda x: is_implausible(x, col))]
    
    total_implausible = sum([len(implausible_values[col]) for col in implausible_values])
    print(f"Total implausible values: {total_implausible}")

    return df

filter_quality_E(df.copy())
print() # to stop the output from the function

Total implausible values: 0



### F - Total problem features

Count of features impacted by 1 or more of A-E. Since features may contain more than one problem this need not be the sum of A to E .

In [None]:
def filter_quality_F(df):
    print("Removing rows with missing values")

### G - Identical Cases

Refers to a situation where two or more cases contain identical values for all features including class label.

In [None]:
QUALITY_G = 0

### H - Inconsistent cases

As per G but the class labels differ, all other data item values are identical

There are two identical modules M1 and M2 where M1 is labelled as fault free and M2 is labelled as faulty.

In [None]:
QUALITY_H = 0

### I - Cases with missing values

Counts the number of cases that contain one or more missing observations

In [None]:
QUALITY_I = 0

### J - Cases with conflicting feature values

Counts cases that contain features (2 or more by definition) that violate some referential integrity constraint. Count each case irrespective of the number of features implicated

As per Column D

In [None]:
def filter_quality_J(df):
    indeces = set()

    filters = [
        integrity_check_filter10,
        integrity_check_filter13,
        integrity_check_filter14,
        integrity_check_filter15,
        integrity_check_filter16,
        integrity_check_filter17,
    ]

    for row in df.iterrows():
        row_index, row_data = row
        for filter in filters:
            if not filter[row_index]:
                indeces.add(row_index)

    print(f"Removing {len(indeces)} rows")
    return df.drop(indeces)

filter_quality_J(df.copy())

Removing 10703 rows


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
224,6.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,13.5,2.0,54.0,0.01,9.0,0.5,3.00,27.0,4.0,5.0,4.0,4.0,10.0,1
314,6.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,13.5,2.0,54.0,0.01,9.0,0.5,3.00,27.0,4.0,5.0,4.0,4.0,11.0,1
544,3.0,1.0,0.0,6.0,1.0,1.0,1.0,6.0,49.6,2.5,310.0,0.04,31.0,0.4,17.22,124.0,15.0,16.0,12.0,4.0,17.0,1
554,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,15.0,2.0,60.0,0.01,10.0,0.5,3.33,30.0,4.0,6.0,4.0,4.0,4.0,1
555,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,15.0,2.0,60.0,0.01,10.0,0.5,3.33,30.0,4.0,6.0,4.0,4.0,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10808,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,13.5,2.0,54.0,0.01,9.0,0.5,3.00,27.0,4.0,5.0,4.0,4.0,4.0,0
10810,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,13.5,2.0,54.0,0.01,9.0,0.5,3.00,27.0,4.0,5.0,4.0,4.0,4.0,0
10812,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,13.5,2.0,54.0,0.01,9.0,0.5,3.00,27.0,4.0,5.0,4.0,4.0,4.0,0
10813,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0,13.5,2.0,54.0,0.01,9.0,0.5,3.00,27.0,4.0,5.0,4.0,4.0,4.0,0


### K - Cases with implausible values

Counts cases that violate some integrity constraint. Count each case irrespective of the number of features implicated.

As per Column E

In [None]:
QUALITY_K = 0

## Preprocessing

In [None]:
def preprocess(ds):
    # Pull out the target feature
    target_feature = ds.iloc[:, -1]
    # Drop the target feature from the dataset
    ds = ds.iloc[:, :-1]

    data = None
    M = ds.shape[0]
    N = ds.shape[1] 
    print(f"Dataset shape: {M} x {N}")

    # step 1: remove cases with implausible values
    # Not Required as the dataset has no implausible values as per the checks above
    
    # step 2: remove cases with conflict feature values
    

    # step 3: remove identical cases
    

    # step 4: remove inconsistent cases
                
    # step 5: remove cases with missing values
                
    # step 6: remove constant features
                
    # step 7: remove identical features
    

preprocess(df)

Dataset shape: 10878 x 21


### L - Total of data quality problem cases

Count of cases impacted by one or more of I to K that we denote DS0. Since cases may contain more than one problem this need not be the sum of I to K.

In [None]:
QUALITY_L = 0

### M - Total problem cases according to [6]

Count of cases impacted by one or more of G to K denoted DS

In [None]:
QUALITY_M = 0