# NASA Defects - 01 - Import & Clean

## Setup

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

# Display tuning: lift the column cap and widen the console width used when printing frames.
pd.options.display.max_columns = None
pd.options.display.width = 1000
# pd.options.display.max_rows = None  # uncomment to lift the row cap as well

# Seaborn theme applied to every figure drawn after this point.
sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint

# Notebook-wide switches (not referenced in the cells visible here —
# presumably consumed further down; verify before removing).
DEBUG = True
SEED = 666

In [3]:
# File name of the dataset; the raw CSV is later read from ROOT + "orig/" + DATASET.
DATASET = "jm1.csv"

import os
import sys

# True when the notebook is executed inside Google Colab.
COLAB = 'google.colab' in sys.modules
# Base directory (always ends with "/") that every data path in this notebook is built from.
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive only once per runtime.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  # On Colab all artefacts live in a per-dataset folder on the mounted drive.
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  # A single recursive, idempotent call creates the "datasets" parent as well.
  # The previous code tried to create the parent via "./" + an absolute path,
  # which produced a stray relative tree under the current working directory.
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create the directory ``ROOT + d`` (and any missing parents) if needed.

  Args:
    d: Sub-directory path; it is appended verbatim to the module-level
      ``ROOT`` prefix.

  Raises:
    FileExistsError: if the target path exists but is not a directory.
  """
  # The former Colab / non-Colab branches performed the same creation
  # (0o777 is already the default mode of os.makedirs), so one idempotent
  # call covers both and removes the check-then-create race.
  os.makedirs(ROOT+d, exist_ok=True)

# Ensure the three working sub-directories exist before anything is read or written.
for d in ('orig', 'data', 'output'):
  makedirs(d)

## Import

In [4]:
# Load the raw CSV from the "orig" folder under ROOT, then show its shape and first rows.
df = pd.read_csv(f"{ROOT}orig/{DATASET}")
print(df.shape)
df.head()

(10878, 22)


Unnamed: 0,LOC_BLANK,BRANCH_COUNT,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CYCLOMATIC_COMPLEXITY,DESIGN_COMPLEXITY,ESSENTIAL_COMPLEXITY,LOC_EXECUTABLE,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,LOC_TOTAL,label
0,447.0,826.0,12.0,157.0,470.0,385.0,113.0,2824.0,210.28,384.45,31079782.27,26.95,8441.0,0.0,1726654.57,80843.08,3021.0,5420.0,609.0,155.0,3442.0,1
1,0.0,211.0,0.0,0.0,128.0,104.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1129.0,1
2,164.0,485.0,10.0,58.0,268.0,219.0,39.0,1588.0,202.98,213.53,9254819.86,14.45,4828.0,0.0,514156.64,43342.31,1730.0,3172.0,407.0,102.0,1824.0,1
3,37.0,29.0,8.0,42.0,19.0,19.0,6.0,133.0,108.14,46.32,232043.52,1.67,685.0,0.02,12891.31,5009.32,295.0,390.0,121.0,38.0,222.0,1
4,11.0,405.0,0.0,17.0,404.0,2.0,1.0,814.0,101.2,206.01,4294926.45,6.95,2033.0,0.0,238607.05,20848.47,813.0,1220.0,811.0,411.0,844.0,1


Here we can see that there are no null values in the dataset.

In [5]:
# Missing-value count per column.
df.isna().sum()

LOC_BLANK                0
BRANCH_COUNT             0
LOC_CODE_AND_COMMENT     0
LOC_COMMENTS             0
CYCLOMATIC_COMPLEXITY    0
DESIGN_COMPLEXITY        0
ESSENTIAL_COMPLEXITY     0
LOC_EXECUTABLE           0
HALSTEAD_CONTENT         0
HALSTEAD_DIFFICULTY      0
HALSTEAD_EFFORT          0
HALSTEAD_ERROR_EST       0
HALSTEAD_LENGTH          0
HALSTEAD_LEVEL           0
HALSTEAD_PROG_TIME       0
HALSTEAD_VOLUME          0
NUM_OPERANDS             0
NUM_OPERATORS            0
NUM_UNIQUE_OPERANDS      0
NUM_UNIQUE_OPERATORS     0
LOC_TOTAL                0
label                    0
dtype: int64

## Check Cases & Features

In [18]:
# Name of the class-label column; every other column is treated as a feature.
target = "label"
features = df.columns.tolist()
features.remove(target)

# One print call, same three report lines.
print(f"Target: '{target}'\nCases: {len(df)}\nFeatures: {len(features)}")

Target: 'label'
Cases: 10878
Features: 21


## Check data quality

### A - Identical features

Refers to a situation where two or more features contain identical values for all cases.

Example: $F_1 = F_2 = F_3 \land F_4 = F_5 \implies$ 3 features are identical, so could be deleted.

Expected output:
 
Results: 

In [None]:
# TODO: placeholder — the identical-features count (check A) is not computed here; 0 is a hard-coded stub.
QUALITY_A = 0

### B - Constant features

Refers to features that contain the same value for every instance, i.e. they add no information.

In [None]:
# TODO: placeholder — the constant-features count (check B) is not computed here; 0 is a hard-coded stub.
QUALITY_B = 0

### C - Features with missing values

Counts the number of features that contain one or more missing observations

Example: $F_1$ has 10 missing values $\land$ $F_3$ has 3 missing values $\implies$ 2 features contain missing values.

In [None]:
# Number of features with at least one missing value. Computed from the data
# rather than hard-coded, so the figure stays correct if the dataset changes.
QUALITY_C = int(df[features].isnull().any().sum())

### D - Features with conflicting values

Counts features that violate some referential integrity constraint

Example: $F_1$ should equal $F_2 + F_3$ but does not. We cannot say which feature is in error, therefore $\implies$ 3 problematic features.

In [None]:
# TODO: placeholder — the conflicting-features count (check D) is not computed here; 0 is a hard-coded stub.
QUALITY_D = 0

### E - Features with implausible values

Counts features that violate some integrity constraint

Example: $F_1$ should be non-negative but contains 1 or more instances $< 0 \implies$ 1 problematic feature.

In [None]:
# TODO: placeholder — the implausible-features count (check E) is not computed here; 0 is a hard-coded stub.
QUALITY_E = 0

### F - Total problem features

Count of features impacted by 1 or more of A-E. Since features may contain more than one problem, this need not be the sum of A to E.

In [None]:
# TODO: placeholder — the total problem-features count (check F) is not computed here; 0 is a hard-coded stub.
QUALITY_F = 0

### G - Identical Cases

Refers to a situation where two or more cases contain identical values for all features including class label.

In [None]:
# TODO: placeholder — the identical-cases count (check G) is not computed here; 0 is a hard-coded stub.
QUALITY_G = 0

### H - Inconsistent cases

As per G, but the class labels differ while all other data item values are identical.

There are two identical modules M1 and M2 where M1 is labelled as fault free and M2 is labelled as faulty.

In [None]:
# TODO: placeholder — the inconsistent-cases count (check H) is not computed here; 0 is a hard-coded stub.
QUALITY_H = 0

### I - Cases with missing values

Counts the number of cases that contain one or more missing observations

In [None]:
# Number of cases (rows) with at least one missing value in any column.
# Computed from the data rather than hard-coded, so it cannot go stale.
QUALITY_I = int(df.isnull().any(axis=1).sum())

### J - Cases with conflicting feature values

Counts cases that contain features (2 or more by definition) that violate some referential integrity constraint. Count each case irrespective of the number of features implicated

As per Column D

In [None]:
# TODO: placeholder — the count of cases with conflicting feature values (check J) is not computed here; 0 is a hard-coded stub.
QUALITY_J = 0

### K - Cases with implausible values

Counts cases that violate some integrity constraint. Count each case irrespective of the number of features implicated.

As per Column E

In [None]:
# TODO: placeholder — the count of cases with implausible values (check K) is not computed here; 0 is a hard-coded stub.
QUALITY_K = 0

### L - Total of data quality problem cases

Count of cases impacted by one or more of I to K, which we denote $DS'$. Since cases may contain more than one problem, this need not be the sum of I to K.

In [None]:
# TODO: placeholder — the total of data-quality problem cases (check L) is not computed here; 0 is a hard-coded stub.
QUALITY_L = 0

### M - Total problem cases according to [6]

Count of cases impacted by one or more of G to K, denoted $DS''$.

In [None]:
# TODO: placeholder — the total problem-cases count (check M) is not computed here; 0 is a hard-coded stub.
QUALITY_M = 0