# NASA - 01 - Clean #

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)

sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint

DEBUG = True

In [26]:
import sys, os, yaml

DATASET = "NASA"

COLAB = 'google.colab' in sys.modules
if COLAB:
    ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
else:
    ROOT = "./"

DEBUG = False
SEED = 1612

In [27]:
if COLAB:
    from google.colab import drive
    if not os.path.isdir("/content/gdrive"):
        drive.mount("/content/gdrive")
        d = "/content/gdrive/MyDrive/datasets"
        if not os.path.isdir(d): os.makedirs(d)
        if not os.path.isdir(ROOT): os.makedirs(ROOT)

def makedirs(d):
    if COLAB:
        if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
    else:
        if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

In [28]:
filename = f"{ROOT}/orig/data.csv"
if os.path.isfile(filename):
    print("Using local copy...")
else:
    print("Downloading...")
    df = pd.read_csv("https://setu-datamining2.github.io/live/topics/21-Assignments/01-NASA_Software_Defect_Datasets/files/pc2.csv")
    df.to_csv(filename, index=False)

df = pd.read_csv(filename);
print(df.shape)
df.head()

Using local copy...
(5589, 37)


Unnamed: 0,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,defects
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0,5.33,1.5,12.0,0.0,4.0,0.67,0.67,8.0,1.0,0.0,0.0,2.0,0.5,1.0,3.0,1.0,3.0,2.0,0.0,0.0,False
1,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,False
2,1.0,4.0,7.0,24.0,0.0,1.0,0.13,0.0,0.0,1.0,1.0,6.0,1.0,0.0,1.0,0.0,17.88,7.43,986.77,0.04,34.0,0.13,54.82,132.83,1.0,0.0,0.0,7.0,0.03,13.0,21.0,7.0,8.0,34.0,96.88,8.0,False
3,1.0,1.0,11.0,3.0,0.0,1.0,0.08,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,0.0,42.62,7.81,2598.31,0.11,77.0,0.13,144.35,332.79,1.0,0.0,0.0,3.0,0.06,29.0,48.0,13.0,7.0,17.0,93.33,12.0,False
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0,1.0,3.0,33.44,0.63,13.06,0.01,9.0,1.6,0.73,20.9,1.0,0.0,0.0,3.0,0.33,5.0,4.0,4.0,1.0,3.0,0.0,1.0,False


## Table 1 &mdash; Summary

### Bare bones approach - score max 30%

* no interpretation
* no comparison with paper

In [29]:
df.shape

(5589, 37)

### Attempt 2 - score max 40%-50%
* interpretation
* no comparison with paper

In [30]:
print("Number of cases: ", df.shape[0])
print("Number of features: ", df.shape[1])

Number of cases:  5589
Number of features:  37


### Attempt 3a - score max 100%
* interpretation
* comparison with paper

Also
* use fancier print statements

In [31]:
for message, expected, observed in [
    ("Number of cases", 5589, df.shape[0]),
    ("Number of features", 37, df.shape[1])
]:
    match = expected==observed
    print(f" * {message} {expected=} {observed=} {match=}")

 * Number of cases expected=5589 observed=5589 match=True
 * Number of features expected=37 observed=37 match=True


### Attempt 3b - score max 100%
* interpretation
* comparison with paper

Also
* use markdown

In [32]:
messages = []
for message, expected, observed in [
    ("Number of cases", 5589, df.shape[0]),
    ("Number of features", 37, df.shape[1])
]:
    match = "" if expected==observed else "FAIL"
    messages.append(f" * {message} {expected=} {observed=} {match}")

display(Markdown("\n".join(messages)))

 * Number of cases expected=5589 observed=5589 
 * Number of features expected=37 observed=37 

### Attempt 3c - score max 100%
* interpretation
* comparison with paper

Also
* use dataframe

In [33]:
data = []
for message, expected, observed in [
    ("Number of cases", 5589, df.shape[0]),
    ("Number of features", 37, df.shape[1])
]:
    data.append([message, expected, observed])

df_result = pd.DataFrame(data, columns=["Message", "Expected", "Observed"])
df_result.head()
# For bonus points add a style to highlight rows that do not match

Unnamed: 0,Message,Expected,Observed
0,Number of cases,5589,5589
1,Number of features,37,37


## Table 3 &mdash; by Features

### A &mdash; Identical Features

In [48]:
table_3_A = lambda: df.transpose().duplicated().sum()

### B &mdash; Constant Features

In [49]:
def table_3_B(df):
    pass
table_3_B_observed = table_3_B(df)

### C &mdash; Missing Values

In [50]:
def table_3_C(df):
    pass
table_3_C_observed = table_3_C(df)

In [52]:
data = []
for message, expected, observed in [
    ("Identical Features", 0, table_3_A()),
    ("Constant Features", 0, table_3_B_observed),
    ("Features with Missing Values", 0, table_3_C_observed),
]:
    data.append([message, expected, observed])

df_result = pd.DataFrame(data, columns=["Message", "Expected", "Observed"])
display(Markdown("**Table 3 (by cases) Comparison**"))
df_result.head()
# For bonus points add a style to highlight rows that do not match

**Table 3 (by cases) Comparison**

Unnamed: 0,Message,Expected,Observed
0,Identical Features,0,0.0
1,Constant Features,0,
2,Features with Missing Values,0,


<bound method DataFrame.value_counts of       BRANCH_COUNT  CALL_PAIRS  LOC_CODE_AND_COMMENT  LOC_COMMENTS  CONDITION_COUNT  CYCLOMATIC_COMPLEXITY  CYCLOMATIC_DENSITY  DECISION_COUNT  DECISION_DENSITY  DESIGN_COMPLEXITY  DESIGN_DENSITY  EDGE_COUNT  ESSENTIAL_COMPLEXITY  ESSENTIAL_DENSITY  LOC_EXECUTABLE  PARAMETER_COUNT  HALSTEAD_CONTENT  HALSTEAD_DIFFICULTY  HALSTEAD_EFFORT  HALSTEAD_ERROR_EST  HALSTEAD_LENGTH  HALSTEAD_LEVEL  HALSTEAD_PROG_TIME  HALSTEAD_VOLUME  MAINTENANCE_SEVERITY  MODIFIED_CONDITION_COUNT  MULTIPLE_CONDITION_COUNT  NODE_COUNT  NORMALIZED_CYLOMATIC_COMPLEXITY  NUM_OPERANDS  NUM_OPERATORS  NUM_UNIQUE_OPERANDS  NUM_UNIQUE_OPERATORS  NUMBER_OF_LINES  PERCENT_COMMENTS  LOC_TOTAL  defects
0              1.0         0.0                   0.0           0.0              0.0                    1.0                1.00             0.0              0.00                1.0            1.00         1.0                   1.0                0.0             0.0              2.0     