In [None]:
import numbers
import warnings

import numpy as np
import pandas as pd

# Exercise 2

In [None]:
# Unit test for normalize and standardize.
# Load the Iris CSV and keep its first four columns as a float matrix; this is
# the fixture handed to testNormalizationAndStandardization.
# (`warnings` is imported in the notebook's import cell rather than here.)
dfIrisTest = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
AIris = dfIrisTest.iloc[:, 0:4].to_numpy(dtype=float)

def testNormalizationAndStandardization(A):
    n = normalize(A)
    if type(n) != np.ndarray or n.shape != A.shape:
        warnings.warn("Normalized data should be a matrix of the same shape as the input.")
        return
    print("min after normalization: " + ("OK" if all(np.isclose(np.min(n, axis=0), np.zeros(4))) else "FAIL"))
    print("max after normalization: " + ("OK" if all(np.isclose(np.max(n, axis=0), np.ones(4))) else "FAIL"))

    s = standardize(A)
    if type(s) != np.ndarray or s.shape != A.shape:
        warnings.warn("Normalized data should be a matrix of the same shape as the input.")
        return
    mean, covariances, correlations = getStats(s)
    print("mean after standardization: " + ("OK" if all(np.isclose(mean, np.zeros(4))) else "FAIL"))
    print("stds after standardization: " + ("OK" if all(np.isclose(np.diag(covariances), np.ones(4))) else "FAIL"))

# Run the normalization/standardization checks on the Iris feature matrix.
testNormalizationAndStandardization(AIris)

# Exercise 3.1

In [None]:
def check_column_conversion(column):
    """Check the output of ``binarizeCategoricalAttributeVector`` for one column.

    Parameters
    ----------
    column : array-like
        Categorical values that are passed to ``binarizeCategoricalAttributeVector``.

    Prints three verdicts:

    * Dimension check - the result has one row per entry of ``column`` and one
      column per distinct value in ``column``.
    * Occurring values - the result contains nothing but 0s and 1s. A result
      made up only of 1s (a column with a single category) is accepted.
    * Coherence - every row of the result sums to 1.

    Nothing is returned.
    """
    M = binarizeCategoricalAttributeVector(column)
    categories = np.unique(column)
    occurring = set(np.unique(M).tolist())
    print("-----------------\nBinarization check\n-----------------")
    print("Dimension check: " + ("OK" if M.shape == (len(column), len(categories)) else "FAIL"))
    # Subset test instead of equality with [0, 1]: a single-category column yields only 1s.
    print("Occurring values: " + ("OK" if occurring <= {0, 1} else "FAIL (there should only be 0s and 1s in the output.)"))
    # Row sums only exist for a 2-D result; anything else is reported as incoherent.
    rows_sum_to_one = M.ndim == 2 and bool(np.all(np.sum(M, axis=1) == 1))
    print("Coherence: " + ("OK" if rows_sum_to_one else "FAIL (all rows must sum up to 1)"))

def check_category_detection(df, expectedcols):
    """Compare the columns reported by ``getCategoricalAttributes(df)`` with ``expectedcols``.

    Prints "OK" when both collections contain the same names. Otherwise prints a
    FAIL line listing the expected names that were not reported and the reported
    names that were not expected. Nothing is returned.
    """
    print("-----------------\nCheck of category detection\n-----------------")
    detected = getCategoricalAttributes(df)
    undetected = [name for name in expectedcols if name not in detected]
    surplus = [name for name in detected if name not in expectedcols]
    if undetected or surplus:
        verdict = "FAIL (undetected columns: " + str(undetected) + ", wrongly detected columns: " + str(surplus) + ")"
    else:
        verdict = "OK"
    print("Categorical attribute detection: " + verdict)
    
def check_frame_conversion(df, num_expected_columns):
    """Check the matrix that ``readFrameAsMatrix(df)`` produces.

    Parameters
    ----------
    df : sized object (a data frame in the notebook)
        Input handed to ``readFrameAsMatrix``; ``len(df)`` is the expected row count.
    num_expected_columns : int
        Expected number of columns of the resulting matrix.

    Prints up to three verdicts:

    * Outer Type check - the result is exactly an ``np.ndarray``. When it is
      not, the remaining checks are skipped because they rely on ``dtype`` and
      ``shape``.
    * Inner Type check - the dtype is ``float``, ``np.float32`` or ``np.float64``.
    * Dimensionality check - the shape is ``(len(df), num_expected_columns)``.

    Nothing is returned.
    """
    print("-----------------\nConversion check for data frames\n-----------------")
    A = readFrameAsMatrix(df)
    if type(A) is not np.ndarray:
        print("Outer Type check: FAIL (not a numpy array but " + str(type(A)) + ")")
        return
    print("Outer Type check: OK")
    print("Inner Type check: " + ("OK" if A.dtype in [float, np.float32, np.float64] else "FAIL (dtype of matrix should be something numeric like float and not " + str(A.dtype) + ")"))
    # Compare the whole shape tuple so results that are not 2-D are reported instead of raising.
    shape_ok = A.shape == (len(df), num_expected_columns)
    observed_shape = " x ".join(str(dim) for dim in A.shape)
    print("Dimensionality check: " + ("OK" if shape_ok else "FAIL (expected shape " + str(len(df)) + " x " + str(num_expected_columns) + ", but observed shape " + observed_shape + ")"))


## unit test for conversion functions
# Read the credit data from the working directory, then run the column
# binarization check on the column at position 1, the category detection check
# against the expected column names, and the frame conversion check.
dfCreditTest = pd.read_csv("credits.csv")
check_column_conversion(dfCreditTest.to_numpy()[:, 1])
check_category_detection(
    dfCreditTest,
    [
        'workclass',
        'education',
        'marital-status',
        'occupation',
        'relationship',
        'race',
        'sex',
        'native-country',
        'class',
    ],
)
check_frame_conversion(dfCreditTest, num_expected_columns=110)

# Exercise 3.2

In [None]:
def test_discretization(column, thresholds, names, expected):
    conv = discretizeBasedOnThresholds(column, thresholds, names)
    print("Conversion test: " + ("OK" if len(conv) == len(expected) and all(conv == expected) else "FAIL (expected \"" + str(expected) +"\" but observed \"" + str(conv) + "\")"))
    
def test_equal_length_discretization(arr, k, expected):
    act = discretizeEqualLength(arr, k)
    print ("Equal Length Discretization: " + ("OK" if all(act == expected) else "FAIL"))
    
def test_equal_count_discretization(arr, k, expected):
    act = discretizeEqualFrequency(arr, k)
    print ("Equal Count Discretization: " + ("OK" if all(act == expected) else "FAIL"))

# reproduce results from the lecture
# Both threshold checks read their column from dfIrisTest, the frame loaded on
# the next line, so this cell runs on a fresh kernel without relying on a
# separately defined `dfIris` variable.
dfIrisTest = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
test_discretization(dfIrisTest.values[:,0], [5.2, 6.1, 7], ["very short", "short", "long", "very long"], ["very short", "very short", "very short", "very short", "very short", "short", "very short", "very short", "very short", "very short", "short", "very short", "very short", "very short", "short", "short", "short", "very short", "short", "very short", "short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "short", "very short", "short", "very short", "very short", "short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "very short", "short", "very short", "long", "long", "long", "short", "long", "short", "long", "very short", "long", "very short", "very short", "short", "short", "short", "short", "long", "short", "short", "long", "short", "short", "short", "long", "short", "long", "long", "long", "long", "short", "short", "short", "short", "short", "short", "short", "short", "long", "long", "short", "short", "short", "short", "short", "very short", "short", "short", "short", "long", "very short", "short", "long", "short", "very long", "long", "long", "very long", "very short", "very long", "long", "very long", "long", "long", "long", "short", "short", "long", "long", "very long", "very long", "short", "long", "short", "very long", "long", "long", "very long", "long", "short", "long", "very long", "very long", "very long", "long", "long", "short", "very long", "long", "long", "short", "long", "long", "long", "short", "long", "long", "long", "long", "long", "long", "short"])
test_discretization(dfIrisTest.values[:,1], [2.8, 3.6], ["short", "medium", "long"], ["medium", "medium", "medium", "medium", "medium", "long", "medium", "medium", "medium", "medium", "long", "medium", "medium", "medium", "long", "long", "long", "medium", "long", "long", "medium", "long", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "long", "long", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "short", "medium", "medium", "long", "medium", "long", "medium", "long", "medium", "medium", "medium", "medium", "short", "short", "short", "medium", "short", "medium", "short", "short", "medium", "short", "medium", "medium", "medium", "medium", "short", "short", "short", "medium", "short", "short", "short", "medium", "medium", "short", "medium", "medium", "short", "short", "short", "short", "short", "medium", "medium", "medium", "short", "medium", "short", "short", "medium", "short", "short", "short", "medium", "medium", "medium", "short", "short", "medium", "short", "medium", "medium", "medium", "medium", "short", "medium", "short", "medium", "medium", "short", "medium", "short", "short", "medium", "medium", "long", "short", "short", "medium", "short", "short", "short", "medium", "medium", "short", "medium", "short", "medium", "short", "long", "short", "short", "short", "medium", "medium", "medium", "medium", "medium", "medium", "medium", "short", "medium", "medium", "medium", "short", "medium", "medium", "medium"])
# Equal-length check: column 0 of the Iris frame with k=4, compared element by
# element with the expected bin labels "c0".."c3".
test_equal_length_discretization(dfIrisTest.values[:,0], 4, np.array(["c0", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c1", "c1", "c1", "c0", "c1", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c0", "c1", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c0", "c2", "c2", "c2", "c1", "c2", "c1", "c2", "c0", "c2", "c0", "c0", "c1", "c1", "c1", "c1", "c2", "c1", "c1", "c2", "c1", "c1", "c1", "c2", "c1", "c2", "c2", "c2", "c2", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c1", "c2", "c2", "c1", "c1", "c1", "c1", "c1", "c0", "c1", "c1", "c1", "c2", "c0", "c1", "c2", "c1", "c3", "c2", "c2", "c3", "c0", "c3", "c2", "c3", "c2", "c2", "c2", "c1", "c1", "c2", "c2", "c3", "c3", "c1", "c2", "c1", "c3", "c2", "c2", "c3", "c2", "c1", "c2", "c3", "c3", "c3", "c2", "c2", "c1", "c3", "c2", "c2", "c1", "c2", "c2", "c2", "c1", "c2", "c2", "c2", "c2", "c2", "c2", "c1"]))
# Equal-count check: same column and k, compared element by element with the
# expected bin labels "c0".."c3".
test_equal_count_discretization(dfIrisTest.values[:,0], 4, np.array(["c0", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c1", "c0", "c0", "c0", "c1", "c1", "c1", "c0", "c1", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c1", "c0", "c0", "c1", "c1", "c1", "c0", "c0", "c1", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c0", "c1", "c0", "c3", "c2", "c3", "c1", "c3", "c1", "c2", "c0", "c3", "c1", "c0", "c2", "c2", "c2", "c1", "c3", "c1", "c1", "c2", "c1", "c2", "c2", "c2", "c2", "c2", "c3", "c3", "c3", "c2", "c1", "c1", "c1", "c1", "c2", "c1", "c2", "c3", "c2", "c1", "c1", "c1", "c2", "c1", "c0", "c1", "c1", "c1", "c2", "c0", "c1", "c2", "c1", "c3", "c2", "c3", "c3", "c0", "c3", "c3", "c3", "c3", "c2", "c3", "c1", "c1", "c2", "c3", "c3", "c3", "c2", "c3", "c1", "c3", "c2", "c3", "c3", "c2", "c2", "c2", "c3", "c3", "c3", "c2", "c2", "c2", "c3", "c2", "c2", "c2", "c3", "c3", "c3", "c1", "c3", "c3", "c3", "c2", "c3", "c2", "c2"]))