In [1]:
import csv
import re
from itertools import takewhile

def parse_arff(arff):
    """Parses arff file"""
    r_datatag = re.compile('^@data', flags=re.IGNORECASE)
    header = list(takewhile(lambda line: not r_datatag.match(line), arff))
    
    r_attrname = re.compile('^@attribute\s([\w.]+)\s.*', flags=re.IGNORECASE)
    attrnames = []
    for line in header:
        try:
            name = r_attrname.match(line).group(1)
        except AttributeError:
            continue
        name = name.replace('.', '_') # dots make fieldnames invalid
        attrnames.append(name)
        
    data = csv.reader(arff)

    return attrnames, data

In [2]:
import numpy as np
import pandas as pd

with open("CWData_train.arff") as arff:
    attrnames, data = parse_arff(arff)
    
    properties = (row[1:-1] for row in data)
    
    df = pd.DataFrame(properties, columns=attrnames[1:-1])
    df = df.replace('?', np.nan)
    df = df.astype(np.float)

In [54]:
from collections import namedtuple
from scipy.stats import shapiro, normaltest
from sklearn.preprocessing import minmax_scale

shapiro_results = pd.DataFrame(columns=['statistic', 'p_value'], dtype=np.float)
dagostino_results = pd.DataFrame(columns=['statistic', 'p_value'], dtype=np.float)

for column in df:
    present = df[[column]].dropna()
    sample = present.sample(n=100)
    standardised = minmax_scale(sample)

    s_statistic, s_p = shapiro(standardised)
    shapiro_results.loc[column] = [s_statistic, s_p]
    
    d_statistic, d_p = normaltest(standardised)
    dagostino_results.loc[column] = [d_statistic[0], d_p[0]]

In [55]:
print(shapiro_results.loc[shapiro_results['p_value'].idxmax()])
print(shapiro_results.loc[shapiro_results['p_value'].idxmin()])

statistic    0.995643
p_value      0.988342
Name: Z4_1, dtype: float64
statistic    0.899592
p_value      0.000001
Name: F5_1, dtype: float64


In [56]:
print(dagostino_results.loc[dagostino_results['p_value'].idxmax()])
print(dagostino_results.loc[dagostino_results['p_value'].idxmin()])

statistic    0.012783
p_value      0.993629
Name: Z5_1, dtype: float64
statistic    27.121017
p_value       0.000001
Name: VHSE8_1, dtype: float64


In [57]:
alpha = 0.05

shapiro_rejects = shapiro_results['p_value'] < alpha
dagostino_rejects = shapiro_results['p_value'] < alpha

print(shapiro_rejects.value_counts())
print(dagostino_rejects.value_counts())

False    48
True     18
Name: p_value, dtype: int64
False    48
True     18
Name: p_value, dtype: int64
