In [3]:
# Core DS libraries
import numpy as np
import pandas as pd

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

import mytk # My Toolkit
import wrangle # Helper functions

- Detect if column is numerical or categorical
    - Allow manual overriding

In [4]:
df = wrangle.wrangle_potatoes()[0]

In [5]:
df.columns

Index(['product_code', 'loading', 'attribute_0', 'attribute_1', 'attribute_2',
       'attribute_3', 'measurement_0', 'measurement_1', 'measurement_2',
       'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
       'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10',
       'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14',
       'measurement_15', 'measurement_16', 'measurement_17', 'failure'],
      dtype='object')

In [6]:
l = df.select_dtypes(include=['object','category','bool']).columns.tolist()
l

['product_code', 'attribute_0', 'attribute_1']

In [7]:
def get_column_types(df, override_categorical=[], override_numerical=[]):

    cat_cols = df.select_dtypes(include=['object','category','bool']).columns.tolist()
    num_cols = df.select_dtypes(exclude=['object','category','bool']).columns.tolist()

    for val in override_categorical:
        if val in num_cols:
            num_cols.remove(val)
            cat_cols.append(val)
    for val in override_numerical:
        if val in cat_cols:
            cat_cols.remove(val)
            num_cols.append(val)
            
    # cat_cols.sort()
    # num_cols.sort()
    out = {
        'cat': cat_cols,
        'num': num_cols
    }
    return out

In [8]:
gotchas = mytk.get_gotchas(df)
gotchas['possible_bools'] + gotchas['probable_categories']

['attribute_0',
 'failure',
 'product_code',
 'attribute_0',
 'attribute_1',
 'attribute_2',
 'attribute_3',
 'failure']

In [9]:
get_column_types(df, override_categorical=gotchas['possible_bools'] + gotchas['probable_categories'])

{'cat': ['product_code',
  'attribute_0',
  'attribute_1',
  'failure',
  'attribute_2',
  'attribute_3'],
 'num': ['loading',
  'measurement_0',
  'measurement_1',
  'measurement_2',
  'measurement_3',
  'measurement_4',
  'measurement_5',
  'measurement_6',
  'measurement_7',
  'measurement_8',
  'measurement_9',
  'measurement_10',
  'measurement_11',
  'measurement_12',
  'measurement_13',
  'measurement_14',
  'measurement_15',
  'measurement_16',
  'measurement_17']}

In [10]:
len(df.columns)

25

In [11]:
import json
class CustomJSONizer(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.bool_):
            return super().encode(bool(obj))
        
        elif isinstance(obj, np.ndarray):
            return super().encode(str(obj))
        
        elif isinstance(obj, pd.DataFrame):
            return super().encode(obj.to_dict())

        else:
            return super().default(obj)

def prettify(obj):
    return json.dumps(obj, cls=CustomJSONizer, indent=2) 

# Categorical Columns

- If column is categorical.
    - Chi2 against each other categorical
    - ANOVA against each numerical
        - If Reject
            - 1-sample ttest each categorical subset vs overall mean.

In [12]:
coltype = get_column_types(df, override_categorical=gotchas['possible_bools'] + gotchas['probable_categories'])

out = {}
for col in coltype['cat']:
    cold = out[col] = {}
    
    this = cold['chi2'] = {}
    for target in coltype['cat']:
        if target != col:
            this[target] = mytk.chi2_test(df[col], df[target])
    
    this = cold['anova'] = {}
    for target in coltype['num']:
        if target != col:
            anova = this[target] = mytk.anova_variance_in_target_for_cat(df, target, col)
            if anova['reject'] == True:
                anova['ttest'] = mytk.ttest_target_for_each_cat(df, target, col)

print(prettify(out['failure']))

{
  "chi2": {
    "product_code": {
      "reject": "true",
      "h0": "The two samples are independent.",
      "stat_name": "Chi2",
      "stat": 10.064933483868586,
      "p_value": 0.03934797618467425,
      "alpha": 0.05
    },
    "attribute_0": {
      "reject": "false",
      "h0": "The two samples are independent.",
      "stat_name": "Chi2",
      "stat": 1.679258870484121,
      "p_value": 0.19502296029718377,
      "alpha": 0.05
    },
    "attribute_1": {
      "reject": "false",
      "h0": "The two samples are independent.",
      "stat_name": "Chi2",
      "stat": 5.191655190674597,
      "p_value": 0.07458412505140172,
      "alpha": 0.05
    },
    "attribute_2": {
      "reject": "true",
      "h0": "The two samples are independent.",
      "stat_name": "Chi2",
      "stat": 9.251887369081732,
      "p_value": 0.026122760097579713,
      "alpha": 0.05
    },
    "attribute_3": {
      "reject": "true",
      "h0": "The two samples are independent.",
      "stat_name

# Numerical Columns

- If column is numerical
    - for each cat column
        - Run anova /ttest combo
    - for each num column
        - Spearmans correlation

In [13]:
def spearman_correllation_test(df, x, y, alpha=0.05):
    from scipy.stats import spearmanr

    stat, p = spearmanr(df[x], df[y])
    result={'reject': p < alpha,
        'h0' : f"The samples of '{x}' and '{y}' are independant",
        'stat_name': 'correlation',
        'stat': stat,
        'p_value': p,
        'alpha': alpha
    }
    return result

In [14]:
spearman_correllation_test(df, 'measurement_17', 'measurement_8')

{'reject': True,
 'h0': "The samples of 'measurement_17' and 'measurement_8' are independant",
 'stat_name': 'correlation',
 'stat': 0.433931869077272,
 'p_value': 0.0,
 'alpha': 0.05}

In [15]:
for col in coltype['num']:
    cold = out[col] = {}
    
    this = cold['anova'] = {}
    for target in coltype['cat']:
        if target != col:
            anova = this[target] = mytk.anova_variance_in_target_for_cat(df, col, target)
            if anova['reject'] == True:
                anova['ttest'] = mytk.ttest_target_for_each_cat(df, col, target)

    this = cold['spearmanr'] = {}
    for target in coltype['num']:
        if target != col:
            this[target] = spearman_correllation_test(df, target, col)

print(prettify(out))

{
  "product_code": {
    "chi2": {
      "attribute_0": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 15942.0,
        "p_value": 0.0,
        "alpha": 0.05
      },
      "attribute_1": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 31884.0,
        "p_value": 0.0,
        "alpha": 0.05
      },
      "failure": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 10.064933483868586,
        "p_value": 0.03934797618467425,
        "alpha": 0.05
      },
      "attribute_2": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 47826.0,
        "p_value": 0.0,
        "alpha": 0.05
      },
      "attribute_3": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "s

In [44]:
def all_the_stats(df, override_categorical=[], override_numerical=[]):
    # Initialize the dictionary that will be iteratively built
    out = {}
    # Separate columns into categorical and numerical
    coltype = get_column_types(df, override_categorical=override_categorical)
    
    # Loop through each categorical column
    for col in coltype['cat']:
        if len(df[col].value_counts()) > 1:
            cold = out[col] = {}
            
            # Run a chi2 test on every other categorical column
            this = cold['chi2'] = {}
            for target in coltype['cat']:
                if len(df[target].value_counts()) > 1:
                    if target != col:
                        this[target] = mytk.chi2_test(df[col], df[target])
            # Run an anova test on every numerical column
            this = cold['anova'] = {}
            for target in coltype['num']:
                if target != col:
                    anova = this[target] = mytk.anova_variance_in_target_for_cat(df, target, col)
                    # If we reject the null run a ttest to determine which categories are significant
                    if anova['reject'] == True:
                        anova['ttest'] = mytk.ttest_target_for_each_cat(df, target, col)

    # The loop through each numerical column
    for col in coltype['num']:
        
        cold = out[col] = {}
        
        # Repeat the Anova tests on each categorical column for readability
        this = cold['anova'] = {}
        for target in coltype['cat']:
            if len(df[target].value_counts()) > 1:
                if target != col:
                    anova = this[target] = mytk.anova_variance_in_target_for_cat(df, col, target)
                    if anova['reject'] == True:
                        anova['ttest'] = mytk.ttest_target_for_each_cat(df, col, target)

        # Run a correlation test for every other numerical column
        this = cold['spearmanr'] = {}
        for target in coltype['num']:
            if target != col:
                this[target] = mytk.spearman_correllation_test(df, target, col)

    return out

In [19]:
get_column_types(df, override_categorical=gotchas['possible_bools'] + gotchas['probable_categories'])

{'cat': ['product_code',
  'attribute_0',
  'attribute_1',
  'failure',
  'attribute_2',
  'attribute_3'],
 'num': ['loading',
  'measurement_0',
  'measurement_1',
  'measurement_2',
  'measurement_3',
  'measurement_4',
  'measurement_5',
  'measurement_6',
  'measurement_7',
  'measurement_8',
  'measurement_9',
  'measurement_10',
  'measurement_11',
  'measurement_12',
  'measurement_13',
  'measurement_14',
  'measurement_15',
  'measurement_16',
  'measurement_17']}

In [37]:
results = all_the_stats(df, override_categorical=gotchas['possible_bools'] + gotchas['probable_categories'])

print(prettify(results))

{
  "product_code": {
    "chi2": {
      "attribute_0": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 15942.0,
        "p_value": 0.0,
        "alpha": 0.05
      },
      "attribute_1": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 31884.0,
        "p_value": 0.0,
        "alpha": 0.05
      },
      "failure": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 10.064933483868586,
        "p_value": 0.03934797618467425,
        "alpha": 0.05
      },
      "attribute_2": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "stat_name": "Chi2",
        "stat": 47826.0,
        "p_value": 0.0,
        "alpha": 0.05
      },
      "attribute_3": {
        "reject": "true",
        "h0": "The two samples are independent.",
        "s

In [39]:
prod = {}
for code in df.product_code.unique().tolist():
    prod[code] = df[df.product_code == code].copy()

In [41]:
prod['A'].product_code.value_counts()

A    3072
Name: product_code, dtype: int64

In [59]:
results = all_the_stats(prod['A'], override_categorical=gotchas['possible_bools'] + gotchas['probable_categories'])


In [63]:
def pop_unrejected(results):
    for column, tests in results.items():
        for test, targets in tests.items():
            to_pop = []
            for target, result in targets.items():
                if result['reject'] == False:
                    to_pop.append(target)
            for target in to_pop:
                del targets[target]
    return results

In [62]:

print(prettify(results))

{
  "failure": {
    "chi2": {},
    "anova": {
      "loading": {
        "reject": "true",
        "h0": "There is no variance in loading between subsets of failure",
        "stat_name": "F",
        "stat": 41.92073593296491,
        "p_value": 1.1026931124492622e-10,
        "alpha": 0.05,
        "ttest": {
          "0.0": {
            "reject": "true",
            "h0": "The mean of loading for failure:0.0 is the same as the overall population",
            "stat_name": "F",
            "stat": -3.1931629074141643,
            "p_value": 0.001425841545054924,
            "alpha": 0.05
          },
          "1.0": {
            "reject": "true",
            "h0": "The mean of loading for failure:1.0 is the same as the overall population",
            "stat_name": "F",
            "stat": 5.365424335173577,
            "p_value": 1.0897411854898782e-07,
            "alpha": 0.05
          }
        }
      },
      "measurement_12": {
        "reject": "true",
        "h0": "Th