# Missing values

1. This notebook explores the approaches to handle missing values
2. INSEAD Chiller Plant data is first considered. However, North Point would be ideal (let's probably use it as well)

In [1]:
% matplotlib inline 

import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd
import numpy as np
import glob
import os

import common

In [2]:
# load data
df = common.load_df("../data/insead", "*21*.csv")

In [29]:
# replace nulls with rolling mean or near ones
def replace_nulls(df, cols=[]):
    cols = cols or df.columns
    mean = df[cols].rolling(5, min_periods=1).mean()
    df[cols] = df[cols].fillna(mean)
    df[cols] = df[cols].fillna(method="pad")
    return df

# create binary cols. 1 if > thresh, 0 if <= thresh
def create_binary_cols(df, cols=[], thresh=0.1):
    cols = cols or df.columns
    for c in cols:
        binzr = preprocessing.Binarizer(thresh).fit(df[c])
        df[c + "_bin"] = binzr.transform(df[c])
    return df

# set values < thresh to 0
def replace_with_zero(df, cols=[], thresh=0.1):
    cols = cols or df.columns
    cols = [i for i in cols if not i.endswith("_bin")] # ignore step 2 cols
    for c in cols:
        df.loc[df[c] < thresh, c] = 0
    return df

    
# replace values < thresh with near values
def replace_with_near(df, cols=[], thresh=0.1):
    cols = cols or df.columns
    cols = [i for i in cols if not i.endswith("_bin")] # ignore step 2 cols
    for c in cols:
        df.loc[df[c] < thresh, c] = np.nan
    df[cols] = df[cols].fillna(method="pad")
    return df

        
# create ctkw total
def set_ctwk_total(df, cols=[]):
    cols = cols or df.columns
    ctkw_cols = [i for i in cols if i.startswith("ct") and i.endswith("kw")]
    df["cwkw_sum"] = df[ctkw_cols].sum(axis=1)
    return df


# remove random noise. do some soft smoothing.
def smooth_data(df, cols=[]):
    cols = cols or df.columns
    df[cols] = df[cols].rolling(10, min_periods=1).mean()

# create chunks. break when values are < thresh
def create_chunks(df, cols=[], thresh=0.1):
    cols = cols or df.columns
    cols = [i for i in cols if not i.endswith("_bin")]
    
    df = df.copy()
    df[df[cols] < thresh] = np.nan

    chunks = []
    break_len = 0
    start = 0
    end = min(0, df.shape[0]-1)
    has_null = lambda d, row: np.any(np.isnan(d.loc[row, cols].values))

    for i in range(df.shape[0]):
        if has_null(d, i):
            break_len += 1
        else:
            if break_len != 0: start = i
            break_len = 0

        if break_len == 1 or i == df.shape[0]-1:
            end = i-1
            if i == df.shape[0]-1 and not has_null(d, i):
                end = i
            chunks.append((start, end))
    return chunks

In [30]:
d = pd.DataFrame({"A": [1,2,3, np.nan, np.nan, np.nan, 4,5,6,7,np.nan,1,2,np.nan,1,2,3,np.nan,1], 
                  "B": [1,2,3, np.nan, np.nan, np.nan, 4,5,6,7,np.nan,1,2,np.nan,1,2,3,np.nan,1]})
chunks = step_7(d, thresh=2)     
print(chunks)

[(0, 2), (6, 9), (11, 12), (14, 16), (18, 18)]


In [31]:
def preprocess_approach_1(df, cols=[]):
    pipeline = [
        replace_nulls,
        create_binary_cols,
        replace_with_zero,
        set_ctwk_total,
        smooth_data,
    ]
    for func in pipeline:
        df = func(df)
    return df


def preprocess_approach_2(df, cols=[]):
    pipeline = [
        replace_nulls,
        create_binary_cols,
        replace_with_near,
        set_ctwk_total,
        smooth_data,
    ]
    for func in pipeline:
        df = func(df)
    return df