In [350]:
import pandas as pd

In [684]:
from tabint.transform import *

In [351]:
from tabint.utils import *

In [352]:
data = pd.read_csv('DLCO.csv', sep=";")

In [353]:
x = data[['Sex', 'Age', 'Height']]

In [358]:
x.iloc[1,1] = np.NaN

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [361]:
x.head()

Unnamed: 0,Sex,Age,Height
0,F,20.0,173.0
1,F,,165.0
2,F,22.0,168.6
3,F,23.0,164.0
4,F,24.0,170.0


In [360]:
y = data['DLCO']

# steps and transform

In [402]:
def df_from_array(ary, columns, index = None): return pd.DataFrame(ary, columns=columns, index = index)

In [680]:
class TBStep:
    def __init__(self, **kargs): pass
    
    def fit(self, **kargs): pass
    
    def transform(self, df, **kargs): pass
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)

In [677]:
class noop_step(TBStep):
    def transform(self, df): return df

In [678]:
noop_transform = TBTransform([noop_step])

In [676]:
class TBTransform:
    def __init__(self, steps):
        self.steps = steps
        
    def __repr__(self):
        return '\n'.join([str(pos) + ' - '+ str(step) for pos, step in enumerate(self.steps)])
    
    def fit(self, df):
        for step in self.steps: step.fit(df)
        self.first_transform = True
    
    def transform(self, df):
        df = df.copy()
        for step in self.steps: df = step.transform(df)
        if self.first_transform is None: self.get_features(df)

        return df
    
    def get_features(self, df):
        self.features = df.columns
        self.cons = []; self.cats = []
        for feature, value in df.items(): 
            if np.array_equal(np.sort(value.unique()), np.array([0, 1])) or np.array_equal(np.sort(value.unique()), np.array([0])): self.cats.append(feature) 
            else: self.cons.append(feature)
    
    def append(self, steps): self.steps.append(steps)
    
    def insert(self, index, steps): self.steps.insert(index, steps)
        
    def pop(self, n_pop): self.steps.pop(n_pop)

# drop features

In [642]:
class drop_features(TBStep):
    def __init__(self, features = None):
        self.features = features
        
    def __repr__(self):
        print_features = ', '.join(to_iter(self.features))
        return f'drop {print_features}'
    
    def transform(self, df): return df.drop(self.features, axis=1)

In [230]:
dr = drop_features(['a', 'b'])

In [231]:
dr

drop a, b

# select

In [643]:
class select(TBStep):
    def __init__(self, features):
        self.features = features
        
    def __repr__(self):
        print_features = ', '.join(to_iter(self.features))
        return f'select {print_features}'
    
    def transform(self, df): return df[self.features]

In [331]:
slc = select(['a', 'b'])

In [332]:
slc

select a, b

# apply function

In [484]:
def unique_list(*agrs):
    lists = []
    for agr in agrs: lists += list(agr)
    return list(set(lists))

In [485]:
unique_list(['a', 'b'], ['a', 'c'])

['b', 'c', 'a']

In [644]:
class apply_function(TBStep):
    def __init__(self, function_dict): self.function_dict = function_dict
        
    def __repr__(self): 
        keys = ', '.join(self.function_dict.keys())
        return f'apply function for {keys}'
    
    def transform(self, df): 
        df = df.copy()
        for key in self.function_dict.keys(): df[key] = self.function_dict[key](df)
        return df

In [341]:
af = apply_function({'Sex': lambda df: df['Sex'].apply(lambda x : 1 if x == 'F' else 0)})

In [342]:
af

apply function for Sex

In [344]:
af.fit(x)

# fill na

In [646]:
class fill_na(TBStep):
    def __init__(self, features = None):
        self.na_dict = {}
        self.features = features
        
    def __repr__(self):
        return 'fill na'    
    
    def fit(self, df):        
        if self.features is None: self.features = df.columns
        for feature in self.features:
            if is_numeric_dtype(df[feature].values):
                if pd.isnull(df[feature]).sum(): 
                    self.na_dict[feature] = df[feature].median()
    
    def transform(self, df):
        df = df.copy()
        for key in self.na_dict.keys(): df[key] = df[key].fillna(self.na_dict[key])
        return(df)

# remove outlier

In [93]:
from tabint.pre_processing import *
from tabint.utils import *

In [685]:
class remove_outlier(TBStep): 
    def __init__(self, features = None):
        self.features = features
        
    def __repr__(self):
        print_features = ', '.join(to_iter(self.features))
        return f'remove outlier of {print_features}'
        
    def fit(self, df):
        self.bw_dict = {}
        if self.features is None: self.features = df.columns
        for feature, value in df[self.features].items():
            if is_numeric_dtype(value):
                self.bw_dict[feature] = {}
                Min, _, _, _, Max, _ = boxnwhisker_value(value)
                self.bw_dict[feature]['Min'] = Min
                self.bw_dict[feature]['Max'] = Max
        
    def transform(self, df):
        mask =  np.full(df.shape[0], True)
        for key in self.bw_dict.keys():
            values = df[key].values
            Min = self.bw_dict[key]['Min']
            Max = self.bw_dict[key]['Max']
            inlier = np.logical_and(values >= Min, values <= Max)
            mask = np.logical_and(mask, inlier)
        self.mask = mask
        return df[mask]

In [233]:
ro = remove_outlier(['a', 'b'])

In [234]:
ro

remove outlier of a, b

# subset

In [649]:
class subset(TBStep): 
    def __init__(self, n_sample = None, ratio = 0.3):
        self.n_sample = n_sample
        self.ratio = ratio
        
    def __repr__(self): return f'select subset with {self.n_sample} samples'
    
    def fit(self, df):
        if self.n_sample is None: self.n_sample = self.ratio*df.shape[0]
        
    def transform(self, df): return df.sample(self.n_sample)

In [347]:
x.shape[0]

637

In [236]:
ss = subset(20)

In [237]:
ss

select subset of 20 samples

# app cat

In [664]:
class app_cat(TBStep):
    def __init__(self, max_n_cat=15, features = None):
        self.max_n_cat = max_n_cat
        self.features = features
        
    def __repr__(self): return f'apply category with maximum number of distinct value is {self.max_n_cat}'
    
    def fit(self, df):
        if self.features is None: self.features = df.columns
        self.app_cat_dict = {}
        for feature, value in df[self.features].items():
            if is_numeric_dtype(value) and value.dtypes != np.bool:
                if value.nunique()<=self.max_n_cat:
                    if not np.array_equal(value.unique(), np.array([0, 1])): 
                        self.app_cat_dict[feature] = self.as_category_as_order
            else:
                if value.nunique()>self.max_n_cat: self.app_cat_dict[feature] = self.as_category_as_codes
                elif value.dtypes.name == 'object': self.app_cat_dict[feature] = self.as_category_as_order
                elif value.dtypes.name == 'category': self.app_cat_dict[feature] = self.cat_as_order
    
    @staticmethod
    def cat_as_order(x): return x.cat.as_ordered()
    
    @staticmethod
    def as_category_as_codes(x): return x.astype('category').cat.codes+1
    
    @staticmethod
    def as_category_as_order(x): return x.astype('category').cat.as_ordered()
        
    def transform(self, df):
        df = df.copy()
        for key in self.app_cat_dict.keys(): df[key] = self.app_cat_dict[key](df[key])
        return df

In [243]:
ac = app_cat()

In [244]:
ac

apply category with maximum number of distinct value is 15

# dummies

In [438]:
set(['a', 'b'] + ['a', 'c'])

{'a', 'b', 'c'}

In [651]:
class dummies(TBStep):
    def __init__(self, dummy_na = True):
        self.dummy_na = dummy_na

    def __repr__(self): return 'get dummies'
        
    def transform(self, df):
        df = df.copy()
        df = pd.get_dummies(df, dummy_na=self.dummy_na)
        return df

# scale var

In [258]:
import warnings
import sklearn

In [261]:
from sklearn.exceptions import DataConversionWarning

In [652]:
class scale_vars(TBStep):
    def __init__(self, features = None):
        warnings.filterwarnings('ignore', category=sklearn.exceptions.DataConversionWarning)
        self.features= features
        
    def __repr__(self): return 'scale features'
    
    def fit(self, df):
        if self.features is None: self.features = df.columns        
        self.features = [i for i in self.features if is_numeric_dtype(df[i])]
        map_f = [([n],StandardScaler()) for n in df[self.features].columns]
        self.mapper = DataFrameMapper(map_f).fit(df[self.features].dropna(axis=0))
        
    def transform(self, df):
        df = df.copy()
        df[self.mapper.transformed_names_] = self.mapper.transform(df[self.features])
        return df

In [274]:
sv = scale_vars()

In [275]:
sv

scale features

In [276]:
sv.fit(x)

# test

In [665]:
steps = [apply_function({'height2': lambda df: df['Height']*2}),
        fill_na(),
         select(['Sex', 'Age', 'Height', 'height2']),
         drop_features('height2'),
         scale_vars(),
         app_cat(),
         dummies(),
         remove_outlier('Height')]

In [666]:
tfms = TBTransform(steps)

In [667]:
tfms

0 - apply function for height2
1 - fill na
2 - select Sex, Age, Height, height2
3 - drop height2
4 - scale features
5 - apply category with maximum number of distinct value is 15
6 - get dummies
7 - remove outlier of Height

In [668]:
tfms.fit(x)

In [669]:
a = tfms.transform(x)

In [670]:
tfms.features

Index(['Age', 'Height', 'Sex_F', 'Sex_M', 'Sex_nan'], dtype='object')

In [671]:
tfms.cats

['Sex_F', 'Sex_M', 'Sex_nan']

In [672]:
tfms.cons

['Age', 'Height']

In [630]:
a.Sex_nan.unique()

array([0], dtype=uint64)

In [614]:
np.sort(a.Sex_F.unique())

array([0, 1], dtype=uint64)

In [610]:
np.array_equal(a.Sex_F.unique(), np.array([0, 1]))

False

# dataset

In [539]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.utils.validation import _num_samples, check_array
from sklearn.model_selection._split import _approximate_mode, _validate_shuffle_split
from sklearn.utils import indexable, check_random_state, safe_indexing

class split_by_cats(StratifiedShuffleSplit):
    def _iter_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([' '.join(row.astype('str')) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)
        if np.min(class_counts) < 2:
            print(ValueError("The least populated class in y has only 1"
                             " member, which is too few. The minimum"
                             " number of groups for any class cannot"
                             " be less than 2."))

        if n_train < n_classes:
            print(ValueError('The train_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_train, n_classes)))
        if n_test < n_classes:
            print(ValueError('The test_size = %d should be greater or '
                             'equal to the number of classes = %d' %
                             (n_test, n_classes)))

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(np.argsort(y_indices, kind='mergesort'),
                                 np.cumsum(class_counts)[:-1])

        rng = check_random_state(self.random_state)

        for _ in range(self.n_splits):
            # if there are ties in the class-counts, we want
            # to make sure to break them anew in each iteration
            n_i = _approximate_mode(class_counts, n_train, rng)
            class_counts_remaining = class_counts - n_i
            t_i = _approximate_mode(class_counts_remaining, n_test, rng)

            train = []
            test = []

            for i in range(n_classes):
                permutation = rng.permutation(class_counts[i])
                perm_indices_class_i = class_indices[i].take(permutation,
                                                             mode='clip')

                train.extend(perm_indices_class_i[:n_i[i]])
                test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])

            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test

In [532]:
def split_time_series(df, time_feature, ratio):
    df = df.copy()
    df = df.sort_values(by=time_feature, ascending=True)
    split_id = int(df.shape*(1-ratio))
    x_trn, y_trn = df[:split_id], y[:split_id]
    x_val, y_val = df[split_id:], y[split_id:]
    return x_trn, y_trn, x_val, y_val

In [557]:
def stratify_split(df, y, cats, ratio):
    keys = df[cats]
    if y.dtype.name[:5] != 'float': keys = pd.concat([keys, y], axis=1)
    keys = keys.apply(lambda x: '~'.join([str(j) for j in x.values]), axis=1)

    sss = split_by_cats(train_size =1-ratio, test_size=ratio)
    train, val = next(sss.split(df, keys))                
    x_trn, x_val = safe_indexing(df, train), safe_indexing(df, val)            
    y_trn, y_val = safe_indexing(y, train), safe_indexing(y, val)
    return x_trn, y_trn, x_val, y_val

In [565]:
ds = TBDataset.from_SKSplit(x, y, cats = 'Sex')

In [566]:
ds.x_trn.shape

(509, 3)

In [567]:
ds.cons

['Age', 'Height']

In [686]:
class TBDataset:
    """
    Contain train, validation, test set
    """
    def __init__(self, x_trn, x_val, x_tst, x_tfms, y_trn, y_val, y_tfms):            
        self.x_trn, self.y_trn, self.x_tst = x_trn, y_trn, x_tst
        self.x_val, self.y_val = x_val, y_val
        self.x_tfms, self.y_tfms = x_tfms, y_tfms

    @classmethod
    def from_Split(cls, df, y = None, y_field = None, tp = '_',  
                     x_tst = None, time_feature = None, ratio = 0.2, 
                     x_tfms = None, y_tfms = None, **kargs):
        """
        use sklearn split function to split data
        """
        df = df.copy()
        
        if y is None: y = df[y_field]; df = df.drop(y_field, axis = 1)
            
        if tp != 'time series': x_trn, y_trn, x_val, y_val = stratify_split(df, y, x_tfms.cats, ratio)
        else: x_trn, y_trn, x_val, y_val = split_time_series(df, time_feature, ratio)
        
        x_trn, x_val, x_tst, y_trn, y_val, x_tfms, y_tfms = transform_data(x_trn, x_val, x_tst, y_trn, y_val, x_tfms, y_tfms)
        return cls(x_trn, x_val, x_tst, x_tfms, y_trn, y_val, y_tfms)
    
    @staticmethod
    def transform_data(x_trn, x_val, x_tst, y_trn, y_val, x_tfms, y_tfms):
        if x_tfms is not None: x_tfms = noop_transform
        x_tfms.fit(x_trn)
        x_trn = x_tfms.transform(x_trn)
        x_val = x_tfms.transform(x_val)
        if x_tst is not None: x_tst = x_tfms.transform(x_tst)
                
        if y_tfms is not None: y_tfms = noop_transform
        y_tfms.fit(y_trn)
        y_trn = y_tfms.transform(y_trn)
        y_val = y_tfms.transform(y_val)
            
        return x_trn, x_val, x_tst, y_trn, y_val, x_tfms, y_tfms         
            
    def val_permutation(self, features):
        """"
        permute one or many columns of validation set. For permutation importance
        """
        features = to_iter(features)
        df = self.x_val.copy()
        for ft in features: df[ft] = np.random.permutation(df[ft])
        return df

    def apply_function(self, feature, function_dict, inplace = True, tp = 'trn'):
        """
        apply a function f for all dataset
        """
        features = to_iter(features)
        step = apply_function(function_dict).fit(self.x_trn)
        self.apply_step(step, features, inplace, tp)

    def sample(self, tp = 'trn', ratio = 0.3):
        """
        get sample of dataset
        """
        if 'tst' == tp: 
            return None if self.x_tst is None else self.x_tst.sample(self.x_tst.shape[0]*ratio)
        else:
            df, y = (self.x_trn, self.y_trn) if tp == 'trn' else (self.x_val, self.y_val)
            _, df, _, y = train_test_split(df, y, test_size = ratio, stratify = y)
            return df, y

    def select(self, features, inplace = True, tp = 'trn'):
        """
        keep columns of dataset
        """
        features = to_iter(features)
        step = select(features).fit(self.x_trn)
        self.apply_step(step, features, inplace, tp)
        
    def drop(self, feature, inplace = True, tp = 'trn'):
        """
        drop columns of dataset
        """
        features = to_iter(features)
        step = drop_features(features).fit(self.x_trn)
        self.apply_step(step, features, inplace, tp)
            
    def remove_outlier(self, features = None, inplace = True, tp = 'trn'):
        features = features or self.cons
        features = to_iter(features)
        mask_trn = self.get_mask_outlier(self.x_trn, features)
        mask_val = self.get_mask_outlier(self.x_val, features)
        if inplace:
            self.x_trn, self.y_trn = self.x_trn[mask_trn], self.y_trn[mask_trn]
            self.x_val, self.y_val = self.x_val[mask_val], self.y_val[mask_val]
        else:
            return (self.x_trn[mask_trn], self.y_trn[mask_trn]) if tp == 'trn' else (self.x_val[mask_val], self.y_val[mask_val])
    
    def get_mask_outlier(self, df, features):
        step = remove_outlier(features)
        step.fit(df)
        _ = step.transform(df)
        mask = step.mask
        return mask
    
    
    def apply_step(self, step, features, inplace, tp):
        if inplace:
            x_tfms.append(step)
            self.x_trn = step.transform(self.x_trn)
            self.x_val = step.transform(self.x_val)
            if self.x_tst is not None: self.x_tst = step.transform(self.x_tst)
            x_tfms.get_features(self.x_trn)    
        else:
            if tp == 'tst': return None if self.x_tst is None else step.transform(self.x_tst)
            else: return (step.transform(self.x_trn), self.y_trn) if tp == 'trn' else (step.transform(self.x_val), self.y_val)    
    
    @property
    def cons(self): return self.x_tfms.cons
    
    @property
    def cats(self): return self.x_tfms.cats
    
    @property
    def features(self): return self.x_trn.columns

    @property
    def trn(self): return self.x_trn, self.y_trn

    @property
    def n_trn(self): return self.x_trn.shape[0]

    @property
    def val(self): return self.x_val, self.y_val

    @property
    def n_val(self): return self.x_val.shape[0]