# Znovupoužitie - Pipeline

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import json
import scipy.stats as stats
import math

import category_encoders as ce

import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms

import vizualizacia_funkcie as visual

from sklearn.experimental import enable_iterative_imputer 
from sklearn import impute 
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import base
from sklearn import compose
from sklearn import feature_selection

from datetime import datetime
from datetime import date

import imblearn

In [29]:
df1 = pd.read_csv("./data/personal_train.csv", index_col=0)
df2 = pd.read_csv("./data/other_train.csv", index_col=0)

Tu nizsie mame funkcie, ktore boli deklarovane a pouzite v preprocessing etape. Dovodom, preco kopirujeme vsetky tieto funkcie do tohto notebooku je kvoli prehladnosti, ako aj kvoli tomu, ze potom dany notebook upravime na script, ktory nasledne pouzijeme v dalsej etape.

# Joining a merging dat

In [30]:
#funckia, ktora mergne zaznamy, ktore su rovnake
def piece_datarows_together(data):
    
    data = data.copy().set_index("name")
    
    #toto nam vrati dataset, ktory obsahuje vsetky duplikaty, s ktorymi budeme pracovat
    #proste to vrati data, ktore maju index, ktory je v datasete viac ako raz pouzity
    duplicated = data[data.index.duplicated(keep=False)]
    
    index_values = duplicated.index.unique()
    
    #najprv vsetky hodnoty prenesieme do prveho vyskytu zaznamu daneho pacienta v datasete
    for idx in index_values:
        mini_dataset = duplicated.loc[idx] #toto vrati viacero zaznamov s rovnakych idx
        
        #zistim si, ktore atributy su nullove pre presne prvy zaznam a pre konkretne nullove atributy budem nadalej hladat
        #nenullovu hodnotu v ostatnych zaznamoch s rovnakym idx
        missing_mask = mini_dataset.iloc[0].isnull()
        attributes = mini_dataset.columns.values
        missing_attributes = attributes[missing_mask]
        
        #tu replacujem null hodnoty za nenullove
        for attr in missing_attributes:
            not_null = mini_dataset[attr][mini_dataset[attr].notnull()]
            
            if len(not_null) != 0:
                mini_dataset.iloc[0][attr] = not_null.values[0]
        
        
    #teraz uz mozme vymazat vsetky druhe, resp. ostatne zaznamy pacienta
    duplicated_mask = data.index.duplicated(keep="first")
    
    data = data.reset_index()
    duplicated_indices = data.index.values[duplicated_mask]
    
    
    return data.drop(index=duplicated_indices).reset_index(drop=True)


In [31]:
#funkcia, ktora joine obi dva dataframy, s ktorymi pracujeme + mergne riadky, kde su splittnute data
def one_proper_df(df1, df2, return_X_y=True):
    data = df1.drop(columns=["address"]).set_index("name").join(df2.set_index("name"), how="right").reset_index()
    data = piece_datarows_together(data)
    
    if return_X_y == True:
        X = data.drop(columns=["class"])
        y = data["class"]
        return X,y
    
    else:
        return data 
    

Tu nizsie mame funkcie, ktore pouzivame na zmensenie poctu hodnot kategorickych atributov. Vyber atributov, ktore sa merguju, sme vybrali este pocas fazy analyzy, kedy malo pocetne hodnoty su mergnute do jednej hodnoty, aby hodnoty daneho atributu boli viac vyrovnane.

# Prvotne preprocessing kroky - cez FunctionTransformer

In [32]:
def marital_status_categories(row):
    
    ms = row["marital-status"]
        
    if ms is not np.nan and ms not in ("Divorced", "Never-married", "Married-civ-spouse"):
        row["marital-status"] = "Other"
        
    return row

def relationship_categories(row):
    
    rel = row["relationship"]
        
    if rel is not np.nan and rel not in ("Not-in-family", "Husband", "Own-child"):
        row["relationship"] = "Other"
        
    return row

def occupation_categories(row):

    occ = row["occupation"]
    
    if occ is not np.nan and occ not in ("Craft-repair", "Prof-specialty", "Exec-managerial", 
                                         "Adm-clerical", "Sales", "Other-service", "Machine-op-inspct", 
                                         "Transport-moving"):
        
        row["occupation"] = "Other"
        
    return row

def workclass_categories(row):

    wc = row["workclass"]
    
    if wc is not np.nan and wc != "Private":
        row["workclass"] = "Non-private"
        
    return row

#oproti ostatnym funkciam v tejto bunke, tato funkcia sluzi na transformaciu spojiteho atributu hours-per-week na kategoricky
def categorize_hours(row):
    
    hour = row["hours-per-week"]
    
    if math.isnan(hour):
        row["hours-per-week-cat"] = math.nan
    elif hour <= 35:
        row["hours-per-week-cat"] = "<=35"
    elif hour <= 45:
        row["hours-per-week-cat"] = "35< hours <=45"
    elif hour > 45:
        row["hours-per-week-cat"] = ">45"        

    return row

def simplify_education(row):
        
    edu = row["education"]
        
    if edu is np.nan:
        row["simple-edu"] = edu
        
    elif re.match("^([0-9][a-zA-Z])|(1[0-2][a-zA-Z])", edu) or edu == "Preschool":
        row["simple-edu"] = "Attending-school"
        
    elif edu in ["Assoc-acdm", "Assoc-voc", "Prof-school"]:
        row["simple-edu"] = "Edu after HS, no uni"
        
    elif edu in ["Masters", "Doctorate"]:
        row["simple-edu"] = "Masters/Doctorate"
        
    else:
        row["simple-edu"] = row["education"]
    
    return row

Tu su nejake cary-mary, kedy s atributu date_of_birth, chceme ziskat rok narodenia, ktory nasledne mozeme pouzit na imputaciu missing values, ci zlych hodnot atributu age - totiz age ma v sebe zle namerane hodnoty, ktore su bud zaporne, alebo velmi velke (v tisickach), a tak dane zle hodnoty rovno nastavime na np.nan, pricom ich nasledne imputujeme pomocou roku narodenia, co, ako som uz napisal, ziskavame pomocou tejto funkcie.

In [33]:
def date_formatting(data):    
    
    data = data.copy()
    
    import re
    dates = []

    for index,row in data.iterrows():
        dates.append(re.sub('\d', '*',  row['date_of_birth']))

    dates = list(set(dates))
    dates

    from datetime import datetime

    for index,row in data.iterrows():
        line = row['date_of_birth']
        if re.match(r"^\d{2}-\d{2}-\d{2}$", line):
            regex1 = line[0:2]
            regex2 = line[3:5]
            regex3 = line[6:8]

            verbose = False
            if (verbose == True):
                if (int(regex1) > 31):
                    print('Prvy udaj > 31: ',regex1)
                if (int(regex2) > 31):
                    print('Druhy udaj > 31: ',regex2)
                if (int(regex3) > 31):
                    print('Treti udaj > 31: ',regex3)

    data['date_of_birth'] = data['date_of_birth'].map(lambda x: x[:10])
    
    for index,row in data.iterrows():
        line = row['date_of_birth']
        dateObj = None
        if re.match(r"^\d{2}-", line):
            newDate = '19' + line
            dateObj = datetime.strptime(newDate,'%Y-%m-%d')
        elif re.match(r"^\d{4}-", line):
            dateObj = datetime.strptime(line,'%Y-%m-%d')
        elif re.match(r"^\d{4}/", line):
            dateObj = datetime.strptime(line,'%Y/%m/%d')
        elif re.match(r"^\d{2}/", line):
            dateObj = datetime.strptime(line,'%d/%m/%Y')
        data.at[index,'date_of_birth'] = dateObj.strftime('%d-%m-%Y')
    
    return data


Tu nizsie mame rozne funckie, ktore aplikujeme v prvotnej faze pipelinu, kedy pouzivame triedu preprocessing.FunctionTransformer, ktory dovoluje aplikovanie custom funkcie na nas dataset. Teda pocas tejto prvotnej fazy aplikujeme na dataset jednoduche operacie, ktore opravuju nejake atributy, ako napriklad odstranovanie white spacov, ci ziskanie novych atributov z atributu medical_info, alebo odstranovanie useless atributov, vid nizsie.

In [34]:
def remove_useless_features(X):
    
    X = X.copy()
    
    useless_cols = ["name", "race", "pregnant", "capital-loss", "capital-gain", "fnlwgt", "native-country", "address"]
    
    return X.drop(columns=useless_cols)

def add_oxygen_features(X):
    X = X.copy()
    
    X["mean_oxygen"] = 0
    X["std_oxygen"] = 0
    X["kurtosis_oxygen"] = 0
    X["skewness_oxygen"] = 0
    
    X = X.apply(get_oxygen_stats, axis=1)
    
    return X.drop(columns=["medical_info"])

#ziskavam 4 atributy o kysliku z atributu medical_info
def get_oxygen_stats(row):
    
    string = row["medical_info"]
    
    if string is np.nan:
        return row
    
    string = string.replace("\'", "\"")
    di = json.loads(string)
    
    for k in di.keys():
        row[k] = float(di[k])
        
    return row

def string_wrap_formatting(X):
    X = X.copy()
    return X.apply(string_formatting, axis=0)

#vymazem white spacy a "?" vymenim za np.nan pre vsetky atributy typu "O" - object - string
def string_formatting(col):
    
    if col.dtype == "O":
        col = col.apply(lambda row: row.strip() if row is not np.nan else row)
        col = col.apply(lambda row: np.nan if row is not np.nan and row == "?" else row)
    
    return col

#tato funkcia je wrapper, ktory aplikuje funkcie na zmensenie poctu hodnot kategorickych atributov
def bucket_cat_attr(X):
   
    X = X.copy()
    
    X = X.apply(marital_status_categories, axis=1)
    X = X.apply(relationship_categories, axis=1)
    X = X.apply(occupation_categories, axis=1)
    X = X.apply(workclass_categories, axis=1)
    
    X["hours-per-week-cat"] = 0
    X = X.apply(categorize_hours, axis=1)
    X = X.drop(columns=["hours-per-week"])
    
    return X

#atribut mean_glucose, je potrebne pretypovat
def repair_mean_glucose(X):
    
    X = X.copy()
    X["mean_glucose"] = pd.to_numeric(X['mean_glucose'], errors= 'coerce')
    return X

#tu najprv extraktujeme rok narodenia z atributu date_of_birth, nasledne nullujeme zle hodnoty v atribute age, a rovno aj
#imputujeme hodnoty v atribute age pomocou extrahovanych rokou narodenia
def prepare_age(X):
    X = X.copy()
    X = date_formatting(X)
    
    X = X.apply(make_bs_age_nan, axis=1)
    X = X.apply(calculate_age, axis=1)
    
    X = X.drop(columns=["date_of_birth"])
    
    return X
    
#zle hodnoty agu -> np.nan
def make_bs_age_nan(row):
    
    age = row["age"]
    
    if age is np.nan:
        return row
    
    if age <= 0 or age >= 100:
        row["age"] = np.nan
        
    return row

#imputovanie hodnot agu pomocou roku narodenia
def calculate_age(row):
    
    if row["age"] is np.nan or math.isnan(row["age"]):
    
        born = row["date_of_birth"]

        born = datetime.strptime(born, "%d-%m-%Y").date()
        today = date.today()
        
        row["age"] = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
 
        
    return row

Takto vyzera prvy krok pipelinu. Chcel som prvotne vnorit tento pipeline do hlavneho pipelinu, ale asi imblearn pipeline to neumoznuje, lebo sa mu to nepacilo, takze tieto kroky su rozpisane v hlavnom pipeline. Avsak tu je ten pipeline zobrazeny len pre vizualizacne ucely.

Dovodom, preco pouzivame imblearn je aplikacia resamplingu - konkretne odstranovanie outlierov pocas pipelinu. To klasicky scikit pipeline nedokaze.

In [35]:
pipeline.Pipeline(steps=[
    ("feature_removal", preprocessing.FunctionTransformer(remove_useless_features)),
    ("add_oxygen_attr", preprocessing.FunctionTransformer(add_oxygen_features)),
    ("mean_glucose_to_num", preprocessing.FunctionTransformer(repair_mean_glucose)),
    ("string_formatting", preprocessing.FunctionTransformer(string_wrap_formatting)),
    ("bucket_cat_attr", preprocessing.FunctionTransformer(bucket_cat_attr))
])
    

Pipeline(steps=[('feature_removal',
                 FunctionTransformer(func=<function remove_useless_features at 0x0000018736D389D0>)),
                ('add_oxygen_attr',
                 FunctionTransformer(func=<function add_oxygen_features at 0x0000018736D383A0>)),
                ('mean_glucose_to_num',
                 FunctionTransformer(func=<function repair_mean_glucose at 0x0000018736D380D0>)),
                ('string_formatting',
                 FunctionTransformer(func=<function string_wrap_formatting at 0x0000018736CFB160>)),
                ('bucket_cat_attr',
                 FunctionTransformer(func=<function bucket_cat_attr at 0x0000018736D38550>))])

# Imputing

In [36]:
#tato funkcia je ekvivalentna s hociktorym inym transformatorom v scikit-learne, ci uz na imputaciu, transformaciu, scaling a ine veci
#jediny rozdiel je, ze to nevrati numpy array, ale DataFrame, a vdaka tomu si uchvoavam nazvy stlpcov
#toto je klucove, pokial chcem pouzivat napriklad ColumnTransformer v dalsom kroku pipelinu, pokial chcem referovat jednotlive atributy
#dataframu na zaklade ich mena
class KeepDataFrame(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, transformation):
        self.transformation = transformation
    
    def fit(self, X, y=None):
        
        if self.transformation is not None:
            self.transformation.fit(X)
        return self
    
    def transform(self, X):
        
        if self.transformation is not None:
        
            X = X.copy()
            cols = X.columns
            indices = X.index

            X = self.transformation.transform(X)

            X = pd.DataFrame(X, columns=cols, index=indices)
        
        return X

In [37]:
#Toto je custom transformator, ktory sluzi na imputaciu kategorickych atributov prostrednictvom bud
#knn imputera alebo iterative imputera
#Najprv sa kategoricky atribut pretypuje na ciselny pomocou ce.OrdinalEncoder, nasledne dojde k imputacii
#a potom sa znova inverznou funkciou vrati do kategorickeho atributu.
#v tejto faze, kedy to uz pouzivame v pipeline, to nie je az take klucove transformovat naspat do kat. atributu
#no vyuzili sme funkciu, ktoru sme predtym vytvorili, kedze sme nasledne imputovany atribut znova analyzovali,
#co bol aj dovod, preco sme pouzili nan inverznu transformaciu
class CustomCatImputing(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, imputer_type="knn"):
        self.ordinal_encoder = None
        self.imputer = None
        self.imputer_type = imputer_type
        
    def fit(self, X, y=None):
        
        X = X.copy()
        
        columns = X.columns.values
        indices = X.index

        #toto sme uz riesili v preprocessing notebooku - chceme, aby nam null hodnoty neinkrementovali encoding hodnoty v strede datasetu,
        #ale aby sme mali urcity range celociselnych hodnot, bez dier, ktore sa pouzije v imputerovi
        #je to klucove aj pri KNN imputerovi, aj pri Iterative imputerovi, lebo pri iterative pracujeme so ciselnymi hodnotami,
        #ktore su kludne aj desatinne, a teda nakoniec sa vysledok imputera rounduje
        #a pri knn sice pracujeme s celocislenymi cislami, no nakoniec imputuje sa priemer ziskany z danych
        #n-susedov, co znova moze byt desatinne cislo
        #takze, aby sme nahodou pri roundovani sa nedostali na encoding hodnotu, ktora patri null hodnote, tak 
        #feedujeme danemu ordinal encodingu hned na zaciatku null hodnoty
        null_values = pd.DataFrame(index=pd.Index([-1]), columns=columns, data=[[np.nan for i in range(len(columns))]])
        X = pd.concat([null_values,X])

        self.ordinal_encoder = ce.ordinal.OrdinalEncoder(handle_missing="return_nan", handle_unknown="return_nan")
        X = self.ordinal_encoder.fit_transform(X)
        
        X = X[1:]
        
        if self.imputer_type == "knn":
            self.imputer = impute.KNNImputer()
            X = self.imputer.fit(X)
        
        elif self.imputer_type == "iterative":

            self.imputer = impute.IterativeImputer(max_iter=20, random_state=42, initial_strategy="most_frequent", 
                                                  min_value=X.min(), max_value=X.max())


            try:
                X = self.imputer.fit(X)
            except (ValueError, np.linalg.LinAlgError):
                print("Jeden error bol trapnuty, kedy funkcii vadili NaNs. Tento error je ale divny, lebo mu to vadi", \
                  "len prvy krat, a potom to uz ide...")
                X = self.imputer.fit(X)
            
        return self
               

    def transform(self, X):
  
        X = X.copy()
        
        indices = X.index
        columns = X.columns
    
        X = self.ordinal_encoder.transform(X)
        X = self.imputer.transform(X).round()
        
        X = pd.DataFrame(data=X, columns=columns, index=indices)
        
        X = self.ordinal_encoder.inverse_transform(X)
        
        return X
    

In [38]:
#ColumnTransformer je sice fajn, ze dokaze konkretne transformacie aplikovat na nami vybrane atributy, no vysledkom daneho 
#ColumnTransformer triedy je numpy array, nie dataframe, co je zle, pokial chceme napriklad viackrat pouzivat 
#ColumnTransformer a podobne.

#Takze tato trieda sluzi ako wrapper okolo ColumnTransformer transformacie, kedy si uchovavame strukturu dataframu, teda
#index, ako aj mena stlpcov, a nasledne, potom, co sa vykona ColumnTransformer, dany output vlozime do dataframu
class WrapColumnTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, column_transformer, keep_original_cols=True, custom_cols_names=None):
        self.column_transformer = column_transformer
        self.keep_original_cols = keep_original_cols
        self.custom_cols_names = custom_cols_names
    
    def fit(self, X, y=None):
        self.column_transformer.fit(X)
        return self
        
            
    def transform(self, X):
        indices = X.index
        
        columns = []
        
        for transf in self.column_transformer.transformers:
            columns += transf[2]
           

        X = X.copy()
        
        X = self.column_transformer.transform(X)

        if self.keep_original_cols == True:
            X = pd.DataFrame(X, columns=columns, index=indices)
        
        elif self.custom_cols_names is not None:
            X = pd.DataFrame(X, columns=self.custom_cols_names, index=indices)
            
        else:
            X = pd.DataFrame(X, index=indices)
        
        return X

Tu uz mame jednotlive skupiny atributov, pre ktore patria rozlicne sposoby aplikacie imputovania missing values.

Konkretne pouzivame: 
- IterativeImputer pre medicinske data
- Nas custom imputer, ktory pouziva KNNImputer pre atributy vztahov, prace, ci education (Dolezite je si vsimnut, ze sa nespracuvavaju spolu, ale oddelene to je zamerne)
- SimpleImuter pre sex a age (znova je potrebne to na 2krat definovat, lebo jedno je spojity atribut, druhe kategoricky) 

Dovodom preco pouzivame imputer aj pri agu, i ked sme uz pouzili rok narodenia na imputaciu daneho atributu, je v pripade, kedy mame zaznam, kde je aj missing value, resp. zly value pre atribut age, ako aj je missing value pre atribut date_of_birth

In [39]:
oxygen_attr = ["mean_oxygen", "std_oxygen", "kurtosis_oxygen", "skewness_oxygen"]
glucose_attr = ["mean_glucose", "std_glucose", "kurtosis_glucose", "skewness_glucose"]

vztahy_attr = ["relationship", "marital-status"]
work_attr = ["workclass", "occupation", "hours-per-week-cat", "income"]
edu_attr = ["education", "education-num"]

impute_col_transf = compose.ColumnTransformer(transformers=[
    ("oxygen_n_glucose_impute", KeepDataFrame(impute.IterativeImputer(max_iter=50)), oxygen_attr + glucose_attr),
    ("vztahy_impute", CustomCatImputing(imputer_type="knn"), vztahy_attr),
    ("work_impute", CustomCatImputing(imputer_type="knn"), work_attr),
    ("edu_impute", CustomCatImputing(imputer_type="knn"), edu_attr),
    ("sex_impute", KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), ["sex"]),
    ("age_impute", KeepDataFrame(impute.SimpleImputer()), ["age"])
])

most_freq_attr = ["sex"] + edu_attr + work_attr + vztahy_attr
mean_attr = ["age"] + oxygen_attr + glucose_attr

simple_impute_col_transf = compose.ColumnTransformer(transformers=[
    ("simple_impute_cat", KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), most_freq_attr),
    ("simple_impute_num", KeepDataFrame(impute.SimpleImputer()), mean_attr)
])

# NonLinearTransf

Tu aplikujeme non-linear transformacie na spojite atributy, podla toho, ktore transformacie najlepsie, v kombinacii s odstranovanim outlierov, zlepsili korelaciu danych atributov ku target atributu. Vyber transformacii bol vykonani v preprocessing notebooku, kde sme skusali jednotlive kombinacie non-linear transformacii s odstranovanim outlierov hladajuc optimalnu konfiguraciu...

In [40]:
#sluzi na vratenie casti datasetu, ktory je outliermi
def identify_outliers(a):
    q25 = a.quantile(0.25)
    q75 = a.quantile(0.75)
    
    iqr = q75-q25
        
    lower = q25 - 1.5 * iqr
    upper = q75 + 1.5 * iqr
    
    return a[(a > upper) | (a < lower)]

#odstranovanie outlierov pomocou takeho stylu, ze dany atribut, ktory riesime, rozdelime na 
#dvoje distribucie, podla target hodnoty, a jednotlivym distribuciach hladame outliery, ktore
#nasledne odstranime
def removing_outliers_per_class(data, column, clz="class"):

    data = data.copy()
    
    data_y0 = data[data[clz] == 0][column]
    data_y1 = data[data[clz] == 1][column]
        
    idx = identify_outliers(data_y0).index.values
    data = data.drop(index=idx)

    idx = identify_outliers(data_y1).index.values
    data = data.drop(index=idx)
    
    return data

Ako mozno vidiet, tu mame zobrazene spojite atributy, ktorym boli aplikovane, resp. neaplikovane non-linear transforamacie, pricom sme pouzili:
- PowerTransformer pre "mean_oxygen", "skewness_oxygen", "kurtosis_oxygen", "skewness_glucose"
- QuantileTransformer pre "age"
- Ziadnu transformaciu pre "std_oxygen", "mean_glucose", "std_glucose", "kurtosis_glucose" a samozrejme ostatne atributy, co su kategorickymi

In [41]:
power_transf_attr = ["mean_oxygen", "skewness_oxygen", "kurtosis_oxygen", "skewness_glucose"]
quant_transf_attr = ["age"]
other_attr = ["std_oxygen", "mean_glucose", "std_glucose", "kurtosis_glucose", "sex", "education"] + vztahy_attr + work_attr

non_linear_transf =  compose.ColumnTransformer(transformers=[
   ("power_transformer", KeepDataFrame(preprocessing.PowerTransformer()), power_transf_attr),
   ("quantile_transformer", KeepDataFrame(preprocessing.QuantileTransformer(output_distribution="normal")), quant_transf_attr),
   ("pass", "passthrough", other_attr)
])

 # Outliers - resampling

Tu sa sustredime na odstranenie outlierov zo spojitych atributov, kedy toto je cast pipelinu, kedy sa pouziva tzv. resampling, a kvoli ktoremu sme museli pouzit specialny pipeline od kniznice imblearn - nejaka podnoz scikit-learnu.

Pre resampling je typicke, ze sa vykonava len pre trenovaci dataset, pri testovacom sa nepouziva.

In [42]:
class OutlierRemoval(base.BaseEstimator):
     
    def __init__(self, columns):
        self.columns = columns
        
    def fit_resample(self, X, y):
        return self.resample(X, y)
                
    def resample(self, X, y):
        
        X = X.copy()
        y = y.copy()
        
        data = X.join(y, how="left")
        clz = "class"
        
        
        for c in self.columns:
            
            data_y0 = data[data[clz] == 0][c]
            data_y1 = data[data[clz] == 1][c]

            idx = identify_outliers(data_y0).index.values
            data = data.drop(index=idx)

            idx = identify_outliers(data_y1).index.values
            data = data.drop(index=idx)
            
        #toto je specialne pre target atribut
        if data[clz].isnull().sum() > 0:
            idx = data[data[clz].isnull()].index.values
            data = data.drop(index=idx)

            
        X = data.drop(columns=["class"])
        y = data["class"]
            
        return X, y

Atributy, pre ktore riesime outliery su medicinske atributy (oxygen + glucose a age.

Takiez vsak pri resamplovani sa berie do uvahy aj samotny target atribut, podla ktoreho sa taktiez odstranuju zaznamy v datasete - pokial target atribut je null.

In [43]:
outlier_columns = oxygen_attr + glucose_attr + ["age"]

# Scaling a Encoding

Posledna etapa pipelinu, kedy uz vsetky atributy su v spravnom tvare, odstranili sme vsetky mozne problemy - missing values, outliers a ine; a uz touto etapou len dalej transformujeme atributy, aby mohli by byt spracovane modelom/estimatorom.

Teda v pripade numerickych atributov sa scaluje (vsade len StandardScaler) - lepsie vysledky pri trenovani modelu, 

a v pripade kategorickych atributov sa ciselne encoduju dane atributy, lebo vacsina modelov dokaze narabat len s ciselnymi atributami. Tu riesime otazku, ktory encoding je vhodny pre aky atribut, no drzali sme sa pravidla, ze nominalne atributy budu zaencodovane OneHotEncoder-om, zatial co ordinalne budu ce.OrdinalEncoder. Dovodom, preco sme si vybrali category_encoders Ordinal encoder a nie zo scikit learnu, je fakt, ze mi vieme urcit konkretny mapping hodnot. Bez toho neviem vobec, ci by sa pouzival ten OrdinalEncoder, kedze my chceme nim ukazat vztah medzi hodnotami daneho kategorickeho atributu, a pokial by mal zle namapovane hodnoty, tak potom dany vztah by sme dobre nezobrazili....

Ako mozno vidiet, vybrali sme:
- OrdinalEncoder pre education, income a hours-per-week - ordinalne atributy
- OneHotEncoder pre ostatne kategoricke atributy - nominalne atributy

In [44]:
scaling = pipeline.Pipeline(steps=[
    ("standard_scaler", preprocessing.StandardScaler())
])

onehot = pipeline.Pipeline(steps=[
    ("one_hot_enc", preprocessing.OneHotEncoder(handle_unknown="ignore"))
])

ord_mapping = [
    {"col": "education", "mapping": {
        "Attending-school": 1, 
        "HS-grad": 2,
        "Edu after HS, no uni": 3,
        "Some-college": 4,
        "Bachelors": 5,
        "Masters/Doctorate": 6}},
    
    {"col": "hours-per-week-cat", "mapping": {
        "<=35": 1,
        "35< hours <=45": 2,
        ">45": 3}},
    
    {"col": "income", "mapping": {
        "<=50K": 1,
        ">50K": 2}}
]


ordinal = pipeline.Pipeline(steps=[
    ("ordinal_enc", ce.OrdinalEncoder(mapping=ord_mapping, handle_unknown="return_nan")),
    ("impute_unknown", impute.SimpleImputer(strategy="most_frequent"))
])


In [45]:
scaling_attr = ["age"] + oxygen_attr + glucose_attr

onehot_attr = ["sex", "marital-status", "relationship", "occupation", "workclass"]

ordinal_attr = ["education", "hours-per-week-cat", "income"]

last_col_transf = compose.ColumnTransformer(transformers=[
    ("num_attr_scaling", scaling, scaling_attr),
    ("cat_attr_onehot_enc", onehot, onehot_attr),
    ("cat_attr_ordinal_enc", ordinal, ordinal_attr)
])

Pipeline ocakava, ze ako posledny krok dostane model, no my este nepracujeme so modelom, my chceme, aby nam ten pipeline vratil dataset, ktory presiel vsetkymi krokmi pipelinu, a tak vytvorime triedu, ktora sa bude hrat na model/estimator, no bude sluzit na vratenie datasetu, ktory prechadza cez dany pipeline.

In [46]:
#tato trieda sa hra na klasifikator, aby mohla byt poslednym krokom v pipeline
#sluzi na to, aby sme vedeli z pipelinu dostat nove X a y, ktore uz mozme rovno hodit do nejakeho modelu
class Return_X_y(base.BaseEstimator, base.ClassifierMixin):
    
    def fit(self, X, y=None):
        
        return self
    
    def fit_predict(self, X, y=None):
        self.fit(X,y)
        return self.predict(X,y)
    
    def predict(self, X, y=None):
        
        if y is None:
            return X.reset_index(drop=True)
        
        y = y.values
        return X.reset_index(drop=True), y


Vytvaram nazvy pre stlpce, ktore bude mat dataframe, ktory pipeline vracia

In [47]:
custom_cols_names = scaling_attr.copy()
pocet_values = [2, 4, 4, 9, 2]

for col, pocet in zip(onehot_attr, pocet_values):
    for i in range(pocet):
        custom_cols_names.append(col+"_"+str(i))
    
custom_cols_names += ordinal_attr
custom_cols_names

['age',
 'mean_oxygen',
 'std_oxygen',
 'kurtosis_oxygen',
 'skewness_oxygen',
 'mean_glucose',
 'std_glucose',
 'kurtosis_glucose',
 'skewness_glucose',
 'sex_0',
 'sex_1',
 'marital-status_0',
 'marital-status_1',
 'marital-status_2',
 'marital-status_3',
 'relationship_0',
 'relationship_1',
 'relationship_2',
 'relationship_3',
 'occupation_0',
 'occupation_1',
 'occupation_2',
 'occupation_3',
 'occupation_4',
 'occupation_5',
 'occupation_6',
 'occupation_7',
 'occupation_8',
 'workclass_0',
 'workclass_1',
 'education',
 'hours-per-week-cat',
 'income']

# Pipeline v akcii

Tu si mozeme skontrolovat, ci vsetko ide, ako ma. Pipeline pri trenovani (fit) vracia tuple (X,y), pri evaluacii (predict), vracia uz len X

In [48]:
MAIN_PIPELINE = imblearn.pipeline.Pipeline(steps=[
        #prvotne preprocessing stepy
        ("feature_removal", preprocessing.FunctionTransformer(remove_useless_features)),
        ("add_oxygen_attr", preprocessing.FunctionTransformer(add_oxygen_features)),
        ("mean_glucose_to_num", preprocessing.FunctionTransformer(repair_mean_glucose)),
        ("string_formatting", preprocessing.FunctionTransformer(string_wrap_formatting)),
        ("bucket_cat_attr", preprocessing.FunctionTransformer(bucket_cat_attr)),

        #imputacia
        ("imputation_stage",  WrapColumnTransformer(impute_col_transf)),

        #non-linear transformacie
        ("non_linear_transform", WrapColumnTransformer(non_linear_transf)),

        #resampling - outlier removal
        ("outlier_removal", OutlierRemoval(outlier_columns)),

        #scaling and encoding - tu uz nechceme, aby si wrapper okolo ColumnTransformer pamatal nazvy stlpcov, kedze ich tam bude ovela viac
        #kvoli OneHot encodingu; avsak si stale chceme pamatat index
        ("scaling_n_encoding_stage", WrapColumnTransformer(last_col_transf, keep_original_cols=False, custom_cols_names=custom_cols_names)),

        #vratenie datasetu po aplikovani krokov tohto pipelinu
        ("return_X_y", Return_X_y())

    ])

In [49]:
MAIN_PIPELINE02 = imblearn.pipeline.Pipeline(steps=[
        #prvotne preprocessing stepy
        ("feature_removal", preprocessing.FunctionTransformer(remove_useless_features)),
        ("add_oxygen_attr", preprocessing.FunctionTransformer(add_oxygen_features)),
        ("mean_glucose_to_num", preprocessing.FunctionTransformer(repair_mean_glucose)),
        ("string_formatting", preprocessing.FunctionTransformer(string_wrap_formatting)),
        ("bucket_cat_attr", preprocessing.FunctionTransformer(bucket_cat_attr)),

        #imputacia
        ("imputation_stage",  WrapColumnTransformer(simple_impute_col_transf)),

        #non-linear transformacie
        ("non_linear_transform", WrapColumnTransformer(non_linear_transf)),

        #resampling - outlier removal
        ("outlier_removal", OutlierRemoval(outlier_columns)),

        #scaling and encoding - tu uz nechceme, aby si wrapper okolo ColumnTransformer pamatal nazvy stlpcov, kedze ich tam bude ovela viac
        #kvoli OneHot encodingu; avsak si stale chceme pamatat index
        ("scaling_n_encoding_stage", WrapColumnTransformer(last_col_transf, keep_original_cols=False, custom_cols_names=custom_cols_names)),

        #vratenie datasetu po aplikovani krokov tohto pipelinu
        ("return_X_y", Return_X_y())

    ])

In [50]:
X,y = one_proper_df(df1, df2, return_X_y=True)
new_data = MAIN_PIPELINE.fit_predict(X,y)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_dataset.iloc[0][attr] = not_null.values[0]


In [51]:
#toto je shape noveho X a y
print(new_data[0].shape)
print(new_data[1].shape)

(3259, 33)
(3259,)


Tu mozno vidiet, ze ked uz netrenujeme model, tak nedochadza ku resamplingu.

In [52]:
MAIN_PIPELINE.predict(X).shape

(3933, 33)

# Ulozenie datasetu

In [53]:
data = pd.DataFrame(new_data[0])
y = pd.Series(new_data[1])

data["class"] = y
data

Unnamed: 0,age,mean_oxygen,std_oxygen,kurtosis_oxygen,skewness_oxygen,mean_glucose,std_glucose,kurtosis_glucose,skewness_glucose,sex_0,...,occupation_5,occupation_6,occupation_7,occupation_8,workclass_0,workclass_1,education,hours-per-week-cat,income,class
0,0.310111,-0.972031,-0.687233,0.765508,0.726842,0.630839,-0.395185,-0.589895,-0.353639,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0
1,-0.203026,-1.170673,-0.669380,1.158494,1.017945,0.054246,1.331188,-0.540950,-1.339714,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,4.0,2.0,1.0,0.0
2,-0.808715,-1.085755,-0.900603,1.527256,1.719060,0.172618,-1.247499,-0.573263,0.585982,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,2.0,2.0,1.0,0.0
3,-1.965275,-1.330474,-0.862698,1.599802,1.590142,1.091691,1.428397,-0.675144,-0.852618,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0
4,-0.439550,-0.161460,-0.178273,0.131985,0.036728,0.054246,1.121753,-0.525753,-0.846696,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,2.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3254,-1.500037,-0.726585,-0.738672,0.692509,0.811700,0.733436,0.768259,-0.634514,-0.599574,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0
3255,-0.025927,-0.249761,-0.510667,0.252255,0.278633,-0.022294,-1.262842,-0.343996,0.541619,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,2.0,2.0,0.0
3256,-0.514980,-0.987045,-0.721506,1.083294,1.055990,0.943414,-0.192728,-0.531069,-0.660675,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,3.0,2.0,0.0
3257,0.958356,1.789480,2.795609,-1.707520,-1.914925,-1.551212,-1.517099,1.725224,1.643703,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,1.0,1.0


In [26]:
data.isnull().sum()

age                   0
mean_oxygen           0
std_oxygen            0
kurtosis_oxygen       0
skewness_oxygen       0
mean_glucose          0
std_glucose           0
kurtosis_glucose      0
skewness_glucose      0
sex_0                 0
sex_1                 0
marital-status_0      0
marital-status_1      0
marital-status_2      0
marital-status_3      0
relationship_0        0
relationship_1        0
relationship_2        0
relationship_3        0
occupation_0          0
occupation_1          0
occupation_2          0
occupation_3          0
occupation_4          0
occupation_5          0
occupation_6          0
occupation_7          0
occupation_8          0
workclass_0           0
workclass_1           0
education             0
hours-per-week-cat    0
income                0
class                 0
dtype: int64

In [27]:
data.to_csv("./preprocessed_data/train.csv")