In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import json
import scipy.stats as stats
import math

import category_encoders as ce

import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms

import vizualizacia_funkcie as visual

from sklearn.experimental import enable_iterative_imputer 
from sklearn import impute 
from sklearn import preprocessing
from sklearn import pipeline
from sklearn import base
from sklearn import compose
from sklearn import feature_selection

from datetime import datetime
from datetime import date

import imblearn

In [2]:
df1 = pd.read_csv("./data/personal_train.csv", index_col=0)
df2 = pd.read_csv("./data/other_train.csv", index_col=0)

# Scikit Pipeline

In [3]:
#funckia, ktora mergne zaznamy, ktore su rovnake
def piece_datarows_together(data):
    
    data = data.copy().set_index("name")
    
    #toto nam vrati dataset, ktory obsahuje vsetky duplikaty, s ktorymi budeme pracovat
    #proste to vrati data, ktore maju index, ktory je v datasete viac ako raz pouzity
    duplicated = data[data.index.duplicated(keep=False)]
    
    index_values = duplicated.index.unique()
    
    #najprv vsetky hodnoty prenesieme do prveho vyskytu zaznamu daneho pacienta v datasete
    for idx in index_values:
        mini_dataset = duplicated.loc[idx] #toto vrati viacero zaznamov s rovnakych idx
        
        #zistim si, ktore atributy su nullove pre presne prvy zaznam a pre konkretne nullove atributy budem nadalej hladat
        #nenullovu hodnotu v ostatnych zaznamoch s rovnakym idx
        missing_mask = mini_dataset.iloc[0].isnull()
        attributes = mini_dataset.columns.values
        missing_attributes = attributes[missing_mask]
        
        #tu replacujem null hodnoty za nenullove
        for attr in missing_attributes:
            not_null = mini_dataset[attr][mini_dataset[attr].notnull()]
            
            if len(not_null) != 0:
                mini_dataset.iloc[0][attr] = not_null.values[0]
        
        
    #teraz uz mozme vymazat vsetky druhe, resp. ostatne zaznamy pacienta
    duplicated_mask = data.index.duplicated(keep="first")
    
    data = data.reset_index()
    duplicated_indices = data.index.values[duplicated_mask]
    
    
    return data.drop(index=duplicated_indices).reset_index(drop=True)


In [4]:
def one_proper_df(df1, df2, return_X_y=True):
    data = df1.drop(columns=["address"]).set_index("name").join(df2.set_index("name"), how="right").reset_index()
    data = piece_datarows_together(data)
    
    if return_X_y == True:
        X = data.drop(columns=["class"])
        y = data["class"]
        return X,y
    
    else:
        return data 
    

In [5]:
def marital_status_categories(row):
    
    ms = row["marital-status"]
        
    if ms is not np.nan and ms not in ("Divorced", "Never-married", "Married-civ-spouse"):
        row["marital-status"] = "Other"
        
    return row

def relationship_categories(row):
    
    rel = row["relationship"]
        
    if rel is not np.nan and rel not in ("Not-in-family", "Husband", "Own-child"):
        row["relationship"] = "Other"
        
    return row

def occupation_categories(row):

    occ = row["occupation"]
    
    if occ is not np.nan and occ not in ("Craft-repair", "Prof-specialty", "Exec-managerial", 
                                         "Adm-clerical", "Sales", "Other-service", "Machine-op-inspct", 
                                         "Transport-moving"):
        
        row["occupation"] = "Other"
        
    return row

def workclass_categories(row):

    wc = row["workclass"]
    
    if wc is not np.nan and wc != "Private":
        row["workclass"] = "Non-private"
        
    return row

def categorize_hours(row):
    
    hour = row["hours-per-week"]
    
    if math.isnan(hour):
        row["hours-per-week-cat"] = math.nan
    elif hour <= 35:
        row["hours-per-week-cat"] = "<=35"
    elif hour <= 45:
        row["hours-per-week-cat"] = "35< hours <=45"
    elif hour > 45:
        row["hours-per-week-cat"] = ">45"        

    return row

def simplify_education(row):
        
    edu = row["education"]
        
    if edu is np.nan:
        row["simple-edu"] = edu
        
    elif re.match("^([0-9][a-zA-Z])|(1[0-2][a-zA-Z])", edu) or edu == "Preschool":
        row["simple-edu"] = "Attending-school"
        
    elif edu in ["Assoc-acdm", "Assoc-voc", "Prof-school"]:
        row["simple-edu"] = "Edu after HS, no uni"
        
    elif edu in ["Masters", "Doctorate"]:
        row["simple-edu"] = "Masters/Doctorate"
        
    else:
        row["simple-edu"] = row["education"]
    
    return row

In [6]:
def date_formatting(data):    
    
    data = data.copy()
    
    import re
    dates = []

    for index,row in data.iterrows():
        dates.append(re.sub('\d', '*',  row['date_of_birth']))

    dates = list(set(dates))
    dates

    import re
    from datetime import datetime

    for index,row in data.iterrows():
        line = row['date_of_birth']
        if re.match(r"^\d{2}-\d{2}-\d{2}$", line):
            regex1 = line[0:2]
            regex2 = line[3:5]
            regex3 = line[6:8]

            verbose = False
            if (verbose == True):
                if (int(regex1) > 31):
                    print('Prvy udaj > 31: ',regex1)
                if (int(regex2) > 31):
                    print('Druhy udaj > 31: ',regex2)
                if (int(regex3) > 31):
                    print('Treti udaj > 31: ',regex3)

    data['date_of_birth'] = data['date_of_birth'].map(lambda x: x[:10])

    import re
    from datetime import datetime

    for index,row in data.iterrows():
        line = row['date_of_birth']
        dateObj = None
        if re.match(r"^\d{2}-", line):
            newDate = '19' + line
            dateObj = datetime.strptime(newDate,'%Y-%m-%d')
        elif re.match(r"^\d{4}-", line):
            dateObj = datetime.strptime(line,'%Y-%m-%d')
        elif re.match(r"^\d{4}/", line):
            dateObj = datetime.strptime(line,'%Y/%m/%d')
        elif re.match(r"^\d{2}/", line):
            dateObj = datetime.strptime(line,'%d/%m/%Y')
        data.at[index,'date_of_birth'] = dateObj.strftime('%d-%m-%Y')
    
    return data


In [7]:
def remove_useless_features(X):
    
    X = X.copy()
    
    useless_cols = ["name", "race", "pregnant", "capital-loss", "capital-gain", "fnlwgt", "native-country", "address"]
    
    return X.drop(columns=useless_cols)

def add_oxygen_features(X):
    X = X.copy()
    X = X.apply(get_oxygen_stats, axis=1)
    return X.drop(columns=["medical_info"])
    
def get_oxygen_stats(row):
    
    string = row["medical_info"]
    
    if string is np.nan:
        return row
    
    string = string.replace("\'", "\"")
    di = json.loads(string)
    
    for k in di.keys():
        row[k] = float(di[k])
        
    return row

def string_wrap_formatting(X):
    X = X.copy()
    return X.apply(string_formatting, axis=0)

def string_formatting(col):
    
    if col.dtype == "O":
        col = col.apply(lambda row: row.strip() if row is not np.nan else row)
        col = col.apply(lambda row: np.nan if row is not np.nan and row == "?" else row)
    
    return col

def bucket_cat_attr(X):
   
    X = X.copy()
    
    X = X.apply(marital_status_categories, axis=1)
    X = X.apply(relationship_categories, axis=1)
    X = X.apply(occupation_categories, axis=1)
    X = X.apply(workclass_categories, axis=1)
    
    X["hours-per-week-cat"] = 0
    X = X.apply(categorize_hours, axis=1)
    X = X.drop(columns=["hours-per-week"])
    
    return X

def repair_mean_glucose(X):
    
    X = X.copy()
    X["mean_glucose"] = pd.to_numeric(X['mean_glucose'], errors= 'coerce')
    return X

def prepare_age(X):
    X = X.copy()
    X = date_formatting(X)
    
    X = X.apply(make_bs_age_nan, axis=1)
    X = X.apply(calculate_age, axis=1)
    
    X = X.drop(columns=["date_of_birth"])
    
    return X
    
def make_bs_age_nan(row):
    
    age = row["age"]
    
    if age is np.nan:
        return row
    
    if age <= 0 or age >= 100:
        row["age"] = np.nan
        
    return row

def calculate_age(row):
    
    if row["age"] is np.nan or math.isnan(row["age"]):
    
        born = row["date_of_birth"]

        born = datetime.strptime(born, "%d-%m-%Y").date()
        today = date.today()
        
        row["age"] = today.year - born.year - ((today.month, today.day) < (born.month, born.day))
 
        
    return row

In [8]:
prvy_pipeline = pipeline.Pipeline(steps=[
    ("feature_removal", preprocessing.FunctionTransformer(remove_useless_features)),
    ("add_oxygen_attr", preprocessing.FunctionTransformer(add_oxygen_features)),
    ("mean_glucose_to_num", preprocessing.FunctionTransformer(repair_mean_glucose)),
    ("string_formatting", preprocessing.FunctionTransformer(string_wrap_formatting)),
    ("bucket_cat_attr", preprocessing.FunctionTransformer(bucket_cat_attr)),
    ("preprocess_age", preprocessing.FunctionTransformer(prepare_age))
    
])

# Imputing

In [9]:
#tato funkcia je ekvivalentna s hociktorym inym transformatorom v scikit-learne, ci uz na imputaciu, transformaciu, scaling a ine veci
#jediny rozdiel je, ze to nevrati numpy array, ale DataFrame, a vdaka tomu si uchvoavam nazvy stlpcov
class KeepDataFrame(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, transformation):
        self.transformation = transformation
    
    def fit(self, X, y=None):
        
        if self.transformation is not None:
            self.transformation.fit(X)
        return self
    
    def transform(self, X):
        
        if self.transformation is not None:
        
            X = X.copy()
            cols = X.columns
            indices = X.index

            X = self.transformation.transform(X)

            X = pd.DataFrame(X, columns=cols, index=indices)
        
        return X

In [10]:
class CustomCatImputing(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, imputer_type="knn"):
        self.ordinal_encoder = None
        self.imputer = None
        self.imputer_type = imputer_type
        
    def fit(self, X, y=None):
        
        print("FIT")
        
        X = X.copy()
        
        columns = X.columns.values
        indices = X.index

        null_values = pd.DataFrame(index=pd.Index([-1]), columns=columns, data=[[np.nan for i in range(len(columns))]])
        X = pd.concat([null_values,X])

        self.ordinal_encoder = ce.ordinal.OrdinalEncoder(handle_missing="return_nan", handle_unknown="return_nan")
        X = self.ordinal_encoder.fit_transform(X)
        
        X = X[1:]
        
        if self.imputer_type == "knn":
            self.imputer = impute.KNNImputer()
            X = self.imputer.fit(X)
        
        elif self.imputer_type == "iterative":

            self.imputer = impute.IterativeImputer(max_iter=20, random_state=42, initial_strategy="most_frequent", 
                                                  min_value=X.min(), max_value=X.max())


            try:
                X = self.imputer.fit(X)
            except (ValueError, np.linalg.LinAlgError):
                print("Jeden error bol trapnuty, kedy funkcii vadili NaNs. Tento error je ale divny, lebo mu to vadi", \
                  "len prvy krat, a potom to uz ide...")
                X = self.imputer.fit(X)
            
        return self
               

    def transform(self, X):
        print("TRANSFORM")
        
        X = X.copy()
        
        indices = X.index
        columns = X.columns
    
        X = self.ordinal_encoder.transform(X)
        X = self.imputer.transform(X).round()
        
        X = pd.DataFrame(data=X, columns=columns, index=indices)
        
        X = self.ordinal_encoder.inverse_transform(X)
        
        return X
    

In [11]:
class WrapColumnTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, column_transformer, keep_original_cols=True):
        self.column_transformer = column_transformer
        self.keep_original_cols = keep_original_cols
    
    def fit(self, X, y=None):
        self.column_transformer.fit(X)
        return self
        
            
    def transform(self, X):
        indices = X.index
        
        columns = []
        
        for transf in self.column_transformer.transformers:
            columns += transf[2]
           

        X = X.copy()
        
        X = self.column_transformer.transform(X)

        if self.keep_original_cols == True:
            X = pd.DataFrame(X, columns=columns, index=indices)
        
        else:
            X = pd.DataFrame(X, index=indices)
        
        return X

In [12]:
oxygen_attr = ["mean_oxygen", "std_oxygen", "kurtosis_oxygen", "skewness_oxygen"]
glucose_attr = ["mean_glucose", "std_glucose", "kurtosis_glucose", "skewness_glucose"]

vztahy_attr = ["relationship", "marital-status"]
work_attr = ["workclass", "occupation", "hours-per-week-cat", "income"]
edu_attr = ["education", "education-num"]

impute_col_transf = compose.ColumnTransformer(transformers=[
    ("oxygen_n_glucose_impute", KeepDataFrame(impute.IterativeImputer(max_iter=50)), oxygen_attr + glucose_attr),
    ("vztahy_impute", CustomCatImputing(imputer_type="knn"), vztahy_attr),
    ("work_impute", CustomCatImputing(imputer_type="knn"), work_attr),
    ("edu_impute", CustomCatImputing(imputer_type="knn"), edu_attr),
    ("sex_impute", KeepDataFrame(impute.SimpleImputer(strategy="most_frequent")), ["sex"]),
    ("age_impute", KeepDataFrame(impute.SimpleImputer()), ["age"])
])

# NonLinearTransf

In [13]:
def identify_outliers(a):
    q25 = a.quantile(0.25)
    q75 = a.quantile(0.75)
    
    iqr = q75-q25
        
    lower = q25 - 1.5 * iqr
    upper = q75 + 1.5 * iqr
    
    return a[(a > upper) | (a < lower)]

def removing_outliers_per_class(data, column, clz="class"):

    data = data.copy()
    
    data_y0 = data[data[clz] == 0][column]
    data_y1 = data[data[clz] == 1][column]
        
    idx = identify_outliers(data_y0).index.values
    data = data.drop(index=idx)

    idx = identify_outliers(data_y1).index.values
    data = data.drop(index=idx)
    
    return data

In [14]:
power_transf_attr = ["mean_oxygen", "skewness_oxygen", "kurtosis_oxygen", "skewness_glucose"]
quant_transf_attr = ["age"]
other_attr = ["std_oxygen", "mean_glucose", "std_glucose", "kurtosis_glucose", "sex", "education"] + vztahy_attr + work_attr

non_linear_transf =  compose.ColumnTransformer(transformers=[
   ("power_transformer", KeepDataFrame(preprocessing.PowerTransformer()), power_transf_attr),
   ("quantile_transformer", KeepDataFrame(preprocessing.QuantileTransformer(output_distribution="normal")), quant_transf_attr),
   ("pass", "passthrough", other_attr)
])

 # Outliers - resampling

In [15]:
class OutlierRemoval(base.BaseEstimator):
     
    def __init__(self, columns):
        self.columns = columns
        
    def fit_resample(self, X, y):
        print("fit_resample")
        return self.resample(X, y)
                
    def resample(self, X, y):
        print("resample")
        
        X = X.copy()
        y = y.copy()
        
        data = X.join(y, how="left")
        clz = "class"
        
        
        for c in self.columns:
            
            data_y0 = data[data[clz] == 0][c]
            data_y1 = data[data[clz] == 1][c]

            idx = identify_outliers(data_y0).index.values
            data = data.drop(index=idx)

            idx = identify_outliers(data_y1).index.values
            data = data.drop(index=idx)
            
        #toto je specialne pre target atribut
        if data[clz].isnull().sum() > 0:
            idx = data[data[clz].isnull()].index.values
            data = data.drop(index=idx)

            
        X = data.drop(columns=["class"])
        y = data["class"]
            
        return X, y

In [16]:
outlier_columns = oxygen_attr + glucose_attr + ["age"]

# Scaling a Encoding

In [17]:
scaling = pipeline.Pipeline(steps=[
    ("standard_scaler", preprocessing.StandardScaler())
])

onehot = pipeline.Pipeline(steps=[
    ("one_hot_enc", preprocessing.OneHotEncoder(handle_unknown="ignore"))
])

ord_mapping = [
    {"col": "education", "mapping": {
        "Attending-school": 1, 
        "HS-grad": 2,
        "Edu after HS, no uni": 3,
        "Some-college": 4,
        "Bachelors": 5,
        "Masters/Doctorate": 6}},
    
    {"col": "hours-per-week-cat", "mapping": {
        "<=35": 1,
        "35< hours <=45": 2,
        ">45": 3}},
    
    {"col": "income", "mapping": {
        "<=50K": 1,
        ">50K": 2}}
]


ordinal = pipeline.Pipeline(steps=[
    ("ordinal_enc", ce.OrdinalEncoder(mapping=ord_mapping, handle_unknown="return_nan")),
    ("impute_unknown", impute.SimpleImputer(strategy="most_frequent"))
])


In [18]:
scaling_attr = ["age"] + oxygen_attr + glucose_attr

onehot_attr = ["sex", "marital-status", "relationship", "occupation", "workclass"]

ordinal_attr = ["education", "hours-per-week-cat", "income"]

last_col_transf = compose.ColumnTransformer(transformers=[
    ("num_attr_scaling", scaling, scaling_attr),
    ("cat_attr_onehot_enc", onehot, onehot_attr),
    ("cat_attr_ordinal_enc", ordinal, ordinal_attr)
])

In [30]:
#tato trieda sa hra na klasifikator, aby mohla byt poslednym krokom v pipeline
#sluzi na to, aby sme vedeli z pipelinu dostat nove X a y, ktore uz mozme rovno hodit do nejakeho modelu
class Return_X_y(base.BaseEstimator, base.ClassifierMixin):
    
    def fit(self, X, y=None):
        
        return self
    
    def fit_predict(self, X, y=None):
        self.fit(X,y)
        return self.predict(X,y)
    
    def predict(self, X, y=None):
        if y is None:
            return X
        
        y = y.values
        return X,y

In [31]:
MAIN_PIPELINE = imblearn.pipeline.Pipeline(steps=[
    ("feature_removal", preprocessing.FunctionTransformer(remove_useless_features)),
    ("add_oxygen_attr", preprocessing.FunctionTransformer(add_oxygen_features)),
    ("mean_glucose_to_num", preprocessing.FunctionTransformer(repair_mean_glucose)),
    ("string_formatting", preprocessing.FunctionTransformer(string_wrap_formatting)),
    ("bucket_cat_attr", preprocessing.FunctionTransformer(bucket_cat_attr)),
    ("imputation_stage",  WrapColumnTransformer(impute_col_transf)),
    ("non_linear_transform", WrapColumnTransformer(non_linear_transf)),
    ("outlier_removal", OutlierRemoval(outlier_columns)),
    ("scaling_n_encoding_stage", WrapColumnTransformer(last_col_transf, keep_original_cols=False)),
    ("var_thresh", feature_selection.VarianceThreshold(threshold=(0.9*(1-0.9)))),
    ("return_X_y", Return_X_y())
    
])

In [32]:
X,y = one_proper_df(df1, df2, return_X_y=True)

new_data = MAIN_PIPELINE.fit_predict(X,y)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_dataset.iloc[0][attr] = not_null.values[0]


FIT
TRANSFORM
FIT
TRANSFORM
FIT
TRANSFORM
TRANSFORM
TRANSFORM
TRANSFORM
fit_resample
resample


In [22]:
new_data[0].shape

(3263, 29)

# Ulozenie datasetu

In [33]:
new_data[1]

array([0., 0., 0., ..., 0., 1., 1.])

In [35]:
data = pd.DataFrame(new_data[0])
y = pd.Series(new_data[1])

data["class"] = y
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,class
0,0.311276,-0.977285,-0.686528,0.764774,0.725310,0.602871,-0.382023,-0.587876,-0.348529,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0
1,-0.202668,-1.180374,-0.668629,1.159486,1.018022,0.577096,1.298356,-0.538880,-1.335921,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,4.0,2.0,1.0,0.0
2,-0.809311,-1.093507,-0.900445,1.530003,1.723676,0.163982,-1.211630,-0.571226,0.591841,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0
3,-1.967692,-1.344030,-0.862443,1.602909,1.593859,1.044280,1.392975,-0.673214,-0.848126,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,1.0,0.0
4,-0.439565,-0.153038,-0.176263,0.128825,0.032148,0.611151,1.094501,-0.523667,-0.842197,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,2.0,3.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,-1.501722,-0.726906,-0.738099,0.691472,0.810619,0.701140,0.750425,-0.632541,-0.594752,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,1.0,0.0
3259,-0.025291,-0.242444,-0.509509,0.249518,0.274989,-0.022706,-1.226564,-0.341722,0.547458,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,5.0,2.0,2.0,0.0
3260,-0.515114,-0.992622,-0.720888,1.083943,1.056290,0.902260,-0.184959,-0.528989,-0.655931,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,5.0,3.0,2.0,0.0
3261,0.960542,1.784448,2.805246,-1.714086,-1.918628,-1.487120,-1.474046,1.729647,1.649441,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,1.0,1.0


In [40]:
data.to_csv("./preprocessed_data/train.csv")