In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
import json
import scipy.stats as stats
import math

import category_encoders as ce

import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms

import vizualizacia_funkcie as visual

from sklearn import preprocessing
from sklearn import pipeline
from sklearn import base
from sklearn import compose
from sklearn import feature_selection
from sklearn import model_selection

from datetime import datetime
from datetime import date

import imblearn
import preprocessing_pipeline as prep_pip

In [2]:
df1 = pd.read_csv("./data/personal_train.csv", index_col=0)
df2 = pd.read_csv("./data/other_train.csv", index_col=0)

print("Dlzka personal_train", df1.shape[0])
print("Pocet unique pacientov", df1["name"].nunique())
print("Rozdiel medzi velkostou df a poctu pacientov", df1.shape[0] - df1["name"].nunique())
print()

print("Dlzka other_train", df2.shape[0])
print("Pocet unique pacientov", df2["name"].nunique())
print("Rozdiel medzi velkostou df a poctu pacientov", df2.shape[0] - df2["name"].nunique())
print()

X1,y1 = prep_pip.one_proper_df(df1, df2)
print("Dlzka noveho dataframu", X1.shape[0])

Dlzka personal_train 3933
Pocet unique pacientov 3933
Rozdiel medzi velkostou df a poctu pacientov 0

Dlzka other_train 3983
Pocet unique pacientov 3933
Rozdiel medzi velkostou df a poctu pacientov 50

Dlzka noveho dataframu 3933


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mini_dataset.iloc[0][attr] = not_null.values[0]


In [3]:
df1 = pd.read_csv("./data/personal_valid.csv", index_col=0)
df2 = pd.read_csv("./data/other_valid.csv", index_col=0)

print("Dlzka personal_valid", df1.shape[0])
print("Pocet unique pacientov", df1["name"].nunique())
print("Rozdiel medzi velkostou df a poctu pacientov", df1.shape[0] - df1["name"].nunique())
print()

print("Dlzka other_valid", df2.shape[0])
print("Pocet unique pacientov", df2["name"].nunique())
print("Rozdiel medzi velkostou df a poctu pacientov", df2.shape[0] - df2["name"].nunique())
print()

X2,y2 = prep_pip.one_proper_df(df1, df2)
print("Dlzka noveho dataframu", X2.shape[0])

Dlzka personal_valid 1311
Pocet unique pacientov 1311
Rozdiel medzi velkostou df a poctu pacientov 0

Dlzka other_valid 1361
Pocet unique pacientov 1311
Rozdiel medzi velkostou df a poctu pacientov 50

Dlzka noveho dataframu 1311


In [4]:
X1["class"] = y1
X2["class"] = y2

In [5]:
data = pd.concat([X1,X2])
data.shape

(5244, 25)

Je potrebne vymazat z datasetu vsetky data, kde target attribute - class je NaN

In [6]:
data = data.reset_index(drop=True)
indices = data.loc[data["class"].isnull()].index.values

data = data.drop(index=indices)
data = data.reset_index(drop=True)

data.shape

(5227, 25)

Toto je tam len kvoli tomu, aby sme vobec vedeli deklarovat dany pipeline - Teda je potrebne tam dat nejaky krok, ktory hned vymazeme.

Tu sa pridaju kroky z preprocessingu

In [7]:
prep_steps = prep_pip.get_preprocessing_steps()

pip = imblearn.pipeline.Pipeline(steps=[
    step for step in prep_steps
])
    
from sklearn import tree
pip.steps.append(
    ("classifier", tree.DecisionTreeClassifier())
)

Tu sa specialne volaju parametre, kedze sa nachadzaju v pipeline...

In [8]:
param_grid = {
    "classifier__criterion" : ["gini", "entropy"],
    "classifier__max_depth" :  [3,5,10]
}

Ked sa pouzije n_jobs=-1, co by malo byt tolko procesov paralelne ako pocet threadov, tak to dava NaN vysledky :(

Takze budeme musiet to mat pomale...

In [9]:
grid_search = model_selection.GridSearchCV(pip, param_grid, scoring="accuracy", cv=10, verbose=1, refit=False)

In [10]:
X = data.drop(columns=["class"])
y = data["class"]

grid_search.fit(X,y)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  3.0min finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('feature_removal',
                                        FunctionTransformer(func=<function remove_useless_features at 0x000001EF961CD1F0>)),
                                       ('add_oxygen_attr',
                                        FunctionTransformer(func=<function add_oxygen_features at 0x000001EF961CD280>)),
                                       ('mean_glucose_to_num',
                                        FunctionTransformer(func=<function repair_mean_glucose at 0x000001EF961CD550>)),
                                       ('string_...
                                                                                 'occupation_1',
                                                                                 'occupation_2',
                                                                                 'occupation_3',
                                                                                 'occupat

Tu su vysledky...

In [11]:
grid_search.cv_results_

{'mean_fit_time': array([2.91839316, 2.56954417, 2.58535528, 2.7628459 , 2.74674795,
        2.37328324]),
 'std_fit_time': array([0.16967288, 0.13973361, 0.12681291, 0.13668663, 0.18467865,
        0.22084841]),
 'mean_score_time': array([0.34487875, 0.27607479, 0.28568659, 0.30517044, 0.30639613,
        0.27481587]),
 'std_score_time': array([0.05809854, 0.03113603, 0.0280337 , 0.02876131, 0.02755588,
        0.02557769]),
 'param_classifier__criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__max_depth': masked_array(data=[3, 5, 10, 3, 5, 10],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__criterion': 'gini', 'classifier__max_depth': 3},
  {'classifier__criterion': 'gini', 'classifier__max_depth': 5},

In [12]:
grid_search.best_score_

0.8901811681794539

In [13]:
grid_search.best_params_

{'classifier__criterion': 'gini', 'classifier__max_depth': 5}