In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import seaborn
import matplotlib.pyplot as plt
import matplotlib
import re

In [3]:
%matplotlib inline

In [4]:
import pandas as pd 
import numpy as np
from collections import defaultdict

In [5]:
from tqdm import tqdm
from tqdm import tqdm_notebook

In [6]:
from xgboost import XGBClassifier

In [7]:
from sklearn.linear_model import LogisticRegression

In [85]:
from sklearn.metrics import roc_curve

In [8]:
from model import *
from transformers import *
from quality import *

In [9]:
# Load the semicolon-separated train and test tables.
train = pd.read_csv("ml5/train.csv", sep=";")
test = pd.read_csv("ml5/test.csv", sep=";")

# In the test table a missing value in these columns is checked for as the
# literal string "None"; map that marker to NaN and cast everything else to int.
for _col in ("smoke", "alco", "active"):
    test[_col] = test[_col].apply(lambda v: np.nan if v == "None" else int(v))

In [10]:
def clone_nans(df, features):
    """Replace every row that has a NaN in ``features`` by two filled-in copies.

    For each feature in turn, rows where the feature is null are duplicated:
    one copy gets the value 0.0 and the other 1.0. Rows that already have a
    value are kept as they are. Because features are processed sequentially,
    a row missing k of the listed features ends up as 2**k rows.

    df: input DataFrame (not modified; the pieces are copied).
    features: iterable of column names to expand.

    Returns the expanded DataFrame. Row order is: complete rows, then the
    0-filled copies, then the 1-filled copies (per feature); the original index
    labels are kept, so they repeat for cloned rows. With an empty ``features``
    the input object itself is returned.
    """
    for feature in features:
        missing = df[feature].isnull()

        known = df[~missing].copy()
        filled_zero = df[missing].copy()
        filled_one = filled_zero.copy()

        n_missing = filled_zero.shape[0]
        filled_zero[feature] = np.zeros(n_missing)
        filled_one[feature] = np.ones(n_missing)

        df = pd.concat([known, filled_zero, filled_one])

    return df
# Expand the test set: each row with a missing smoke/alco/active flag is
# replaced by one copy per 0/1 value (see clone_nans), so ids may repeat.
test2 = clone_nans(test, ["smoke", "alco", "active"])

In [11]:
# Pipeline object over the train table and the NaN-expanded test table.
# Model comes from one of the star imports; "cardio" is presumably the name
# of the target column - confirm against Model's signature.
ml = Model(train, test2, "cardio")

In [12]:
def ap_lo_func(x, axis):
    """Rescale a single raw ``ap_lo`` reading onto a common scale.

    Rules, checked in this order:
      * negative      -> sign flipped (same treatment ap_hi_func gives negatives;
                         previously such values fell into the ``x < 20`` branch
                         and were multiplied by 10, e.g. -70 became -700)
      * exactly 0     -> 40
      * below 20      -> multiplied by 10
      * below 300     -> returned unchanged
      * below 2000    -> divided by 10
      * anything else -> divided by 100

    x: one numeric reading.
    axis: unused; accepted so the signature matches the other functions passed
        to custom_transformer (presumably its calling convention - confirm).

    Returns the rescaled reading. NaN fails every comparison and comes back as
    NaN through the final branch.
    """
    if x < 0:
        return - x
    elif x == 0:
        return 40
    elif x < 20:
        return 10 * x
    elif x < 300:
        return x
    elif x < 2000:
        # "* 1.0" forces float division even for integer input.
        return x * 1.0 / 10
    else:
        return x * 1.0 / 100

In [13]:
def ap_hi_func(x, axis):
    """Rescale a single raw ``ap_hi`` reading onto a common scale.

    Negative readings have their sign flipped, readings below 20 are multiplied
    by 10, readings below 500 are kept, readings up to and including 2000 are
    divided by 10, and anything larger is divided by 100.

    x: one numeric reading.
    axis: unused; kept so the signature matches the other transformer functions.
    """
    if x < 0:
        return -x
    if x < 20:
        return x * 10
    if x < 500:
        return x
    # Float divisors keep the result a float even for integer input.
    divisor = 10.0 if x <= 2000 else 100.0
    return x / divisor

In [14]:
def days_to_year(x, axis):
    """Convert a duration in days to years, using a fixed 365-day year.

    x: number of days.
    axis: unused; kept so the signature matches the other transformer functions.
    """
    days_per_year = 365.0  # float so integer input still divides exactly
    return x / days_per_year

In [32]:
# Remove any earlier registration first so this cell can be re-run
# (presumed intent of the remove/add pairing - confirm in Model).
ml.remove_step("aplopr")
# Register step "aplopr": presumably applies ap_lo_func to the "ap_lo" column
# and stores the result under "aplopr" - confirm in custom_transformer.
ml.add_step("aplopr", custom_transformer(ap_lo_func, ["ap_lo"], "aplopr"))

In [33]:
# Same remove-then-add pattern for the ap_hi clean-up step.
ml.remove_step("aphipr")
# Register step "aphipr": presumably applies ap_hi_func to the "ap_hi" column
# and stores the result under "aphipr" - confirm in custom_transformer.
ml.add_step("aphipr", custom_transformer(ap_hi_func, ["ap_hi"], "aphipr"))

In [34]:
# Same remove-then-add pattern for the age conversion step.
ml.remove_step("year")
# Register step "year": presumably applies days_to_year to the "age" column
# (so "age" would be in days) and stores the result under "year" - confirm.
ml.add_step("year", custom_transformer(days_to_year, ["age"], "year"))

In [35]:
# Execute the registered "aplopr" step; an "aplopr" column shows up in
# ml.train.columns afterwards.
ml.compute_step("aplopr")

In [36]:
# Execute the registered "aphipr" step; an "aphipr" column shows up in
# ml.train.columns afterwards.
ml.compute_step("aphipr")

In [37]:
# Execute the registered "year" step; a "year" column shows up in
# ml.train.columns afterwards.
ml.compute_step("year")

In [38]:
# Sanity check: list the training columns to confirm the three engineered
# columns (aplopr, aphipr, year) were appended to the raw ones.
ml.train.columns

Index([u'id', u'age', u'gender', u'height', u'weight', u'ap_hi', u'ap_lo',
       u'cholesterol', u'gluc', u'smoke', u'alco', u'active', u'cardio',
       u'aplopr', u'aphipr', u'year'],
      dtype='object')

In [39]:
# Base feature list fed to the models: cleaned versions of age and the two
# pressure columns plus the remaining raw columns (id and target excluded).
fl = [
    "year", "gender", "height", "weight",   # demographics / body measurements
    "aphipr", "aplopr",                     # cleaned ap_hi / ap_lo
    "cholesterol", "gluc",                  # lab-style columns
    "smoke", "alco", "active",              # 0/1 flags
]

In [40]:
# Hyper-parameters for the first logistic-regression stacker (L2 penalty,
# C=0.4, all cores); presumably forwarded to sklearn's LogisticRegression.
lrclf_params_1 = {"C": 0.4, "n_jobs" : -1, "penalty": "l2"}
# On a fresh pipeline this prints "Trying to remove undefined step lr_s1."
# instead of raising, which is what makes the remove/add pairing re-runnable.
ml.remove_step("lr_s1")
# Register stacking step "lr_s1" on the base features `fl` with target "cardio".
# The literal 10 is presumably the number of folds - confirm in logreg_stacking.
ml.add_step("lr_s1", logreg_stacking(fl, "cardio", lrclf_params_1, 10, "lr_s1"), prerequisites = ["source"])

Trying to remove undefined step lr_s1.


In [41]:
# Fit/apply the "lr_s1" stacking step registered above.
ml.compute_step("lr_s1")




In [42]:
# Score the "lr_s1" column against "cardio" on the train split.
# NOTE(review): the recorded value (1.17) is above ln(2) ~= 0.693, the log loss
# of always predicting 0.5 on a binary target, and far above the ~0.54 the
# xgboost step logs later. Verify that lr_s1 holds probabilities and that
# measure_logloss computes the standard log loss.
measure_logloss(ml, "train", "cardio", "lr_s1")

1.1704722949703514

In [43]:
# Remove-then-add so the cell is re-runnable (prints a notice if undefined).
ml.remove_step("mulf1")
# Register generated-feature step "mulf1". Judging by the head() preview later
# in the notebook it yields 15 columns mulf1_1..mulf1_15 whose values match
# each of the five inputs raised to the powers -2, -1 and 2. The third
# positional argument (1) is presumably how many columns are combined per
# feature - confirm in multiplication_transformer.
ml.add_step("mulf1", multiplication_transformer(["year", "height", "weight", "aphipr", "aplopr"], [-2, -1, 2], 1, "mulf1"), prerequisites = ["year", "aphipr", "aplopr"])

Trying to remove undefined step mulf1.


In [44]:
# Remove-then-add so the cell is re-runnable (prints a notice if undefined).
ml.remove_step("mulf2")
# Register step "mulf2" over the same five columns with powers [-2, -1, 1, 2];
# the third argument (2) presumably requests products of two columns - confirm
# in multiplication_transformer.
ml.add_step("mulf2", multiplication_transformer(["year", "height", "weight", "aphipr", "aplopr"], [-2, -1, 1, 2], 2, "mulf2"), prerequisites = ["year", "aphipr", "aplopr"])

Trying to remove undefined step mulf2.


In [45]:
# Remove-then-add so the cell is re-runnable (prints a notice if undefined).
ml.remove_step("mulf3")
# Register step "mulf3" over the same five columns with powers [-1, 1]; the
# third argument (3) presumably requests products of three columns - confirm
# in multiplication_transformer.
ml.add_step("mulf3", multiplication_transformer(["year", "height", "weight", "aphipr", "aplopr"], [-1, 1], 3, "mulf3"), prerequisites = ["year", "aphipr", "aplopr"])

Trying to remove undefined step mulf3.


In [46]:
# Materialise the "mulf1" generated features.
ml.compute_step("mulf1")

In [47]:
# Materialise the "mulf2" generated features.
ml.compute_step("mulf2")

In [48]:
# Materialise the "mulf3" generated features.
ml.compute_step("mulf3")

In [50]:
# Extended feature list: the base features plus every column produced by the
# three mulf steps (they must have been computed already).
fl_lr2 = fl + ml.get_step_columns(["mulf1", "mulf2", "mulf3"])
# NOTE(review): re-assigns lrclf_params_1 with the same value it was given when
# the lr_s1 step was registered; the duplication is redundant but harmless.
lrclf_params_1 = {"C": 0.4, "n_jobs" : -1, "penalty": "l2"}

In [51]:
# Remove-then-add so the cell is re-runnable (prints a notice if undefined).
ml.remove_step("lr_s2")
# Second stacker: same hyper-parameters as lr_s1 (C=0.4) but trained on the
# extended feature list, hence the dependency on the three mulf steps.
ml.add_step("lr_s2", logreg_stacking(fl_lr2, "cardio", lrclf_params_1, 10, "lr_s2"), prerequisites = ["mulf1", "mulf2", "mulf3"])

Trying to remove undefined step lr_s2.


In [52]:
# NOTE(review): identical to the earlier fl_lr2 assignment; kept so this cell
# can run on its own, but it is otherwise redundant.
fl_lr2 = fl + ml.get_step_columns(["mulf1", "mulf2", "mulf3"])
# Variant hyper-parameters with a smaller C (0.2 instead of 0.4); if these are
# passed to sklearn's LogisticRegression that means stronger regularisation.
lrclf_params_2 = {"C": 0.2, "n_jobs" : -1, "penalty": "l2"}

In [53]:
# Remove-then-add so the cell is re-runnable (prints a notice if undefined).
ml.remove_step("lr_s3")
# Third stacker: extended feature list with the C=0.2 parameter set.
ml.add_step("lr_s3", logreg_stacking(fl_lr2, "cardio", lrclf_params_2, 10, "lr_s3"), prerequisites = ["mulf1", "mulf2", "mulf3"])

Trying to remove undefined step lr_s3.


In [54]:
# Fit both extended-feature stackers. lr_s3 runs before lr_s2; both list only
# the mulf steps as prerequisites, so the order should not matter.
ml.compute_step("lr_s3")
ml.compute_step("lr_s2")







In [58]:
# Preview the first rows of the columns generated by step "mulf1".
# Here get_step_columns receives a single name rather than a list, unlike the
# calls that build fl_lr2; the displayed output shows that this form works.
ml.train[ml.get_step_columns("mulf1")].head()

Unnamed: 0,mulf1_15,mulf1_14,mulf1_13,mulf1_12,mulf1_11,mulf1_10,mulf1_9,mulf1_8,mulf1_3,mulf1_2,mulf1_1,mulf1_7,mulf1_6,mulf1_5,mulf1_4
0,6400.0,0.0125,0.000156,12100.0,0.009091,8.3e-05,3844.0,0.016129,2539.331574,0.019845,0.000394,0.00026,28224.0,0.005952,3.5e-05
1,8100.0,0.011111,0.000123,19600.0,0.007143,5.1e-05,7225.0,0.011765,3071.285299,0.018044,0.000326,0.000138,24336.0,0.00641,4.1e-05
2,4900.0,0.014286,0.000204,16900.0,0.007692,5.9e-05,4096.0,0.015625,2669.066984,0.019356,0.000375,0.000244,27225.0,0.006061,3.7e-05
3,10000.0,0.01,0.0001,22500.0,0.006667,4.4e-05,6724.0,0.012195,2331.170043,0.020712,0.000429,0.000149,28561.0,0.005917,3.5e-05
4,3600.0,0.016667,0.000278,10000.0,0.01,0.0001,3136.0,0.017857,2291.917253,0.020888,0.000436,0.000319,24336.0,0.00641,4.1e-05


In [55]:
# Score "lr_s2" on the train split (recorded: 1.1625, marginally below lr_s1).
# NOTE(review): still above ln(2) ~= 0.693 - see whether the metric or the
# stacked column is on the expected scale.
measure_logloss(ml, "train", "cardio", "lr_s2")

1.1625293957026694

In [56]:
# Score "lr_s3" on the train split (recorded: 1.1606, the lowest of the three
# logistic stackers, though the differences are in the third decimal).
measure_logloss(ml, "train", "cardio", "lr_s3")

1.1605889333734691

In [88]:
# Remove-then-add so the cell is re-runnable.
ml.remove_step("xgb1")
# Register an xgboost prediction step on the extended feature list. The empty
# dict presumably means "use the library/default parameters" - confirm in
# xgb_prediction.
ml.add_step("xgb1", xgb_prediction(fl_lr2, "cardio", {}, "xgb1"), prerequisites = ["mulf1", "mulf2", "mulf3"])

In [89]:
# Train/apply the xgboost step; the captured log shows validation log loss
# falling from 0.669 at round 0 to 0.541 at round 28.
ml.compute_step("xgb1")

[0]	validation_0-logloss:0.668738
[1]	validation_0-logloss:0.648861
[2]	validation_0-logloss:0.632322
[3]	validation_0-logloss:0.618659
[4]	validation_0-logloss:0.607113
[5]	validation_0-logloss:0.59758
[6]	validation_0-logloss:0.58959
[7]	validation_0-logloss:0.582776
[8]	validation_0-logloss:0.576984
[9]	validation_0-logloss:0.57201
[10]	validation_0-logloss:0.567654
[11]	validation_0-logloss:0.563927
[12]	validation_0-logloss:0.560777
[13]	validation_0-logloss:0.558075
[14]	validation_0-logloss:0.555792
[15]	validation_0-logloss:0.553741
[16]	validation_0-logloss:0.551976
[17]	validation_0-logloss:0.550428
[18]	validation_0-logloss:0.549116
[19]	validation_0-logloss:0.547851
[20]	validation_0-logloss:0.546771
[21]	validation_0-logloss:0.545753
[22]	validation_0-logloss:0.544932
[23]	validation_0-logloss:0.54421
[24]	validation_0-logloss:0.543506
[25]	validation_0-logloss:0.542903
[26]	validation_0-logloss:0.542388
[27]	validation_0-logloss:0.541907
[28]	validation_0-logloss:0.541478

In [95]:
# Collapse the test predictions to one value per id by averaging "xgb1" over
# rows sharing an id. Repeated ids come from clone_nans, assuming ml.test is
# derived from the expanded test2 frame passed to Model - confirm.
# NOTE(review): an assignment prints nothing, so the recorded "(30000, 1)"
# output must come from an earlier version of this cell (e.g. `x.shape`).
x = ml.test[["xgb1", "id"]].groupby("id").mean()

(30000, 1)