In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
train = pd.read_csv('../data/cat_in_the_dat_train.csv', index_col='id')
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')

In [7]:
train.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [8]:
Y = train['target']
X = train.drop('target', axis=1)

In [9]:
len(X.columns)

23

In [22]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.ord_dict = {
            "ord_1": {
                "Novice": 1,
                "Contributor": 2,
                "Expert": 3,
                "Master": 4,
                "Grandmaster": 5        
            },

            "ord_2": {
                "Freezing": 1,
                "Cold": 2,
                "Warm": 3,
                "Hot": 4,
                "Boiling Hot": 5,
                "Lava Hot": 5
            }
        }

    def fit(self, df, target=None):
        return self

    def transform(self, df):
        res = df
        for col, mapping in self.ord_dict.items():
            res[col] = res[col].map(mapping)
        return res  

In [11]:
class SeasonEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, month_num_col):
        self.month_num_col = month_num_col

    def __get_season(self, month_num):
        if month_num in [12, 1, 2]:
            return 'winter'
        elif month_num in [3, 4, 5]:
            return 'spring'
        elif month_num in [6, 7, 8]:
            return 'summer'
        elif month_num in [9, 10, 11]:
            return 'autumn'
        else:
            return 'error'

    def fit(self, df, target=None):
        return self

    def transform(self, df):
        res = df
        res['season'] = res[self.month_num_col].apply(self.__get_season)
        return res      

In [12]:
class MeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.encoded = dict()

    def fit(self, df, target=None):
        df_ = df.copy()
        df_[self.cat_cols].fillna('nan', inplace=True)
        df_['target'] = target
        for col in self.cat_cols:
            mapping = df_.groupby(col)['target'].mean()*100
            self.encoded[col] = dict(mapping)
        df_.drop('target', axis=1, inplace=True)
        return self

    def transform(self, df):
        res = df
        res[self.cat_cols].fillna('nan', inplace=True)
        for col, mapping in self.encoded.items():
            res[col+'_mean'] = res[col].map(mapping)
        return res

In [13]:
class BayesEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.encoded = dict()

    def fit(self, df, target=None):
        df_ = df
        df_[self.cat_cols].fillna('nan', inplace=True)
        df_['target'] = target
        global_pos_target = df_['target'].mean()
        for col in self.cat_cols:
            global_count = df_[col].count()
            means = df_.groupby(col)['target'].mean()
            probs = df_[col].value_counts()/global_count
            mapping = (means*probs)*100/global_pos_target
            self.encoded[col] = dict(mapping)
        df_.drop('target', axis=1, inplace=True)
        return self

    def transform(self, df):
        res = df.copy()
        res[self.cat_cols].fillna('nan', inplace=True)
        for col, mapping in self.encoded.items():
            res[col+'_bayes'] = res[col].map(mapping)
        return res

In [14]:
class CombinationEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features):
        self.cat_features = cat_features

    def fit(self, df, target=None):
        return self

    def transform(self, df):
        from itertools import combinations
        res = df.copy()
        for comb in combinations(self.cat_features, 2):
            try:
                res[comb[0]+'+'+comb[1]] = res[comb[0]]*res[comb[1]]
            except:
                continue
        return res 

In [26]:
from sklearn.linear_model import LogisticRegression

class LogisticEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.logit = LogisticRegression(n_jobs = -1, random_state = 17)
        self.columns = None
        
    def fit(self, df, target=None):
        df_ = df.select_dtypes(include=['int64', 'float64']).fillna(0)        
        self.logit.fit(df_, y_train)
        self.columns = df_.columns
        return self

    def transform(self, df):
        res = df[self.columns].copy()
        res['logit'] = self.logit.predict_proba(res.fillna(0))[:,1] 
        return res 

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.2, 
                                                    random_state=17)

In [28]:
feat = []
for col in list(X_train.columns):
    feat.append(col+'_mean')
    feat.append(col+'_bayes')

len(feat)

46

In [29]:
from sklearn.pipeline import make_pipeline

ordinal = OrdinalEncoder()
logit = LogisticEncoder()
combination = CombinationEncoder(feat)
season = SeasonEncoder('month')
mean = MeanEncoder(X.columns)
bayes = BayesEncoder(X.columns)

transformer_pipe = make_pipeline(season,
                                 ordinal,
                                 mean,
                                 bayes,
                                 combination,
                                 logit)

Xtrain_transform = transformer_pipe.fit_transform(X_train, y_train)

len(Xtrain_transform.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ ==

1090

In [30]:
%%time

Xtrain_transform = Xtrain_transform.select_dtypes(include=['int64', 'float64']).fillna(0)

CPU times: user 2.84 s, sys: 5.46 s, total: 8.3 s
Wall time: 8.3 s


In [31]:
%%time

X_test_transform = transformer_pipe.transform(X_test).select_dtypes(include=['int64', 'float64']).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


CPU times: user 7.85 s, sys: 11.9 s, total: 19.7 s
Wall time: 12.9 s


In [32]:
print(len(Xtrain_transform.columns))
print(len(X_test_transform.columns))

1090
1090


In [33]:
X_test_transform.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,ord_0,ord_1,ord_2,day,month,ord_1_mean,ord_0_mean,...,ord_5_bayes+day_bayes,ord_5_bayes+month_mean,ord_5_bayes+month_bayes,day_mean+day_bayes,day_mean+month_mean,day_mean+month_bayes,day_bayes+month_mean,day_bayes+month_bayes,month_mean+month_bayes,logit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
275061,0,0,0,1,5,1,1,1,40.438841,27.75088,...,23.770265,24.797033,11.037467,824.310737,859.917234,382.759817,629.250904,280.087375,292.185883,0.246917
101025,0,0,0,2,1,5,3,8,24.157809,33.575256,...,12.571146,20.244317,4.146885,566.627801,912.485843,186.915372,658.650614,134.919271,217.271239,0.441129
75873,0,0,0,2,1,5,1,10,24.157809,33.575256,...,8.738083,12.542737,3.038954,824.310737,1183.224342,286.680972,865.833312,209.780958,301.121803,0.607659
237475,0,0,0,1,2,2,1,2,27.724704,27.75088,...,25.879457,25.735601,12.703442,824.310737,819.728617,404.629193,599.842581,296.090455,294.444568,0.237476
237380,0,0,0,3,2,5,4,3,27.724704,39.602902,...,12.976401,32.545648,14.505548,284.171297,712.719873,317.658215,313.830305,139.873712,350.812258,0.441298


In [34]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

lg = LGBMClassifier(n_jobs = -1, n_estimators=500)
lg.fit(Xtrain_transform, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [35]:
pred_test = lg.predict_proba(X_test_transform)[:, 1]
pred_train = lg.predict_proba(Xtrain_transform)[:, 1]
print("Скор на обучающей выборке: " + str(roc_auc_score(y_train, pred_train)))
print("Скор на тестовой выборке: " + str(roc_auc_score(y_test, pred_test)))

Скор на обучающей выборке: 0.8964384333123749
Скор на тестовой выборке: 0.7745324004557603


In [95]:
from hyperopt import hp, tpe, space_eval
from hyperopt.fmin import fmin

def function(params):
    params = {
        'learning_rate': params['learning_rate'], 
        'max_depth': params['max_depth'], 
        'subsample': params['subsample'], # Доля фичей
        'colsample_bytree': params['colsample_bytree'] # Доля объектов
    }
    
    print("############## RUN ################")
    print("params = {params}".format(params=params))
    
    LGBM = LGBMClassifier(
        n_jobs=-1, 
        n_estimators=500,
        verbose=200,
        **params)
    
    LGBM.fit(Xtrain_transform, y_train)
    pred = LGBM.predict_proba(X_test_transform)[:, 1]
    score = roc_auc_score(y_test, pred)
    print("Score: {score}".format(score=str(score)))
    
    return -score

In [97]:
lgbm_space =  {
            'learning_rate': hp.quniform('learning_rate', 0.1, 0.5, 0.1),
            'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
            'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.1)
    }

In [98]:
%%time 

best = fmin(fn=function,
            space=lgbm_space,
            # tpe - Tree of Parzen Estimators (TPE)
            algo=tpe.suggest,
            max_evals=50
           )

############## RUN ################                 
params = {'subsample': 0.9, 'learning_rate': 0.30000000000000004, 'colsample_bytree': 0.7000000000000001, 'max_depth': 2}
Score: 0.7756181316460888                           
############## RUN ################                                          
params = {'subsample': 0.9, 'learning_rate': 0.4, 'colsample_bytree': 0.9, 'max_depth': 3}
Score: 0.7682477038991173                                                    
############## RUN ################                                          
params = {'subsample': 0.7000000000000001, 'learning_rate': 0.4, 'colsample_bytree': 0.7000000000000001, 'max_depth': 9}
Score: 0.748760428001362                                                     
############## RUN ################                                          
params = {'subsample': 0.9, 'learning_rate': 0.2, 'colsample_bytree': 0.8, 'max_depth': 2}
Score: 0.7761671248140934                                                    
#

In [99]:
best_params = space_eval(lgbm_space, best)
#best_params['max_depth'] = int(best_params['max_depth'])
best_params

{'colsample_bytree': 0.6000000000000001,
 'learning_rate': 0.1,
 'max_depth': 1,
 'subsample': 0.6000000000000001}

In [39]:
best_params = {'colsample_bytree': 0.6000000000000001,
 'learning_rate': 0.1,
 'max_depth': 1,
 'subsample': 0.6000000000000001}

In [40]:
%%time

LGBM = LGBMClassifier(
        n_jobs=-1, 
        n_estimators=5000,
        verbose=100,
        **best_params
)

LGBM.fit(Xtrain_transform, 
         y_train,
        eval_metric=['auc'],
        eval_set=[(X_test_transform, y_test)],
        early_stopping_rounds=100)

[1]	valid_0's auc: 0.703331	valid_0's binary_logloss: 0.598491
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.694438	valid_0's binary_logloss: 0.597258
[3]	valid_0's auc: 0.698942	valid_0's binary_logloss: 0.595287
[4]	valid_0's auc: 0.689653	valid_0's binary_logloss: 0.593811
[5]	valid_0's auc: 0.681721	valid_0's binary_logloss: 0.592643
[6]	valid_0's auc: 0.706484	valid_0's binary_logloss: 0.580388
[7]	valid_0's auc: 0.700588	valid_0's binary_logloss: 0.579897
[8]	valid_0's auc: 0.69656	valid_0's binary_logloss: 0.579136
[9]	valid_0's auc: 0.692181	valid_0's binary_logloss: 0.578732
[10]	valid_0's auc: 0.698158	valid_0's binary_logloss: 0.576487
[11]	valid_0's auc: 0.709957	valid_0's binary_logloss: 0.567896
[12]	valid_0's auc: 0.719338	valid_0's binary_logloss: 0.560347
[13]	valid_0's auc: 0.722355	valid_0's binary_logloss: 0.559228
[14]	valid_0's auc: 0.725388	valid_0's binary_logloss: 0.557982
[15]	valid_0's auc: 0.728735	valid_0's binary_loglos

[134]	valid_0's auc: 0.776399	valid_0's binary_logloss: 0.513501
[135]	valid_0's auc: 0.776396	valid_0's binary_logloss: 0.51353
[136]	valid_0's auc: 0.776413	valid_0's binary_logloss: 0.513515
[137]	valid_0's auc: 0.776439	valid_0's binary_logloss: 0.513566
[138]	valid_0's auc: 0.776442	valid_0's binary_logloss: 0.513662
[139]	valid_0's auc: 0.776592	valid_0's binary_logloss: 0.513439
[140]	valid_0's auc: 0.77659	valid_0's binary_logloss: 0.513469
[141]	valid_0's auc: 0.776761	valid_0's binary_logloss: 0.513235
[142]	valid_0's auc: 0.776772	valid_0's binary_logloss: 0.513225
[143]	valid_0's auc: 0.776816	valid_0's binary_logloss: 0.513194
[144]	valid_0's auc: 0.776626	valid_0's binary_logloss: 0.513557
[145]	valid_0's auc: 0.776646	valid_0's binary_logloss: 0.513678
[146]	valid_0's auc: 0.776655	valid_0's binary_logloss: 0.513672
[147]	valid_0's auc: 0.776655	valid_0's binary_logloss: 0.513719
[148]	valid_0's auc: 0.776801	valid_0's binary_logloss: 0.513505
[149]	valid_0's auc: 0.7768

In [37]:
%%time

test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')
test_transformed = transformer_pipe.transform(test)
    

CPU times: user 15.7 s, sys: 25.7 s, total: 41.4 s
Wall time: 33.1 s


In [38]:
test_transformed.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,ord_0,ord_1,ord_2,day,month,ord_1_mean,ord_0_mean,...,ord_5_bayes+day_bayes,ord_5_bayes+month_mean,ord_5_bayes+month_bayes,day_mean+day_bayes,day_mean+month_mean,day_mean+month_bayes,day_bayes+month_mean,day_bayes+month_bayes,month_mean+month_bayes,logit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300000,0,0,1,2,1,3,5,11,24.157809,33.575256,...,4.1156,27.893618,7.821422,147.655597,1000.740708,280.609555,194.216735,54.458734,369.095873,0.157204
300001,0,0,0,1,4,5,7,5,35.396896,27.75088,...,5.208623,13.362723,1.597319,417.364351,1070.748195,127.992352,387.261156,46.291431,118.760661,0.669516
300002,1,0,1,2,3,1,1,12,31.788113,33.575256,...,29.796529,42.987248,11.798215,824.310737,1189.227441,326.393562,870.226124,238.840944,344.574189,0.057692
300003,0,0,1,1,2,5,2,3,27.724704,27.75088,...,22.840268,24.590895,10.960126,836.331896,900.433843,401.322059,731.072399,325.837909,350.812258,0.536835
300004,0,1,1,3,5,5,4,11,40.438841,39.602902,...,4.833748,15.677796,4.396083,284.171297,921.682305,258.441432,405.842253,113.798922,369.095873,0.837705


In [41]:
y_preds = LGBM.predict_proba(test_transformed)[:,1] 
y_preds

array([0.16738353, 0.58389217, 0.06968382, ..., 0.4052507 , 0.61412092,
       0.17989489])

In [42]:
test = pd.read_csv('../data/cat_in_the_dat_test.csv')
test["target"] = y_preds
submission = test[["id", "target"]]
submission.head()

Unnamed: 0,id,target
0,300000,0.167384
1,300001,0.583892
2,300002,0.069684
3,300003,0.559533
4,300004,0.774823


In [43]:
submission.to_csv('../data/cat_in_the_dat_sub_2.csv', index=False)