In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

In [None]:
noize = np.random.uniform(size=5)
noize

In [2]:
train = pd.read_csv('../data/cat_in_the_dat_train.csv', index_col='id')
test = pd.read_csv('../data/cat_in_the_dat_test.csv', index_col='id')

In [3]:
train.head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [4]:
Y = train['target']
X = train.drop('target', axis=1)

In [5]:
len(X.columns)

23

In [8]:
class CombinationEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features):
        self.cat_features = cat_features

    def fit(self, df, target=None):
        return self

    def transform(self, df):
        from itertools import combinations
        res = df
        for comb in combinations(self.cat_features, 2):
            res[comb[0]+'+'+comb[1]] = res[comb[0]].map(str) + '+' + res[comb[1]].map(str)
        return res        

In [9]:
class SeasonEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, month_num_col):
        self.month_num_col = month_num_col

    def __get_season(self, month_num):
        if month_num in [12, 1, 2]:
            return 'winter'
        elif month_num in [3, 4, 5]:
            return 'spring'
        elif month_num in [6, 7, 8]:
            return 'summer'
        elif month_num in [9, 10, 11]:
            return 'autumn'
        else:
            return 'error'

    def fit(self, df, target=None):
        return self

    def transform(self, df):
        res = df
        res['season'] = res[self.month_num_col].apply(self.__get_season)
        return res      

In [10]:
class CountEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.encoded = dict()

    def fit(self, df, target=None):
        for col in self.cat_cols:
            mapping = df[col].value_counts()
            mapping['nan'] = df[col].isnull().sum()
            self.encoded[col] = dict(mapping)
        return self

    def transform(self, df):
        res = df
        res[self.cat_cols].fillna('nan', inplace=True)
        for col, mapping in self.encoded.items():
            res[col+'_counter'] = res[col].map(mapping)
        return res

In [11]:
class MeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.encoded = dict()

    def fit(self, df, target=None):
        df_ = df
        df_[self.cat_cols].fillna('nan', inplace=True)
        df_['target'] = target
        for col in self.cat_cols:
            mapping = df_.groupby(col)['target'].mean()
            self.encoded[col] = dict(mapping)
        df_.drop('target', axis=1, inplace=True)
        return self

    def transform(self, df):
        res = df
        res[self.cat_cols].fillna('nan', inplace=True)
        for col, mapping in self.encoded.items():
            res[col+'_mean'] = res[col].map(mapping)
        return res

In [12]:
class BayesEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.encoded = dict()

    def fit(self, df, target=None):
        df_ = df
        df_[self.cat_cols].fillna('nan', inplace=True)
        df_['target'] = target
        global_pos_target = df_['target'].mean()
        for col in self.cat_cols:
            global_count = df_[col].count()
            means = df_.groupby(col)['target'].mean()
            probs = df_[col].value_counts()/global_count
            mapping = (means*probs)/global_pos_target
            self.encoded[col] = dict(mapping)
        df_.drop('target', axis=1, inplace=True)
        return self

    def transform(self, df):
        res = df.copy()
        res[self.cat_cols].fillna('nan', inplace=True)
        for col, mapping in self.encoded.items():
            res[col+'_bayes'] = res[col].map(mapping)
        return res

In [13]:
class RareEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        self.encoded = dict()

    def __set_value(self, col, treshold):
        if col < treshold:
            return 0
        else:
            return col

    def fit(self, df, target=None):
        df_ = df.copy()
        df_[self.cat_cols].fillna('nan', inplace=True)
        for col in self.cat_cols:
            treshold = df_[col].quantile(0.25)
            self.encoded[col] = treshold
        return self

    def transform(self, df):
        res = df.copy()
        res[self.cat_cols].fillna('nan', inplace=True)
        for col, treshold in self.encoded.items():
            res[col] = res[col].apply(self.__set_value, treshold=treshold)
        return res

In [14]:
from itertools import combinations

features = list(X.columns)
for comb in combinations(X.columns, 2):
    f = comb[0]+'+'+comb[1]
    features.append(f)

#features = X.columns

feat = []
for col in list(features):
    if '+' in col:
        #feat.append(col+'_counter')
        #feat.append(col+'_bayes')
        feat.append(col+'_mean')

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.2, 
                                                    random_state=17)

In [16]:
from sklearn.pipeline import make_pipeline

combination = CombinationEncoder(X.columns)
season = SeasonEncoder('month')
count = CountEncoder(features)
mean = MeanEncoder(features)
bayes = BayesEncoder(features)
rare = RareEncoder(feat)

'''
transformer_pipe = make_pipeline(combination,
                                 season,
                                 count,
                                 mean,
                                 bayes,
                                 rare)
'''
transformer_pipe = make_pipeline(combination,
                                 season,
                                 mean,
                                 rare)

Xtrain_transform = transformer_pipe.fit_transform(X_train, y_train)

len(Xtrain_transform.columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '

553

In [17]:
Xtrain_transform.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240000 entries, 169336 to 64753
Columns: 553 entries, bin_0 to nom_8+day_mean
dtypes: float64(276), int64(6), object(271)
memory usage: 1014.4+ MB


In [None]:
%%time

Xtrain_transform = Xtrain_transform.select_dtypes(include=['int64', 'float64']).fillna(0)
#Xtrain_transform.to_csv('../data/cat_in_the_dat_train_transform.csv')

CPU times: user 1.16 s, sys: 1.04 s, total: 2.19 s
Wall time: 2.19 s


In [None]:
%%time

X_test_transform = transformer_pipe.transform(X_test).select_dtypes(include=['int64', 'float64']).fillna(0)
#X_test_transform.to_csv('../data/cat_in_the_dat_test_transform.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
print(len(Xtrain_transform.columns))
print(len(X_test_transform.columns))

In [None]:
X_test_transform.head()

In [None]:
feat = []


for col in list(X_test_transform.columns):
    if '_counter' in col:
        feat.append(col)
    elif 'season' in col:
        feat.append(col)
    elif '_mean' in col:
        feat.append(col)
    elif '_bayes' in col:
        feat.append(col)

len(feat)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score

lg = LGBMClassifier(n_jobs = -1, n_estimators=500)
lg.fit(Xtrain_transform[feat], y_train)

In [None]:
pred_test = lg.predict_proba(X_test_transform[feat])[:, 1]
pred_train = lg.predict_proba(Xtrain_transform[feat])[:, 1]
print("Скор на обучающей выборке: " + str(roc_auc_score(y_train, pred_train)))
print("Скор на тестовой выборке: " + str(roc_auc_score(y_test, pred_test)))

bayes
Скор на обучающей выборке: 0.8322585823311461
Скор на тестовой выборке: 0.7635748176349875

mean
Скор на обучающей выборке: 0.8734181185441837
Скор на тестовой выборке: 0.7748761375678637

counter
Скор на обучающей выборке: 0.8140289870128897
Скор на тестовой выборке: 0.7684248770966747

all
Скор на обучающей выборке: 0.8797801294147838
Скор на тестовой выборке: 0.7747975750564811


In [39]:
from hyperopt import hp, tpe, space_eval
from hyperopt.fmin import fmin

def function(params):
    params = {
        'learning_rate': params['learning_rate'], 
        'max_depth': params['max_depth'], 
        'subsample': params['subsample'], # Доля фичей
        'colsample_bytree': params['colsample_bytree'] # Доля объектов
    }
    
    print("############## RUN ################")
    print("params = {params}".format(params=params))
    
    LGBM = LGBMClassifier(
        n_jobs=-1, 
        n_estimators=500,
        verbose=200,
        **params)
    
    LGBM.fit(Xtrain_transform, y_train)
    pred = LGBM.predict_proba(X_test_transform)[:, 1]
    score = roc_auc_score(y_test, pred)
    print("Score: {score}".format(score=str(score)))
    
    return -score

In [40]:
lgbm_space =  {
            'learning_rate': hp.quniform('learning_rate', 0.1, 0.5, 0.1),
            'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
            'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
            'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.1)
    }

In [41]:
best = fmin(fn=function,
            space=lgbm_space,
            # tpe - Tree of Parzen Estimators (TPE)
            algo=tpe.suggest,
            max_evals=50
           )

############## RUN ################                 
params = {'subsample': 0.9, 'learning_rate': 0.4, 'colsample_bytree': 0.6000000000000001, 'max_depth': 11}
Score: 0.5147340225513882                           
############## RUN ################                                       
params = {'subsample': 0.8, 'learning_rate': 0.1, 'colsample_bytree': 0.9, 'max_depth': 4}
Score: 0.5297093997600894                                                 
############## RUN ################                                       
params = {'subsample': 1.0, 'learning_rate': 0.2, 'colsample_bytree': 0.9, 'max_depth': 8}
Score: 0.5286196969463924                                                
############## RUN ################                                      
params = {'subsample': 0.5, 'learning_rate': 0.5, 'colsample_bytree': 0.6000000000000001, 'max_depth': 7}
Score: 0.5237026489147316                                                
############## RUN ################                 

In [42]:
best_params = space_eval(lgbm_space, best)
#best_params['max_depth'] = int(best_params['max_depth'])
best_params

{'colsample_bytree': 0.8,
 'learning_rate': 0.30000000000000004,
 'max_depth': 5,
 'subsample': 0.9}

In [None]:
with open('../data/cat_in_the_dat_V1.dill', 'wb') as f:
        dill.dump(pipe, f)