In [16]:
import sklearn

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import lightgbm as lgbm
import xgboost as xgb
import catboost as cb

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder, MinMaxScaler

from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

In [17]:
from sklearn import preprocessing

data = pd.read_csv('../data/data_real_train.csv', sep=';')

categorical_features = []

for cat in categorical_features:
    lbl = preprocessing.LabelEncoder()
    data[cat] = lbl.fit_transform(data[cat].astype(str))
    data[cat] = data[cat].astype('category')
    
data.info();

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 1501 entries, 0 to class
dtypes: float64(1500), int64(1)
memory usage: 11.5 MB


In [18]:
class_names = np.unique(data['class'])
data['class'] = data['class'].replace(class_names, np.arange(data['class'].nunique()))

In [19]:
cols2drop = ['class']
categorical_features = []
numerical_features = [c for c in data.columns if c not in categorical_features and c not in cols2drop]

In [20]:
X_train, X_val, y_train, y_val = train_test_split(data.drop(cols2drop, axis=1), 
                                                    data['class'],
                                                    test_size=.25,
                                                    stratify=data['class'],
                                                    random_state=42)

In [21]:
params_cat = {
             'n_estimators' : 200,
              # 'learning_rate': .03,
              'depth' : 3,
              'verbose': False,
              'use_best_model': True,
              'cat_features' : categorical_features,
              'text_features': [],
              # 'train_dir' : '/home/jovyan/work/catboost',
              'border_count' : 64,
              'l2_leaf_reg' : 1,
              'bagging_temperature' : 2,
              'rsm' : 0.51,
              'loss_function': 'MultiClass',
              'auto_class_weights' : 'Balanced', #try not balanced
              'random_state': 42,
              'use_best_model': False,
              # 'custom_metric' : ['AUC', 'MAP'] # Не работает внутри sklearn.Pipelines
         }

cat_model = cb.CatBoostClassifier(**params_cat)

In [22]:
categorical_features_index = [i for i in range(data.shape[1]) if data.columns[i] in categorical_features]
params_lgbm = {
    "num_leaves": 200,
    "n_estimators": 200,
    # "max_depth": 7,
    "min_child_samples": None,
    "learning_rate": 0.001,
    "min_data_in_leaf": 5,
    "feature_fraction": 0.98,
    # "categorical_feature": cat_cols,
    'reg_alpha' : 3.0,
    'reg_lambda' : 5.0,
    'categorical_feature': categorical_features_index
}

lgbm_model = lgbm.LGBMClassifier(**params_lgbm)

In [23]:
params_xgb = {
    "eta": 0.05,
    'n_estimators' : 200,
    "max_depth": 6,
    "subsample": 0.7,
    # "colsample_bytree": 0.95,
    'min_child_weight' : 0.1,
    'gamma': .01,
    'reg_lambda' : 0.1,
    'reg_alpha' : 0.5,
    "objective": "reg:linear",
    "eval_metric": "mae",
    'tree_method' : 'hist', # Supported tree methods for cat fs are `gpu_hist`, `approx`, and `hist`.
    'enable_categorical' : True
    
}

xgb_model = xgb.XGBClassifier(**params_xgb)

In [24]:
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

In [25]:
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler())
])

In [26]:
preprocessor = ColumnTransformer(transformers=[
    ("numerical", numerical_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features)])

In [43]:
estimators = [
    
    
    ("ExtraTrees",  make_pipeline(preprocessor, ExtraTreesClassifier(n_estimators = 400, max_depth = 6, min_samples_leaf = 2, 
                                                              bootstrap = True, class_weight = 'balanced', # ccp_alpha = 0.001, 
                                                              random_state = 75, verbose=False, n_jobs=-1,))),
    

    ("XGBoost", xgb_model),
#     ("LightGBM", lgbm_model),
    ("CatBoost", cat_model),
    
    # То, что не дало прироста в ансамбле
    # ("SVM", make_pipeline(preprocessor, LinearSVC(verbose=False))),
    # ("MLP", make_pipeline(preprocessor, MLPClassifier(verbose=False, hidden_layer_sizes=(100, 30, ), alpha=0.001,random_state=75, max_iter = 1300, ))),
#     ("Random_forest",  make_pipeline(preprocessor, RandomForestClassifier(n_estimators = 100, max_depth = 7, 
#                                                               min_samples_leaf = 2,
#                                                               warm_start = True, n_jobs=-1,
#                                                               random_state = 75, verbose=False))),
    
    
    
]

# в качестве мета-модели будем использовать LogisticRegression
meta_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(verbose=False),
    # final_estimator=RandomForestClassifier(n_estimators = 10_000, 
                                           # max_depth = 5,
                                           # verbose=False),
    n_jobs=-1,
    verbose=False,
)

stacking_classifier = meta_model

In [44]:
stacking_classifier.fit(X_train, y_train)



In [45]:
test = pd.read_csv('../data/data_real_test.csv', sep=';')

In [46]:
test_predict = pd.DataFrame(stacking_classifier.predict(test), columns=['class'])

In [47]:
''.join([str(i) for i in list(stacking_classifier.predict(test))])

'01100001000000000010000000010100000001100011000000'

In [48]:
test_predict['class'].value_counts()[1] / test_predict['class'].value_counts()[0]

0.25

In [49]:
'01000000001000000100000001010000100001000000010001'

'01000000001000000100000001010000100001000000010001'

