In [102]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, ExtraTreesClassifier
# from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb
import lightgbm as lgb

In [143]:
from IPython.display import Audio, display

def allDone():
    display(Audio(url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav', autoplay=True))
## Insert whatever audio file you want above

In [3]:
train = pd.read_csv('data/train.csv', index_col='Unnamed: 0', sep='\t')
test = pd.read_csv('data/test.csv', index_col='Unnamed: 0', sep='\t')

xtrain = train.drop('0', axis=1)
ytrain = train['0']
xtest = test.drop('0', axis=1)
ytest = test['0']

In [92]:
def pred_to_file(pred):
    pred = pd.DataFrame(pred, columns=["_VAL_"])
    pred.to_csv('solution_3.csv', index_label="_ID_")

In [5]:
cols_to_drop = ['9', '140', '164', '11', '5', '129', '130', '137', '138', '141', '149', '150', '178', '186', '188',
                '192', '193', '291', '301', '303', '305', '152', '160', '191', '182', '185', '181', '172', '170',
                '157', '136', '135']

xtrain.drop(cols_to_drop, axis=1, inplace=True)
xtest.drop(cols_to_drop, axis=1, inplace=True)

In [6]:
X_train, X_holdout, y_train, y_holdout = train_test_split(xtrain, ytrain,
                                                    test_size=0.20,
                                                    random_state=42, stratify=ytrain)

In [7]:
kf = KFold(n_splits=10)
skf = StratifiedKFold(n_splits=10)

In [21]:
estimator = LogisticRegression(random_state=666)

params = {
    'C' : (0.1, 0.15, 0.2, 0.25, 0.5, 0.8, 1)
}

gs = GridSearchCV(
    estimator=estimator,
    param_grid=params,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

gs.fit(X=X_train,y=y_train)

best_lr = gs.best_estimator_
best_score = gs.best_score_

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  3.4min finished


In [23]:
print("Baseline = ", best_score)

Baseline =  0.7275376894594372


In [88]:
pred_to_file(best_lr.predict_proba(xtest)[:, 1])

### xgboost

In [8]:
params = {
    #default
    'objective': 'reg:logistic',
    'eta': 0.1,
    'silent': 1,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": 'auc'
}

xgb_train = xgb.DMatrix(xtrain, ytrain, feature_names=xtrain.columns)

In [10]:
results = xgb.cv(params,xgb_train,
                 num_boost_round=200,
                 early_stopping_rounds=10,
                 folds=skf, verbose_eval=1)

[0]	train-auc:0.708571+0.00546898	test-auc:0.679583+0.0062847
[1]	train-auc:0.728341+0.00466282	test-auc:0.695084+0.0080493
[2]	train-auc:0.737702+0.00199645	test-auc:0.701467+0.00764415
[3]	train-auc:0.742555+0.0025073	test-auc:0.704987+0.00635292
[4]	train-auc:0.747133+0.00321745	test-auc:0.707569+0.00700152
[5]	train-auc:0.751657+0.00380819	test-auc:0.7102+0.00850609
[6]	train-auc:0.755208+0.00197601	test-auc:0.711661+0.00596181
[7]	train-auc:0.759184+0.00272927	test-auc:0.712976+0.00621004
[8]	train-auc:0.762415+0.00243296	test-auc:0.713888+0.0053963
[9]	train-auc:0.765591+0.00288591	test-auc:0.714246+0.00527878
[10]	train-auc:0.768442+0.00292221	test-auc:0.714146+0.00480181
[11]	train-auc:0.771464+0.00327101	test-auc:0.715015+0.00491449
[12]	train-auc:0.77469+0.00307474	test-auc:0.716108+0.00455814
[13]	train-auc:0.777802+0.00327215	test-auc:0.716749+0.00418458
[14]	train-auc:0.780486+0.00329337	test-auc:0.717924+0.00418925
[15]	train-auc:0.784232+0.00307153	test-auc:0.718966+0.00

In [61]:
params = {
    #default
    'objective': 'reg:logistic',
    'eta': 0.05,
    'silent': 1,
    "nthread": -1,
    "random_seed": 666,
    "eval_metric": 'auc',

    # regularization parameters
#     'max_leavs': 16,
#     'max_depth': 6,
    'gamma': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    
    #lightgbm approach
    'tree_method': 'hist'
}

results = xgb.cv(params,xgb_train,
                 num_boost_round=1000,
                 early_stopping_rounds=18,
                 folds=skf, verbose_eval=1)

[0]	train-auc:0.69543+0.00412001	test-auc:0.674409+0.0081172
[1]	train-auc:0.714987+0.00394338	test-auc:0.690508+0.00213083
[2]	train-auc:0.722343+0.00444996	test-auc:0.695188+0.00585077
[3]	train-auc:0.726942+0.00142743	test-auc:0.701528+0.00283914
[4]	train-auc:0.729756+0.00199823	test-auc:0.704376+0.00314715
[5]	train-auc:0.732566+0.0011085	test-auc:0.706236+0.00445656
[6]	train-auc:0.734346+0.000920211	test-auc:0.707268+0.00484968
[7]	train-auc:0.735829+0.000514441	test-auc:0.708117+0.00505061
[8]	train-auc:0.736871+0.000395667	test-auc:0.70883+0.00436885
[9]	train-auc:0.738099+0.000306364	test-auc:0.709714+0.0052724
[10]	train-auc:0.740508+0.000866488	test-auc:0.710616+0.00517437
[11]	train-auc:0.741294+0.000721944	test-auc:0.711154+0.00475873
[12]	train-auc:0.742619+0.00108132	test-auc:0.711367+0.0053606
[13]	train-auc:0.743249+0.00114331	test-auc:0.712102+0.00543585
[14]	train-auc:0.743972+0.00147611	test-auc:0.71247+0.00577115
[15]	train-auc:0.745182+0.00173482	test-auc:0.71287

[128]	train-auc:0.829049+0.00227155	test-auc:0.73704+0.00151919
[129]	train-auc:0.829656+0.00213083	test-auc:0.737075+0.00162046
[130]	train-auc:0.829729+0.00222016	test-auc:0.737056+0.00164355
[131]	train-auc:0.830154+0.0020813	test-auc:0.737026+0.00162532
[132]	train-auc:0.830593+0.00229673	test-auc:0.737091+0.00161111
[133]	train-auc:0.830954+0.00234417	test-auc:0.737223+0.00163189
[134]	train-auc:0.831234+0.00231029	test-auc:0.737254+0.00160296
[135]	train-auc:0.831608+0.00234277	test-auc:0.737339+0.00167212
[136]	train-auc:0.831863+0.00237846	test-auc:0.737269+0.0016337
[137]	train-auc:0.832069+0.00239619	test-auc:0.737291+0.00161998
[138]	train-auc:0.832363+0.00254071	test-auc:0.737251+0.00160275
[139]	train-auc:0.832861+0.00203977	test-auc:0.737317+0.00165906
[140]	train-auc:0.833266+0.00214668	test-auc:0.737268+0.00175135
[141]	train-auc:0.833756+0.00209252	test-auc:0.737269+0.00180646
[142]	train-auc:0.834207+0.00205271	test-auc:0.737394+0.0017391
[143]	train-auc:0.834454+0.00

[255]	train-auc:0.862655+0.000493359	test-auc:0.739407+0.00238544
[256]	train-auc:0.86292+0.000415125	test-auc:0.73945+0.00238609
[257]	train-auc:0.863177+0.000629634	test-auc:0.739431+0.00240881
[258]	train-auc:0.863256+0.000612998	test-auc:0.739431+0.00242435
[259]	train-auc:0.863605+0.000550098	test-auc:0.739428+0.00229889
[260]	train-auc:0.863716+0.000678472	test-auc:0.739422+0.00233209
[261]	train-auc:0.863884+0.000755638	test-auc:0.739432+0.00237783
[262]	train-auc:0.864043+0.000810165	test-auc:0.739412+0.00233229
[263]	train-auc:0.86438+0.000941329	test-auc:0.739433+0.00236172
[264]	train-auc:0.864698+0.000890465	test-auc:0.73938+0.00231399
[265]	train-auc:0.864889+0.000956723	test-auc:0.739336+0.00230989


0.736215 = maxd_6, gamma_8

0.738909

In [101]:
srav = xgb.XGBClassifier(**params)
srav.fit(xtrain, ytrain)
pred_to_file(srav.predict_proba(xtest)[:, 1])

Валидация: 0.739336

Лидерборд: 0.73931072

### lightgbm

In [107]:
params = {
    'objective': 'binary',
    'learning_rate': 0.1,
    'num_threads': 4,
    "metric": 'auc',
    'random_seed': 666,
}
n_rounds = 10000

lgb_train = lgb.Dataset(xtrain, label=ytrain, free_raw_data=False)

In [105]:
result = lgb.cv(params,
                lgb_train,
                n_rounds,
                folds=skf.split(xtrain, ytrain),
                early_stopping_rounds=10,
                verbose_eval=1)

[1]	cv_agg's auc: 0.686362 + 0.0102095
[2]	cv_agg's auc: 0.694582 + 0.0102184
[3]	cv_agg's auc: 0.698598 + 0.00900459
[4]	cv_agg's auc: 0.700617 + 0.00866554
[5]	cv_agg's auc: 0.703707 + 0.00742721
[6]	cv_agg's auc: 0.708083 + 0.00831141
[7]	cv_agg's auc: 0.709926 + 0.00937122
[8]	cv_agg's auc: 0.711865 + 0.00918754
[9]	cv_agg's auc: 0.713788 + 0.00988344
[10]	cv_agg's auc: 0.715976 + 0.00935391
[11]	cv_agg's auc: 0.71732 + 0.00861337
[12]	cv_agg's auc: 0.718068 + 0.00921695
[13]	cv_agg's auc: 0.719177 + 0.00923234
[14]	cv_agg's auc: 0.720281 + 0.00898898
[15]	cv_agg's auc: 0.721359 + 0.00895576
[16]	cv_agg's auc: 0.722496 + 0.00864592
[17]	cv_agg's auc: 0.724102 + 0.00897849
[18]	cv_agg's auc: 0.724801 + 0.00895238
[19]	cv_agg's auc: 0.726207 + 0.00868798
[20]	cv_agg's auc: 0.727395 + 0.00849264
[21]	cv_agg's auc: 0.728462 + 0.00842135
[22]	cv_agg's auc: 0.729353 + 0.00894822
[23]	cv_agg's auc: 0.730253 + 0.00889804
[24]	cv_agg's auc: 0.730941 + 0.00840724
[25]	cv_agg's auc: 0.731411 

In [106]:
def check_train_score(params, lgb_data, data, target, kf, num_rounds):
    roc_auc = []
    for train, val in kf.split(data,target):
        temp_lgb_train = lgb_data.subset(train)
        temp_lgb_val = lgb_data.subset(val)
        temp_model = lgb.train(params, temp_lgb_train, num_rounds, verbose_eval=num_rounds)
        roc_auc.append([roc_auc_score(target[train],temp_model.predict(data.loc[train])), roc_auc_score(target[val],temp_model.predict(data.loc[val]))])
    return np.mean(roc_auc, axis=0)

In [108]:
check_train_score(params, lgb_train, xtrain, ytrain, skf, 63)

array([0.84364971, 0.74128117])

In [174]:
params = {
    #default
    'objective': 'binary',
    'learning_rate': 0.05,
    'num_threads': -1,
    "metric": 'auc',
    'random_seed': 666,

    #regularization
#     'l2-leaf-reg': 6,
    'colsample_bytree': 0.6,
    'subsample': 0.6,
    'subsample_freq': 1
}

result = lgb.cv(params, lgb_train, n_rounds, folds=skf.split(xtrain, ytrain),
                early_stopping_rounds=10, verbose_eval=1)

[1]	cv_agg's auc: 0.673938 + 0.011513
[2]	cv_agg's auc: 0.688957 + 0.0122199
[3]	cv_agg's auc: 0.696471 + 0.0114683
[4]	cv_agg's auc: 0.699559 + 0.010747
[5]	cv_agg's auc: 0.704846 + 0.00984765
[6]	cv_agg's auc: 0.707711 + 0.00987932
[7]	cv_agg's auc: 0.709628 + 0.00889804
[8]	cv_agg's auc: 0.711373 + 0.00909009
[9]	cv_agg's auc: 0.71227 + 0.00869173
[10]	cv_agg's auc: 0.713157 + 0.00797793
[11]	cv_agg's auc: 0.714049 + 0.00821536
[12]	cv_agg's auc: 0.714752 + 0.00877605
[13]	cv_agg's auc: 0.715498 + 0.00803839
[14]	cv_agg's auc: 0.71641 + 0.00812718
[15]	cv_agg's auc: 0.71707 + 0.0083202
[16]	cv_agg's auc: 0.717688 + 0.00832247
[17]	cv_agg's auc: 0.718024 + 0.00877073
[18]	cv_agg's auc: 0.71873 + 0.00856991
[19]	cv_agg's auc: 0.719576 + 0.00872096
[20]	cv_agg's auc: 0.720089 + 0.00869216
[21]	cv_agg's auc: 0.720568 + 0.00851186
[22]	cv_agg's auc: 0.721616 + 0.0085683
[23]	cv_agg's auc: 0.72246 + 0.00848291
[24]	cv_agg's auc: 0.723118 + 0.00841485
[25]	cv_agg's auc: 0.723361 + 0.008551

In [172]:
score = check_train_score(params, lgb_train, xtrain, ytrain, skf, len(result['auc-mean']))
print("Score = ", score)

Score =  [0.87054079 0.74361692]


In [173]:
allDone()

[0.87654715, 0.74290873] - basic

[0.87054079 0.74361692] - reg 0.6

0.74361692 = gamma 7

In [175]:
srav = lgb.LGBMClassifier(**params)
srav.fit(xtrain, ytrain)
pred_to_file(srav.predict_proba(xtest)[:, 1])

In [128]:
lgb.LGBMClassifier?

In [131]:
len(result['auc-mean'])

92

In [None]:
cat_feats = np.where(train.dtypes == 'object')[0].tolist()