# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMClassifier
from skopt import dummy_minimize
from skopt import gp_minimize



from sklearn import impute
from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import optuna


import time
import random
import warnings

warnings.filterwarnings('ignore')




## reduce_memory_usage

In [2]:
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df

# Looking Data

In [3]:
test = pd.read_csv("test.csv").drop(columns = ["id"])
test = reduce_memory_usage(test)


Mem. usage decreased to 250.34 Mb (77.0% reduction)


In [4]:
train = pd.read_csv("train.csv").drop(columns = ["id"])
train = reduce_memory_usage(train)



Mem. usage decreased to 501.63 Mb (77.0% reduction)


In [5]:
submission = pd.read_csv("sample_submission.csv")
submission = reduce_memory_usage(submission)

Mem. usage decreased to 2.86 Mb (62.5% reduction)


In [6]:
target = train['target']

In [7]:
N_FOLDS = 5

## Checking and Preparing Data

In [8]:
target = 'target'
features = [col for col in train.columns if col != target]

len(features)

285

## Missing Values

In [9]:
'''train['n_missing'] = train[features].isna().sum(axis=1)
test['n_missing'] = test[features].isna().sum(axis=1)

features.append('n_missing')

modes = train[features].mode().iloc[0]

train[features] = train[features].fillna(modes)
test[features] = test[features].fillna(modes)'''

"train['n_missing'] = train[features].isna().sum(axis=1)\ntest['n_missing'] = test[features].isna().sum(axis=1)\n\nfeatures.append('n_missing')\n\nmodes = train[features].mode().iloc[0]\n\ntrain[features] = train[features].fillna(modes)\ntest[features] = test[features].fillna(modes)"

## Scaler the Data
### Standardize features by removing the mean and scaling to unit variance

In [10]:
train['min_row'] = train[features].min(axis=1)
train['max_row'] = train[features].max(axis=1)
train['mean_row'] = train[features].mean(axis=1)
train['std_row'] = train[features].std(axis=1)

test['min_row'] = test[features].min(axis=1)
test['max_row'] = test[features].max(axis=1)
test['mean_row'] = test[features].mean(axis=1)
test['std_row'] = test[features].std(axis=1)

features += ['min_row', 'max_row', 'mean_row', 'std_row']

In [11]:
bool_cols_train = []
for i, col in enumerate(train.columns):
    if train[col].dtypes == bool:
        bool_cols_train.append(i)



In [12]:
bool_cols_test = []
for i, col in enumerate(test.columns):
    if train[col].dtypes == bool:
        bool_cols_test.append(i)

In [13]:
train.iloc[:, bool_cols_train] = train.iloc[:, bool_cols_train].astype(int)
test.iloc[:, bool_cols_test] = test.iloc[:, bool_cols_test].astype(int)

In [14]:
X_train = train.drop('target', axis=1).copy()
X = train.drop('target', axis=1).copy()
y = train['target'].copy()
X_test = test.copy()

del train
del test

# Parameters 

In [15]:
N_SPLITS = 5
N_ESTIMATORS = 500
VERBOSE = 50

## Random Search

In [16]:
XRtrain, XRtest, yrtrain, yrtest = train_test_split(X_train, y, train_size = 0.20)

In [17]:
def model_training(params):
    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    
    print (params, '\n')
    
    model = LGBMClassifier()
    model.fit(XRtrain, yrtrain)
    
    p = model.predict_proba(XRtest)[:,1]
    
    return - roc_auc_score(yrtest,p)



space = [(1e-3, 1e-1, "log-uniform"),
         (2,135),
         (1,100),
         (0.05, 1.0),
         (0.1 , 1.0)]

resultado = dummy_minimize(model_training, space, random_state = 42, verbose = 50, n_calls = 100)

Iteration No: 1 started. Evaluating function at random point.
[0.03918194347141743, 16, 72, 0.6187255599871848, 0.2404167763981929] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 23.5405
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 2 started. Evaluating function at random point.
[0.002051110418843398, 76, 75, 0.8728673384861885, 0.6410035105688879] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 23.2991
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 3 started. Evaluating function at random point.
[0.026070247583707674, 23, 53, 0.9714143595538948, 0.8491983767203797] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 25.4977
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 4 started. Evaluating function at random point.
[0.0026587543983272693, 65, 60, 0.22423428436076215, 0.373818018663584] 

Iteration No: 4 ended. Evaluation done at random point.
Ti

Iteration No: 31 ended. Evaluation done at random point.
Time taken: 22.1116
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 32 started. Evaluating function at random point.
[0.024258953709486806, 107, 44, 0.2819192461353855, 0.25146193795563754] 

Iteration No: 32 ended. Evaluation done at random point.
Time taken: 23.0187
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 33 started. Evaluating function at random point.
[0.002738598961585681, 89, 52, 0.39544812226032927, 0.9746038744488648] 

Iteration No: 33 ended. Evaluation done at random point.
Time taken: 22.1612
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 34 started. Evaluating function at random point.
[0.08411909465645724, 127, 39, 0.5223860805977663, 0.37079047883509275] 

Iteration No: 34 ended. Evaluation done at random point.
Time taken: 22.0874
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 35 started. Evaluating function at

Iteration No: 62 ended. Evaluation done at random point.
Time taken: 22.1813
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 63 started. Evaluating function at random point.
[0.0015644186602631934, 33, 47, 0.9905511575990502, 0.22607561371287166] 

Iteration No: 63 ended. Evaluation done at random point.
Time taken: 22.1811
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 64 started. Evaluating function at random point.
[0.010880761845917639, 3, 90, 0.7537301868664943, 0.7273141668957412] 

Iteration No: 64 ended. Evaluation done at random point.
Time taken: 22.5477
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 65 started. Evaluating function at random point.
[0.025407864693711604, 49, 39, 0.3289122520512687, 0.8284250399306623] 

Iteration No: 65 ended. Evaluation done at random point.
Time taken: 22.3693
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 66 started. Evaluating function at ra

Iteration No: 93 ended. Evaluation done at random point.
Time taken: 22.2349
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 94 started. Evaluating function at random point.
[0.04108369330865427, 50, 99, 0.44198679306585514, 0.4348162772135049] 

Iteration No: 94 ended. Evaluation done at random point.
Time taken: 22.7753
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 95 started. Evaluating function at random point.
[0.03571296609641059, 25, 23, 0.9342194593233866, 0.8725714766587107] 

Iteration No: 95 ended. Evaluation done at random point.
Time taken: 22.1143
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 96 started. Evaluating function at random point.
[0.007210876454272621, 14, 23, 0.7668157303804484, 0.19281148195233938] 

Iteration No: 96 ended. Evaluation done at random point.
Time taken: 22.2073
Function value obtained: -0.8487
Current minimum: -0.8487
Iteration No: 97 started. Evaluating function at ran

In [18]:
resultado.x

[0.03918194347141743, 16, 72, 0.6187255599871848, 0.2404167763981929]

# Model  LGB

In [19]:
lgbm_params_rs = {
    "objective": "binary",
    "learning_rate": resultado.x[0],
    'n_estimators': 1200,
    'num_leaves': resultado.x[1],
    'min_child_samples': resultado.x[2],
    'subsample':resultado.x[3],
    'colsample_bytree': resultado.x[3]
}

In [20]:
model = LGBMClassifier(**lgbm_params_rs)
model

LGBMClassifier(colsample_bytree=0.6187255599871848,
               learning_rate=0.03918194347141743, min_child_samples=72,
               n_estimators=1000, num_leaves=16, objective='binary',
               subsample=0.6187255599871848)

## Cross Validate

In [21]:
splits = 10
skf = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

oof_preds = np.zeros((X.shape[0],))
preds = 0
model_fi = 0
total_mean_auc = 0

for num, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    X_train, X_valid = X.loc[train_idx], X.loc[valid_idx]
    y_train, y_valid = y.loc[train_idx], y.loc[valid_idx]
    
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)
    X_valid = imputer.transform(X_valid)
    
    model.fit(X_train, y_train,
              verbose=False,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              early_stopping_rounds=200,
              )
    
    X_test = imputer.transform(X_test)
    
    preds += model.predict_proba(X_test)[:, 1] / splits
   
   
    
    model_fi += model.feature_importances_ / splits
    
    oof_preds[valid_idx] = model.predict_proba(X_valid)[:, 1]
    
    fold_auc = roc_auc_score(y_valid, oof_preds[valid_idx])
    print(f"Fold {num} ROC AUC: {fold_auc}")

    total_mean_auc += fold_auc / splits
    
print(f"\nOverall ROC AUC: {total_mean_auc}")

Fold 0 ROC AUC: 0.8576343457481559
Fold 1 ROC AUC: 0.8543535826612859
Fold 2 ROC AUC: 0.8562044869028016
Fold 3 ROC AUC: 0.8550051130743109
Fold 4 ROC AUC: 0.8556868097157193

Overall ROC AUC: 0.8557768676204548


In [22]:
lgbmscore = total_mean_auc
lgbmscore

0.8557768676204548

#### Submission

In [23]:
submission.target = preds
submission.head()


Unnamed: 0,id,target
0,1000000,0.749457
1,1000001,0.279005
2,1000002,0.892444
3,1000003,0.832792
4,1000004,0.262579


## Submission

In [25]:
#submission = pd.read_csv("sample_solution.csv")
#submission.target = preds
submission.to_csv('submissionlgb.csv', index=False)