In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)  
import tensorflow as tf
import scipy
import seaborn as sns 

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold 
from sklearn import metrics 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor

import lightgbm as lgb
from lightgbm import LGBMClassifier

import gc
import random
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Experiment label: bump `version` to get a fresh run name and model file name.
version = 10

# Both names are derived from the version number above.
basic_name = f'Santander_v{version}'
save_model_name = f'{basic_name}.model'

print(basic_name)

In [None]:
# Read the train and test tables from the input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv('../input/test.csv')

# Separate frame holding only the training rows whose target equals 1.
train_df_ones = train_df[train_df['target'].eq(1)]

In [None]:
# Oversample the positive class: append one extra copy of every target == 1 row in which
# each selected feature column is rescaled by a single random factor drawn from [0.99, 1.01].
#
# NOTE(review): columns[2:102] selects 100 columns by position (skipping the first two).
#   If the frame holds more feature columns than that, the remaining ones are duplicated
#   unchanged -- confirm this is intended.
# NOTE(review): this cell rebinds train_df and deletes train_df_ones, so it must be run
#   exactly once after the loading cell; re-running it appends the positives again or fails.
# NOTE(review): the near-duplicate positives are appended before any cross-validation split,
#   so copies of the same row can land in both the training and validation part of a fold.

# Dedicated, seeded generator: the scaling factors are identical on every run and the
# global `random` state is left untouched.
_augment_rng = random.Random(42)

# Work on an explicit copy so the column assignments below never write into (or warn about)
# a slice of train_df.
train_df_ones = train_df_ones.copy()
for feature in train_df.columns.values[2:102]:
    train_df_ones[feature] = train_df_ones[feature] * _augment_rng.uniform(0.99, 1.01)

train_df = pd.concat([train_df, train_df_ones])
del train_df_ones

In [None]:
# Convert the frames to plain arrays: features (label and id columns removed) and labels.
X_train = train_df.drop(columns=["target", "ID_code"]).values
y_train = train_df.loc[:, "target"].values
X_test = test_df.drop(columns="ID_code").values

In [None]:
def shuffle(x, y, t=2):
    """Augment a data set by recombining feature values within each class.

    For every synthetic copy, each column of the class's rows is permuted
    independently, so a synthetic row mixes values taken from different real
    rows of the same class. ``t`` synthetic copies of the rows with ``y > 0``
    and ``t // 2`` synthetic copies of the rows with ``y == 0`` are appended
    after the original data.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array with one row per sample.
    y : numpy.ndarray
        1-D label array aligned with the rows of ``x``.
    t : int, default 2
        Number of synthetic copies of the ``y > 0`` rows; the ``y == 0`` rows
        get ``t // 2`` copies. ``t=0`` and ``t=1`` are valid and simply
        produce no copies for the corresponding class.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``x`` followed by the synthetic ``y > 0`` rows and then the synthetic
        ``y == 0`` rows, and the matching labels (original ``y``, then ones,
        then zeros).

    Notes
    -----
    Permutations are drawn from NumPy's global random state
    (``np.random.shuffle``); seed it beforehand for reproducible output.
    """

    def _permute_columns(rows):
        # Independent copy so each synthetic block can be modified in place.
        block = rows.copy()
        order = np.arange(block.shape[0])
        for col in range(block.shape[1]):
            np.random.shuffle(order)
            # Index a single column instead of materialising the whole
            # re-ordered matrix for every column.
            block[:, col] = block[order, col]
        return block

    positive_rows = x[y > 0]
    negative_rows = x[y == 0]

    # Positive copies are generated first, then negative ones, so the sequence
    # of random draws matches the order in which the blocks are stacked.
    positive_blocks = [_permute_columns(positive_rows) for _ in range(t)]
    negative_blocks = [_permute_columns(negative_rows) for _ in range(t // 2)]

    n_positive = sum(block.shape[0] for block in positive_blocks)
    n_negative = sum(block.shape[0] for block in negative_blocks)

    # Stacking everything in one call keeps working when either list is empty
    # (np.vstack on an empty list raises ValueError).
    x = np.vstack([x] + positive_blocks + negative_blocks)
    y = np.concatenate([y, np.ones(n_positive), np.zeros(n_negative)])
    return x, y

In [None]:
# Cross-validation settings passed to the K-fold training call.
SEED, NFOLDS, NSHUFFLES = 42, 5, 3

# LightGBM parameter set used for every model trained in this notebook.
params_tuned = dict(
    bagging_freq=5,
    bagging_fraction=0.38,
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=0.045,
    learning_rate=0.0075,
    max_depth=-1,
    metric='auc',
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=10.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)


def lgbm_kfold(parameters=0, folds=2, shuffles=1, rounds=100, xtrain=None, ytrain=None, xtest=None, seed=0,
               early_stopping_rounds=4000, verbose_eval=1000):
    """Train LightGBM with stratified K-fold CV and return averaged test predictions.

    For every fold, ``shuffles`` models are trained, each on a freshly augmented
    version of the fold's training part (see ``shuffle``), and validated on the
    untouched held-out part. Test predictions are averaged over the shuffles of
    a fold and then over the folds.

    Parameters
    ----------
    parameters : dict
        LightGBM parameters passed to ``lgb.train``. The default ``0`` is kept
        only for signature compatibility and is rejected.
    folds : int
        Number of stratified folds.
    shuffles : int
        Number of augmented models trained per fold (must be >= 1).
    rounds : int
        Maximum number of boosting rounds per model.
    xtrain, ytrain : numpy.ndarray
        Training features and labels; indexed with the fold index arrays.
    xtest : numpy.ndarray
        Test features to predict.
    seed : int
        ``random_state`` of the fold splitter. It does not seed the
        augmentation, which draws from NumPy's global random state.
    early_stopping_rounds : int, default 4000
        Forwarded to ``lgb.train``.
    verbose_eval : int or bool, default 1000
        Forwarded to ``lgb.train``.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(xtest.shape[0], 1)`` with the averaged
        predictions.

    Raises
    ------
    TypeError
        If ``parameters`` is not a dict.
    ValueError
        If any of ``xtrain``, ``ytrain``, ``xtest`` is missing or ``shuffles < 1``.
    """
    # Fail early with a clear message instead of an obscure error deep inside
    # the training loop (or silent NaN predictions when shuffles == 0).
    if not isinstance(parameters, dict):
        raise TypeError("parameters must be a dict of LightGBM parameters")
    if xtrain is None or ytrain is None or xtest is None:
        raise ValueError("xtrain, ytrain and xtest must all be provided")
    if shuffles < 1:
        raise ValueError("shuffles must be at least 1")

    kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    n_test = xtest.shape[0]
    fold_test_skf = np.empty((folds, n_test))

    for i, (train_index, valid_index) in enumerate(kfold.split(xtrain, ytrain)):
        # The held-out part is the same for every shuffle of this fold.
        x_valid, y_valid = xtrain[valid_index], ytrain[valid_index]
        fold_test_shf = np.empty((shuffles, n_test))

        for j in range(shuffles):
            print("Fold:",i ,"Shuffle", j)
            # Only the training part is augmented; validation data stays untouched.
            x_aug, y_aug = shuffle(xtrain[train_index], ytrain[train_index])
            train_data = lgb.Dataset(x_aug, label=y_aug)
            valid_data = lgb.Dataset(x_valid, label=y_valid)

            # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments are
            # believed to be replaced by callbacks in newer LightGBM releases -- confirm
            # against the installed version.
            model = lgb.train(parameters, train_data, rounds, valid_sets=[train_data, valid_data],
                              verbose_eval=verbose_eval, early_stopping_rounds=early_stopping_rounds)

            fold_test_shf[j, :] = model.predict(xtest, num_iteration=model.best_iteration)

        fold_test_skf[i, :] = fold_test_shf.mean(axis=0)

    return fold_test_skf.mean(axis=0).reshape(-1, 1)

In [None]:
# Train the K-fold ensemble and collect its averaged test-set predictions.
lgbm_test_mean = lgbm_kfold(
    parameters=params_tuned,
    folds=NFOLDS,
    shuffles=NSHUFFLES,
    rounds=50000,
    xtrain=X_train,
    ytrain=y_train,
    xtest=X_test,
    seed=SEED,
)

In [None]:
# Build the submission table (id column plus predicted target) and write it as CSV.
id_codes = test_df["ID_code"].values
sub_df = pd.DataFrame({"ID_code": id_codes})
sub_df['target'] = lgbm_test_mean
sub_df.to_csv("submission.csv", index=False)