In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Collect the full path of every file found anywhere below the Kaggle input directory.
path_to_file = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

path_to_file

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# The files are picked by their position in `path_to_file`, which follows os.walk's
# traversal order. NOTE(review): confirm that entry 1 is really the training file and
# entry 0 the test file in the target environment.
test_csv, train_csv = path_to_file[0], path_to_file[1]

# The last row of the training file is dropped.
train = pd.read_csv(train_csv).iloc[:-1, :]
test = pd.read_csv(test_csv)

In [None]:
# Stack the training rows (minus their last column, which is used as the target later on)
# on top of the test rows so both go through the same preprocessing.
train_features = train.iloc[:, :-1]
df_all = pd.concat([train_features, test], axis=0)
df_all.shape

In [None]:
# Remove columns that contain no values at all, then the 'ID' column.
df_all = (
    df_all
    .dropna(axis=1, how='all')
    .drop('ID', axis=1)
)
df_all.shape

In [None]:
# Numeric features: standardise every column, then fill the remaining gaps with the
# median of the (already scaled) column.
X_num = df_all.select_dtypes(['int', 'float'])

scaler = StandardScaler()
scaled_values = scaler.fit_transform(X_num.values)
X_num = pd.DataFrame(scaled_values, index=X_num.index, columns=X_num.columns)

impute = SimpleImputer(missing_values=np.nan, strategy='median', copy=False)
imputed_values = impute.fit(X_num).transform(X_num)

# Built without an explicit index, so the result carries a fresh 0..n-1 RangeIndex.
real_imputer = pd.DataFrame(imputed_values, columns=X_num.columns)
real_imputer.head()

In [None]:
# Categorical features: one-hot encode the object columns that have between 2 and 20
# distinct non-null values.
X_cat = df_all.select_dtypes('object')
n_levels = X_cat.nunique(dropna=True)
cat_cols = [col for col in X_cat.columns if 2 <= n_levels[col] < 21]
data_dummies = pd.get_dummies(X_cat[cat_cols])
data_dummies.shape

In [None]:
# Join the numeric and the one-hot encoded features side by side (one row per sample).
# The two frames describe the same rows in the same order but carry different indexes
# (a fresh RangeIndex vs. the stacked train/test labels, which contain duplicates), so
# they are aligned by position rather than by label.
numeric_part = real_imputer.reset_index(drop=True)
dummy_part = data_dummies.reset_index(drop=True)
assert len(numeric_part) == len(dummy_part), "numeric and categorical blocks must have the same number of rows"

df_concat = pd.concat([numeric_part, dummy_part], axis=1)
# Restore the original row labels so downstream cells can still trace rows back to their source.
df_concat.index = data_dummies.index
df_concat = df_concat.fillna(0)
df_concat.shape

In [None]:
# Preview the first rows of the combined feature matrix.
df_concat.head(5)

In [None]:
# Split the combined matrix back by position: the first len(train) rows go to the
# training part, everything after them to the test part.
n_train_rows = len(train)
df_train = df_concat.iloc[:n_train_rows, :]
df_test = df_concat.iloc[n_train_rows:, :]
df_train.shape, df_test.shape

In [None]:
# Cross-validation splitter used by the hyperparameter search.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# The target is the last column of the raw training frame.
y_all = train.iloc[:, -1]

# Stratified 80/20 hold-out split. `random_state` is fixed (same seed as `skf`) so the
# split, and every score computed on it, is reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    df_train,
    y_all,
    stratify=y_all,
    shuffle=True,
    test_size=0.2,
    random_state=17,
)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
# Search space for the gradient boosting model (12 combinations in total).
params = {
    'n_estimators': [60, 80, 100],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 4],
}
# Number of parameter combinations sampled from the space above.
n_iter_search = 10

clf = GradientBoostingClassifier(random_state=17)
# `random_state` fixes which combinations are sampled, so repeated runs of the search
# evaluate the same candidates and report the same best parameters.
rand_cv = RandomizedSearchCV(
    clf,
    param_distributions=params,
    scoring='roc_auc',
    n_iter=n_iter_search,
    cv=skf,
    n_jobs=-1,
    random_state=17,
)

In [None]:
#%%time
#rand_cv.fit(X_train, y_train)
#rand_cv.best_params_, rand_cv.best_score_

In [None]:
#rand_cv.best_params_, rand_cv.best_score_
# {'n_estimators': 60, 'max_depth': 3, 'learning_rate': 0.1}

#({'n_estimators': 50, 'max_depth': 4, 'learning_rate': 0.10033333333333333},
# 0.7287491611419714)

In [None]:
#rand_cv.best_estimator_
# Hard-coded model, presumably copied from the repr of an earlier search result.
# Only the hyperparameters that matter are spelled out. The keyword arguments that were
# dropped were all set to their library defaults, and several of them (`presort`,
# `min_impurity_split`, `loss='deviance'`) are no longer accepted by current scikit-learn
# releases, so passing them would make this cell fail.
estimator = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=60,
    random_state=0,
)


In [None]:
#model = rand_cv.best_estimator_
#model.fit(X_train, y_train)
#pred = model.predict_proba(X_val)[:, 1]
#roc_auc_score(y_val, pred)

In [None]:
def write_to_submission_file(predicted_labels, out_file, target='result', index_label="Id", index=None):
    """Write predictions to a CSV file with one row per prediction.

    Parameters
    ----------
    predicted_labels : array-like
        Predicted values, one per output row.
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str, default 'result'
        Name of the column holding the predictions.
    index_label : str, default "Id"
        Header written for the index column.
    index : array-like, optional
        Row labels for the output. When omitted, the index of the notebook-level
        ``df_test`` frame is used.
    """
    if index is None:
        # Fall back to the notebook-level test frame.
        index = df_test.index
    predicted_df = pd.DataFrame(predicted_labels, index=index, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
# Fresh, unfitted model for the final fit. Only the hyperparameters that matter are
# spelled out. The keyword arguments that were dropped were all set to their library
# defaults, and several of them (`presort`, `min_impurity_split`, `loss='deviance'`) are
# no longer accepted by current scikit-learn releases, so passing them would make this
# cell fail.
estimator = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=3,
    n_estimators=60,
    random_state=0,
)

In [None]:
# The randomized search is commented out above, so `rand_cv` is normally unfitted and
# has no `best_estimator_`. Use the search result when it exists, otherwise fall back to
# the hard-coded `estimator`, so the notebook survives Restart & Run All.
model = getattr(rand_cv, "best_estimator_", estimator)

# Final fit on the full training matrix; the target is the last column of the raw frame.
model.fit(df_train, train.iloc[:, -1])

# Keep column 1 of predict_proba, i.e. the probability of the second class.
pred = model.predict_proba(df_test)[:, 1]

In [None]:

#np.arange(1, pred.shape[0] + 1)

In [None]:
# Save the test-set predictions as the submission CSV.
write_to_submission_file(pred, out_file="subm1.csv")