In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

import os
# Show which files are available in the input directory
print(os.listdir("../input"))

In [None]:
# Read in the data: the training set and the test set that is later used to build the submission
full_train_set = pd.read_csv('../input/train.csv')
submission_test_set = pd.read_csv('../input/test.csv')

In [None]:
# Split the training frame into the label column (y) and all remaining columns (X)
y = full_train_set['target']
X = full_train_set.drop(columns=['target'])
X.head()

In [None]:
# Preview the first rows of the submission test set
submission_test_set.head()

In [None]:
# Check class balance by counting the rows per target value (author's note: imbalance is about 1:9)
y.value_counts()

In [None]:
# Compare (rows, columns) of the training features and the submission test set
X.shape, submission_test_set.shape

In [None]:
# Pairwise plots for the first 100 rows and the first 7 columns of the training set
sns.pairplot(full_train_set.iloc[0:100, 0:7])

In [None]:
# Correlation heatmap over the first 7 columns of the training set.
# Non-numeric columns are filtered out explicitly: DataFrame.corr() in pandas >= 2.0
# raises on them instead of silently skipping them as older versions did.
sns.heatmap(full_train_set.iloc[:, 0:7].select_dtypes(include='number').corr(), annot=True)

In [None]:
# Number of distinct var_0 values in the training set (shown as a 1-tuple)
full_train_set.var_0.unique().shape

In [None]:
# Number of distinct var_0 values in the submission test set (shown as a 1-tuple)
submission_test_set.var_0.unique().shape

#### Is there a big difference in the number of unique `var_0` values between train and test?

In [None]:
# Distinct-value count for every column, in column order.
# The first two train columns and the first test column are skipped
# (presumably ID_code/target and ID_code respectively - verify against the CSV headers).
train_unique_values = [
    len(full_train_set[name].unique()) for name in full_train_set.columns
][2:]

test_unique_values = [
    len(submission_test_set[name].unique()) for name in submission_test_set.columns
][1:]

# Both lists should have the same length if they cover the same features
len(train_unique_values), len(test_unique_values)

In [None]:
# Distinct values per feature, train vs. test. The curves are labelled so the two
# lines can be told apart when the figure is read on its own.
plt.plot(train_unique_values, label='train')
plt.plot(test_unique_values, label='test')
plt.xlabel('feature position')
plt.ylabel('number of unique values')
plt.title('Unique values per feature')
plt.legend()
plt.show()

#### The test set has about half as many unique values as the train set for every feature!

In [None]:
# Separate real test rows from synthetic ones, following the approach in:
# https://www.kaggle.com/yag320/list-of-fake-samples-and-public-private-lb-split
df_test = submission_test_set.copy().drop(columns=['ID_code']).values

unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in tqdm(range(df_test.shape[1])):
    # index_ is the first position of every distinct value, count_ how often it occurs
    _, index_, count_ = np.unique(df_test[:, feature], return_index=True, return_counts=True)
    occurs_once = count_ == 1
    # Flag the cells whose value appears exactly once in this column
    unique_count[index_[occurs_once], feature] += 1

# A row counts as real if at least one of its values is unique within its column,
# and as synthetic (fake) otherwise
unique_hits_per_row = unique_count.sum(axis=1)
real_samples_indexes = np.flatnonzero(unique_hits_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(unique_hits_per_row == 0)

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

In [None]:
# Shape of the submission test set, for reference
submission_test_set.shape

In [None]:
# Dropping the synthetic rows is currently disabled (the drop below is commented out),
# so the shape displayed here is the same as before
#submission_test_set.drop(synthetic_samples_indexes, inplace=True)
submission_test_set.shape

In [None]:
# Hold out 20% of the data as a test set; the split is stratified on y so both
# parts keep the same class ratio
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices
train_index, test_index = next(sss.split(X, y))
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [None]:
# Carve a validation set out of the training part, reusing the same stratified 20% splitter.
# NOTE: this rebinds X_train / y_train from themselves, so running the cell twice shrinks them again.
train_index, val_index = next(sss.split(X_train, y_train))
X_val, y_val = X_train.iloc[val_index], y_train.iloc[val_index]
X_train, y_train = X_train.iloc[train_index], y_train.iloc[train_index]

In [None]:
# Move ID_code to index
def _with_id_index(frame):
    """Return `frame` indexed by ID_code; frames that no longer have the column are returned as-is.

    The guard makes this cell safe to re-run, and returning a new frame avoids
    in-place mutation of frames that were created as `.iloc` slices.
    """
    if 'ID_code' in frame.columns:
        return frame.set_index('ID_code')
    return frame


X_train = _with_id_index(X_train)
X_val = _with_id_index(X_val)
X_test = _with_id_index(X_test)
submission_test_set = _with_id_index(submission_test_set)

In [None]:
# Feature selection
#from boostaroota import BoostARoota
#r = BoostARoota(metric='aucpr')
#br = BoostARoota(metric='logloss')
#Fit the model for the subset of variables
#br.fit(X_train, y_train)

#Can look at the important variables - will return a pandas series
#br.keep_vars_

#Then modify dataframe to only include the important variables
#br.transform(x)

In [None]:
# Boruta
#from sklearn.ensemble import RandomForestClassifier
#from boruta import BorutaPy
#rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced', max_depth=6)
#boruta_selector = BorutaPy(rfc, n_estimators='auto', verbose=2)
#boruta_selector.fit(X_train.values, y_train.values)

In [None]:
import catboost

In [None]:
# Baseline CatBoost classifier, trained with default hyper-parameters as a reference point
from catboost import CatBoostClassifier, Pool, cv

baseline_params = dict(
    eval_metric='WKappa',
    random_seed=42,
    logging_level='Silent',
    use_best_model=True,
)
model = CatBoostClassifier(**baseline_params)

# The validation split is passed as eval_set; plot=True requests the training chart
model.fit(X_train, y_train, eval_set=(X_val, y_val), plot=True)

In [None]:
# Predict and find AUC on test set
from sklearn.metrics import roc_auc_score, confusion_matrix

# ROC AUC measures how well samples are ranked, so it must be fed scores rather than
# hard 0/1 labels. Column 1 of predict_proba is used as the positive-class probability
# (assumes the second class is the positive one - confirm via model.classes_).
y_test_proba = model.predict_proba(X_test)[:, 1]
roc_test_score = roc_auc_score(y_test, y_test_proba)
print(roc_test_score)

# The confusion matrix, in contrast, is defined on hard class labels
y_predict = model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
print(cm)

In [None]:
# Display the evenly spaced values 0, 5, ..., 95
np.arange(0, 100, 5)

In [None]:
from hyperopt import hp, Trials, fmin, tpe, STATUS_OK
from hyperopt.pyll.base import scope

# Search space for the CatBoost hyper-parameters.
# hp.quniform samples floats (e.g. 6.0), so the integer-valued parameters are wrapped
# in scope.int to reach the model as ints.
space = {
    'iterations': scope.int(hp.quniform('iterations', 500, 4000, 50)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'l2_leaf_reg': hp.quniform('l2_leaf_reg', 1, 50, 1),
    'random_strength' : hp.quniform('random_strength', 1, 50, 1),
    'depth' : scope.int(hp.quniform('depth', 1, 11, 1)),
    'early_stopping_rounds' : scope.int(hp.quniform('early_stopping_rounds', 0, 100, 5)),
    # od_type takes the name of an overfitting detector ('IncToDec' or 'Iter'), not a boolean;
    # 'Iter' is the type that goes with early_stopping_rounds
    'od_type' : 'Iter',
}

In [None]:
# Sanity check: draw one random point from the search space and print it
import hyperopt.pyll.stochastic
print(hyperopt.pyll.stochastic.sample(space))

In [None]:
import time

# Running trial number, used only for progress messages
counter = 1


def _catboost_params(params):
    """Turn one sampled hyperopt point into valid CatBoostClassifier keyword arguments."""
    cleaned = dict(params)
    # hp.quniform yields floats; these parameters are integer-valued
    for name in ('iterations', 'depth', 'early_stopping_rounds'):
        if name in cleaned:
            cleaned[name] = int(cleaned[name])
    # od_type must be a detector name string; drop any other value so a malformed
    # search space cannot abort the whole search
    if not isinstance(cleaned.get('od_type'), str):
        cleaned.pop('od_type', None)
    # Fixed settings belong in the constructor: assigning them as attributes after
    # construction does not change the parameters the model trains with
    cleaned.update(use_best_model=True, random_seed=42, logging_level='Silent')
    return cleaned


def objective(params):
    """Train one CatBoost model for a sampled hyper-parameter set and score it.

    The model is fitted on the existing X_train / y_train split and evaluated on
    X_val / y_val. X_test is deliberately not touched, so the hold-out set stays
    unseen during tuning. The baseline `model` is not modified.

    Args:
        params: dict of hyper-parameters sampled by hyperopt from `space`.

    Returns:
        dict with the hyperopt 'loss' (1 - AUROC), the 'auroc' itself, the sampled
        'params' and 'status'.
    """
    global counter
    start_time = time.time()
    print("Starting %s trial" % counter)

    # Train the candidate built from the sampled parameters (not the baseline model)
    clf = CatBoostClassifier(**_catboost_params(params))

    print("Started training...")
    clf.fit(X_train, y_train, eval_set=(X_val, y_val))

    # AUROC needs scores, not hard labels; column 1 is taken as the positive class
    # (assumes the second class is the positive one - confirm via clf.classes_)
    y_val_proba = clf.predict_proba(X_val)[:, 1]
    roc_test_score = roc_auc_score(y_val, y_val_proba)
    counter += 1

    end_time = time.time()
    duration = (end_time - start_time) / 60

    print(params)
    print("AUROC is %s, took %f minutes" % (roc_test_score, duration))
    return {'loss': 1 - roc_test_score, 'auroc': roc_test_score, 'params': params, 'status': STATUS_OK}


# Run the TPE search; `trials` keeps the full history of evaluated points
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
print(best)

In [None]:
# Predict class labels for the submission set with `model` and store them as integers
y_submission_predict = model.predict(submission_test_set).astype(int)

In [None]:
# Display the predicted labels
y_submission_predict

In [None]:
# Count how many submission rows were assigned to each integer label
np.bincount(y_submission_predict)

In [None]:
# Assemble the submission table: one row per ID (taken from the frame's index) with its predicted target
submission_variable = pd.DataFrame({
    'ID_code': submission_test_set.index.values,
    'target': y_submission_predict,
})

In [None]:
# Display the submission table
submission_variable

In [None]:
# Write the submission table to CSV without the DataFrame index column
submission_variable.to_csv('csv_to_submit.csv', index=False)

<a href="csv_to_submit.csv"> Download File </a>