In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import tensorflow as tf
from sklearn.metrics import precision_recall_curve, auc
from keras.layers import Dropout, BatchNormalization, Activation
from keras.regularizers import l1_l2, l1
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score


import os
# Show what is available in ../input, the directory the CSV files are read from.
print(os.listdir("../input"))

In [None]:
# Read in the data
# train.csv carries the `target` column; test.csv holds the rows the submission is built for.
full_train_set = pd.read_csv('../input/train.csv')
submission_test_set = pd.read_csv('../input/test.csv')

In [None]:
# Separate the label column from everything else (ID_code is still a column of X here).
X = full_train_set.drop(columns='target')
y = full_train_set['target']
X.head()

In [None]:
# Number of distinct values per column. The first two training columns and the
# first test column are sliced off so that only the remaining columns are counted.
train_unique_values = [
    full_train_set[col].unique().shape[0] for col in full_train_set
][2:]

test_unique_values = [
    submission_test_set[col].unique().shape[0] for col in submission_test_set
][1:]

len(train_unique_values), len(test_unique_values)

In [None]:
# https://www.kaggle.com/yag320/list-of-fake-samples-and-public-private-lb-split
# Work on a plain array of the test features (ID_code removed); the frame itself is untouched.
df_test = submission_test_set.drop(['ID_code'], axis=1).values

unique_samples = []
# unique_count[row, col] is set to 1 when the value at (row, col) occurs exactly once in its column.
unique_count = np.zeros_like(df_test)
for feature in tqdm(range(df_test.shape[1])):
    _, index_, count_ = np.unique(df_test[:, feature], return_index=True, return_counts=True)
    singleton_rows = index_[count_ == 1]
    unique_count[singleton_rows, feature] += 1

# Rows with at least one column-unique value are treated as real, all other rows as synthetic.
unique_per_row = np.sum(unique_count, axis=1)
real_samples_indexes = np.flatnonzero(unique_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(unique_per_row == 0)

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

In [None]:
# Select, by position, the test rows that were flagged as synthetic.
dropped_rows = submission_test_set.iloc[synthetic_samples_indexes]

In [None]:
# The synthetic rows are NOT dropped here: the drop below is left commented out
# ("try this afterwards"), so this cell only reports the shape of the test set.
#submission_test_set.drop(synthetic_samples_indexes, inplace=True) try this afterwards
submission_test_set.shape

In [None]:
# Move ID_code to index
# Guarded and non-inplace so the cell can be executed more than once: an in-place
# set_index raises KeyError on the second run because the column is already gone.
if 'ID_code' in X.columns:
    X = X.set_index('ID_code')
if 'ID_code' in submission_test_set.columns:
    submission_test_set = submission_test_set.set_index('ID_code')

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on X and replace X with its scaled version; later cells index X
# positionally (X[train_index]).
# NOTE(review): the scaler is fitted on every row before the train/test split in the
# next cell, so held-out rows contribute to the scaling statistics — confirm this is intended.
sc = StandardScaler()
X = sc.fit_transform(X)
X

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified split with a fixed seed: 20% of the rows are held out as the test part.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_index, test_index = next(sss.split(X, y))
X_train, y_train = X[train_index], y.iloc[train_index]
X_test, y_test = X[test_index], y.iloc[test_index]

#### New model architecture

In [None]:
# Model architecture base idea from 
# https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/applications/porto_seguro_keras_under_sampling.html
def make_model(input_dim=200):
    """Build and compile the fully connected binary classifier.

    The network stacks four Dense/ReLU blocks (200, 100, 50 and 25 units), each
    followed by batch normalisation and dropout (0.5, 0.25, 0.15 and 0.1), and
    ends in a single sigmoid unit.

    Parameters
    ----------
    input_dim : int, default 200
        Number of input columns the first layer expects. The default keeps the
        behaviour of calling ``make_model()`` with no arguments unchanged.

    Returns
    -------
    A compiled ``Sequential`` model (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    """
    # (units, dropout rate) of the hidden blocks, in order.
    hidden_blocks = ((200, 0.5), (100, 0.25), (50, 0.15), (25, 0.1))

    model = Sequential()
    for position, (units, drop_rate) in enumerate(hidden_blocks):
        if position == 0:
            # Only the first layer needs to be told the input width.
            model.add(Dense(units, input_dim=input_dim, activation='relu'))
        else:
            model.add(Dense(units, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(drop_rate))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    """Fit a freshly built model on the given training data and return it.

    X_test and y_test are accepted but not used. 20% of the training data is
    set aside through Keras' ``validation_split``.
    """
    net = make_model()
    fit_options = dict(epochs=30, verbose=2, batch_size=32, validation_split=0.2)
    net.fit(X_train, y_train, **fit_options)
    return net

In [None]:
# Train on the training part of the hold-out split; the test arguments are passed but unused by the helper.
trained_model = fit_predict_imbalanced_model(X_train, y_train, X_test, y_test)

In [None]:
# Score the model on the held-out rows: area under the ROC curve of its predictions.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

#### Try balanced minibatches

In [None]:
from imblearn.keras import BalancedBatchGenerator

def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    """Fit a freshly built model from a BalancedBatchGenerator and return it.

    Batches of 32 rows are drawn by the generator (seeded with 42) for 100
    epochs. X_test and y_test are accepted but not used.
    """
    net = make_model()
    balanced_batches = BalancedBatchGenerator(
        X_train,
        y_train,
        batch_size=32,
        random_state=42,
    )
    net.fit_generator(generator=balanced_batches, epochs=100, verbose=2)
    return net

In [None]:
# Train with balanced mini-batches; kept under its own name so `trained_model` is not overwritten.
balanced_trained_model = fit_predict_balanced_model(X_train, y_train, X_test, y_test)

In [None]:
# AUROC of the balanced-batch model on the held-out rows (shown as the cell output).
y_pred = balanced_trained_model.predict(X_test)
roc_auc_score(y_test, y_pred)

#### Class weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# Keras expects `class_weight` as a dict {class index: weight}; compute_class_weight
# returns a bare array, so the array is paired with its class labels here.
# Keyword arguments are used because recent scikit-learn releases reject positional ones.
class_labels = np.array([0, 1])
balanced_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)
weights = {int(label): float(weight) for label, weight in zip(class_labels, balanced_weights)}

In [None]:
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    """Fit a freshly built model with per-class loss weights and return it.

    The notebook-level ``weights`` object is passed to Keras as ``class_weight``.
    X_test and y_test are accepted but not used. This definition replaces the
    earlier function of the same name.
    """
    net = make_model()
    fit_options = dict(
        epochs=10,
        verbose=2,
        batch_size=1000,
        validation_split=0.2,
        class_weight=weights,
    )
    net.fit(X_train, y_train, **fit_options)
    return net

In [None]:
# Train the class-weighted variant on the training part of the hold-out split.
trained_model = fit_predict_imbalanced_model(X_train, y_train, X_test, y_test)

In [None]:
# AUROC of the class-weighted model on the held-out rows.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

#### Undersampling 1:1

In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Randomly undersample the training split with sampling_strategy=1 (the 1:1 ratio of the section title).
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

In [None]:
# Rows per class after resampling.
np.bincount(y_res)

In [None]:
def fit_predict_undersampled_model(X_train, y_train):
    """Fit a freshly built model on resampled data (batches of 32) and return it.

    20% of the supplied rows are set aside through Keras' ``validation_split``.
    """
    net = make_model()
    fit_options = dict(epochs=30, verbose=2, batch_size=32, validation_split=0.2)
    net.fit(X_train, y_train, **fit_options)
    return net

In [None]:
# Train on the undersampled training data.
trained_model = fit_predict_undersampled_model(X_res, y_res)

In [None]:
# AUROC on the held-out rows, which were not resampled.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

#### Undersample 1:2

In [None]:
# Same undersampler with sampling_strategy=0.5 (the 1:2 ratio of the section title).
rus = RandomUnderSampler(sampling_strategy=0.5, random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)

In [None]:
# Rows per class after resampling.
np.bincount(y_res)

In [None]:
def fit_predict_undersampled_model(X_train, y_train):
    """Fit a freshly built model on resampled data (batches of 1000) and return it.

    Replaces the earlier function of the same name; only the batch size differs.
    20% of the supplied rows are set aside through Keras' ``validation_split``.
    """
    net = make_model()
    fit_options = dict(epochs=30, verbose=2, batch_size=1000, validation_split=0.2)
    net.fit(X_train, y_train, **fit_options)
    return net

In [None]:
# Train on the undersampled training data.
trained_model = fit_predict_undersampled_model(X_res, y_res)

In [None]:
# AUROC on the held-out rows, which were not resampled.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

#### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split with SMOTE (default settings, fixed seed).
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)

In [None]:
# Rows per class after resampling.
np.bincount(y_res)

In [None]:
def fit_predict_oversampled_model(X_train, y_train):
    """Fit a freshly built model on oversampled data and return it.

    Uses batches of 1000 rows for 30 epochs; 20% of the supplied rows are set
    aside through Keras' ``validation_split``.
    """
    net = make_model()
    fit_options = dict(epochs=30, verbose=2, batch_size=1000, validation_split=0.2)
    net.fit(X_train, y_train, **fit_options)
    return net

In [None]:
# Train on the SMOTE-resampled training data.
trained_model = fit_predict_oversampled_model(X_res, y_res)

In [None]:
# Repeat SMOTE + training on five stratified resamples (80%) of the training split.
# NOTE(review): every model is scored on the fixed hold-out set (X_test / y_test);
# the per-split validation part (X_val_cv / y_val_cv) is built but never used.
aurocs = []
counter = 1
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
for train_index, val_index in sss.split(X_train, y_train):
    print("Starting fold %i" % counter)
    counter += 1
    X_train_cv, y_train_cv = X_train[train_index], y_train.iloc[train_index]
    X_val_cv, y_val_cv = X_train[val_index], y_train.iloc[val_index]

    # Oversample only this split's training part, then fit a fresh network on it.
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X_train_cv, y_train_cv)
    trained_model = fit_predict_oversampled_model(X_res, y_res)

    y_pred = trained_model.predict(X_test)
    auroc = roc_auc_score(y_test, y_pred)
    aurocs.append(auroc)
    print(auroc)

In [None]:
# AUROC on the held-out rows for whichever model `trained_model` currently refers to.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

#### SMOTEENN

In [None]:
from imblearn.combine import SMOTEENN

In [None]:
# NOTE(review): in the visible cell order no SMOTEENN resampling has run yet, so
# y_res is still whatever an earlier cell left behind — confirm this is the intended display.
np.bincount(y_res)

In [None]:
def fit_predict_oversampled_model(X_train, y_train):
    """Fit a freshly built model on resampled data and return it.

    Replaces the earlier function of the same name: batches of 500 rows,
    30 epochs, and no validation split.
    """
    net = make_model()
    fit_options = dict(epochs=30, verbose=2, batch_size=500)
    net.fit(X_train, y_train, **fit_options)
    return net

In [None]:
# Repeat SMOTEENN + training on five stratified resamples (80%) of the training split.
# NOTE(review): every model is scored on the fixed hold-out set (X_test / y_test);
# the per-split validation part (X_val_cv / y_val_cv) is built but never used.
aurocs = []
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
counter = 1
for train_index, val_index in sss.split(X_train, y_train):
    print("Starting fold %i" % counter)
    X_train_cv, X_val_cv = X_train[train_index], X_train[val_index]
    y_train_cv, y_val_cv = y_train.iloc[train_index], y_train.iloc[val_index]
    
    sm = SMOTEENN(random_state=42)
    X_res, y_res = sm.fit_resample(X_train_cv, y_train_cv)
    # Printed explicitly: a bare expression inside a loop body is never displayed by the notebook.
    print(np.bincount(y_res))
    trained_model = fit_predict_oversampled_model(X_res, y_res)
    y_pred = trained_model.predict(X_test)
    auroc = roc_auc_score(y_test, y_pred)
    print(auroc)
    aurocs.append(auroc)
    counter += 1

In [None]:
# AUROC on the held-out rows for whichever model `trained_model` currently refers to.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

#### Data augmentation

In [None]:
# Re-read the labels from the raw training frame and keep a copy of the rows with target > 0.
y = full_train_set['target']
# Not the last expression of the cell, so this head() is evaluated but not displayed.
y.head()
X_filtered = full_train_set[y > 0].copy()
X_filtered.shape

In [None]:
# augment method from here: https://www.kaggle.com/roydatascience/eda-pca-simple-lgbm-on-kfold-technique
def augment(x,y,t=2,verbose=False):
    """Augment a binary-labelled feature matrix by shuffling columns within each class.

    For the rows with ``y > 0`` the function creates ``t`` extra copies, and for
    the rows with ``y == 0`` it creates ``t // 2`` extra copies. In every copy
    each column is permuted independently among the rows of that class, so the
    per-class values of a column are kept but their pairing across columns is not.

    Parameters
    ----------
    x : numpy.ndarray, 2-D
        Feature matrix (rows are samples).
    y : numpy.ndarray, 1-D
        Labels aligned with the rows of ``x``.
    t : int, default 2
        Number of shuffled copies of the positive rows; negatives get ``t // 2``.
        Values below 2 are supported and simply add no negative copies.
    verbose : bool, default False
        When true, print the progress markers "First loop", "Second loop" and
        "Final part".

    Returns
    -------
    (x_aug, y_aug) : tuple of numpy.ndarray
        The original rows followed by the positive copies and then the negative
        copies, with matching labels (1.0 for positive copies, 0.0 for negative).

    Notes
    -----
    Shuffling uses the global NumPy random state (``np.random.shuffle``); seed it
    with ``np.random.seed`` beforehand for reproducible output.
    """
    def _shuffled_copies(mask, n_copies, label):
        # Return n_copies arrays of x[mask] with every column permuted independently.
        copies = []
        for _ in range(n_copies):
            if verbose:
                print(label)
            x1 = x[mask].copy()
            ids = np.arange(x1.shape[0])
            for c in range(x1.shape[1]):
                np.random.shuffle(ids)
                # Index the single column directly; x1[ids][:,c] materialised a
                # full copy of the matrix for every column.
                x1[:,c] = x1[ids,c]
            copies.append(x1)
        return copies

    xs = _shuffled_copies(y>0, t, "First loop")
    xn = _shuffled_copies(y==0, t//2, "Second loop")
    if verbose:
        print("Final part")
    ys = np.ones(sum(part.shape[0] for part in xs))
    yn = np.zeros(sum(part.shape[0] for part in xn))
    # Stack everything in one call: this also works when xs or xn is empty
    # (t < 2), where stacking an empty list on its own raises ValueError.
    x = np.vstack([x] + xs + xn)
    y = np.concatenate([y,ys,yn])
    return x,y

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# .copy() so that adding the 'predict' column does not write into a slice of full_train_set.
oof = full_train_set[['ID_code', 'target']].copy()
oof['predict'] = 0
# ID_code is moved into the index of submission_test_set by an earlier cell;
# reset_index() exposes it as a column again whether or not that cell has run.
predictions = submission_test_set.reset_index()[['ID_code']]
val_aucs = []
feature_importance_df = pd.DataFrame()

In [None]:
# Every training column except the label and the identifier is a model input.
non_feature_columns = ('target', 'ID_code')
features = [name for name in full_train_set.columns if name not in non_feature_columns]
# NOTE(review): this rebinds X_test (until now the hold-out split) to the raw
# submission features — confirm the name reuse is intended.
X_test = submission_test_set[features].values

In [None]:
# NOTE(review): in the visible cell order this cell runs before X_t / y_t are assigned
# (that happens inside the loop of the next cell), and it repeats the tail of that loop
# body. It looks like a leftover from interactive editing — confirm whether it can be removed.
trained_model = fit_predict_oversampled_model(X_t, y_t) # Maybe remove validation split inside here
y_pred = trained_model.predict(X_val_cv)

val_score = roc_auc_score(y_val_cv, y_pred)
aurocs.append(val_score)
print(val_score)
counter += 1

In [None]:
# One stratified 80/20 split of the raw training frame: augment the training part,
# fit a fresh model on it, and score the model on the untouched validation part.
aurocs = []
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
counter = 1
for train_index, val_index in sss.split(full_train_set, full_train_set['target']):
    print("Starting fold %i" % counter)
    X_train_cv, X_val_cv = full_train_set.iloc[train_index][features], full_train_set.iloc[val_index][features]
    y_train_cv, y_val_cv = full_train_set.iloc[train_index]['target'], full_train_set.iloc[val_index]['target']
    
    print(len(y_train_cv.values))
    X_t, y_t = augment(X_train_cv.values, y_train_cv.values)
    print(len(X_t))
    print(y_t)
    # Rebuild a frame whose columns are named var_0, var_1, ... from the augmented array.
    X_t = pd.DataFrame(X_t)
    X_t = X_t.add_prefix('var_')
    
    # Class counts after augmentation. Printed explicitly because a bare expression
    # inside a loop body is never displayed by the notebook.
    y_int = y_t.astype(int)
    print(np.bincount(y_int))
    
    trained_model = fit_predict_oversampled_model(X_t, y_t) # Maybe remove validation split inside here
    y_pred = trained_model.predict(X_val_cv)

    val_score = roc_auc_score(y_val_cv, y_pred)
    aurocs.append(val_score)
    print(val_score)
    counter += 1

In [None]:
# Report the validation AUROC of the most recent fit. The loop in the previous cell
# already appends each score to `aurocs` and advances `counter`, so this cell only
# reads state and can be re-run without recording the same score twice.
val_score = roc_auc_score(y_val_cv, y_pred)
print(val_score)

#### Trying feature magic

In [None]:
# Feature magic from here: https://www.kaggle.com/dott1718/922-in-3-minutes
features = [x for x in full_train_set.columns if x.startswith("var")]

# For every test row and feature: does the value occur more than once across train + test?
# pd.concat is used because Series.append no longer exists in pandas 2.x, and the frame
# is built in one go instead of being grown one column at a time.
hist_df = pd.DataFrame({
    var: submission_test_set[var].map(
        pd.concat([full_train_set[var], submission_test_set[var]]).value_counts()
    ) > 1
    for var in features
})

# Keep the test rows where at least one feature value is unique across train + test.
ind = hist_df.sum(axis=1) != len(features)
# Slice the test frame once rather than once per feature.
selected_test_rows = submission_test_set[ind]
# Per feature: how often each value occurs in train plus the selected test rows.
var_stats = {
    var: pd.concat([full_train_set[var], selected_test_rows[var]]).value_counts()
    for var in features
}

In [None]:
# For every var_* column add a count_var_* column holding how often the row's value
# occurs according to var_stats; new_features lists the original and the added columns.
new_features = list(features)

for feature in features:
    print(feature)
    count_column = 'count_' + feature
    full_train_set[count_column] = var_stats[feature][full_train_set[feature]].values
    new_features.append(count_column)

In [None]:
def make_model_new():
    """Build and compile the binary classifier for 400 input columns.

    Same stack as the 200-input model: Dense/ReLU blocks of 200, 100, 50 and
    25 units, each followed by batch normalisation and dropout (0.5, 0.25,
    0.15, 0.1), then one sigmoid unit. Compiled with binary cross-entropy,
    Adam and the accuracy metric.
    """
    # First block carries the input width; the remaining blocks only differ in size and dropout.
    stack = [Dense(200, input_dim=400, activation='relu'), BatchNormalization(), Dropout(0.5)]
    for units, drop_rate in ((100, 0.25), (50, 0.15), (25, 0.1)):
        stack += [Dense(units, activation='relu'), BatchNormalization(), Dropout(drop_rate)]
    stack.append(Dense(1, activation='sigmoid'))

    model = Sequential(stack)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified 80/20 split over the extended feature set (original + count columns).
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
train_index, test_index = next(sss.split(full_train_set[new_features], full_train_set['target']))
X_train = full_train_set[new_features].iloc[train_index]
X_test = full_train_set[new_features].iloc[test_index]
y_train = full_train_set['target'].iloc[train_index]
y_test = full_train_set['target'].iloc[test_index]

In [None]:
def fit_new_features_model(X_train, y_train):
    """Fit a freshly built 400-input model (30 epochs, batches of 128) and return it."""
    net = make_model_new()
    fit_options = dict(epochs=30, verbose=2, batch_size=128)
    net.fit(X_train, y_train, **fit_options)
    return net

trained_model = fit_new_features_model(X_train, y_train)

In [None]:
# AUROC of the extended-feature model on its held-out rows.
y_pred = trained_model.predict(X_test)
auroc = roc_auc_score(y_test, y_pred)
print(auroc)

In [None]:
# Scale the submission features with the scaler that was already fitted on the
# training features. Re-fitting on the test set (fit_transform) would put the
# inputs on a different scale from the one the models were trained on.
X_final_test = sc.transform(submission_test_set)
# Preview the transformed rows rather than the training matrix.
X_final_test[:5]

In [None]:
# NOTE(review): `classifier` is not assigned anywhere in the visible part of the notebook;
# confirm which trained model is meant before running this cell on a fresh kernel.
y_submission_predict = classifier.predict(X_final_test)
print(y_submission_predict)
#y_submission_predict = y_submission_predict.astype(int)

In [None]:
# Keep the first element of every prediction row, giving one value per test row.
y_final_predict = [value[0] for value in y_submission_predict]

In [None]:
# Pair each ID (taken from the index of the test frame) with its predicted value.
submission_variable = pd.DataFrame({'ID_code' : submission_test_set.index.values, 'target': y_final_predict})

In [None]:
# Write the submission file without the frame's index column.
submission_variable.to_csv('csv_to_submit.csv', index=False)

<a href="csv_to_submit.csv"> Download File </a>