In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
sub = pd.read_csv('../input/sample_submission.csv')

cols = [c for c in train.columns if c not in ['id', 'target', 'wheezy-copper-turtle-magic']]

In [3]:
oof_qda_0 = np.zeros(len(train))
preds_qda = np.zeros(len(test))
params = [{'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5]}]

# 512 models
reg_params = np.zeros(512)
for i in tqdm_notebook(range(512)):
    train_magic = train[train['wheezy-copper-turtle-magic']==i]
    test_magic = test[test['wheezy-copper-turtle-magic']==i]
    train_magic_idx = train_magic.index
    test_magic_idx = test_magic.index
    train_magic.reset_index(drop=True,inplace=True)

    full_data = pd.concat([pd.DataFrame(train_magic[cols]), pd.DataFrame(test_magic[cols])])
    pipe = Pipeline([('vt', VarianceThreshold(threshold=1.5)), ('scaler', StandardScaler())])
    full_data_vt = pipe.fit_transform(full_data[cols])
    train_magic_vt = full_data_vt[:train_magic.shape[0]]
    test_magic_vt = full_data_vt[train_magic.shape[0]:]

    skf = StratifiedKFold(n_splits=11, random_state=42)
    for train_index, valid_index in skf.split(train_magic, train_magic['target']):
        X_train_2 = train_magic_vt[train_index,:]
        X_valid_2 = train_magic_vt[valid_index,:]
        y_train = train_magic.loc[train_index, 'target']

        qda_clf = QuadraticDiscriminantAnalysis()
        grid_search_clf = GridSearchCV(qda_clf, params, cv=4)
        grid_search_clf.fit(X_train_2, y_train)
        reg_params[i] = grid_search_clf.best_params_['reg_param']
        
        oof_qda_0[train_magic_idx[valid_index]] = grid_search_clf.predict_proba(X_valid_2)[:,1]
        preds_qda[test_magic_idx] += grid_search_clf.predict_proba(test_magic_vt)[:,1] / skf.n_splits

HBox(children=(IntProgress(value=0, max=512), HTML(value='')))




In [4]:
print('QDA', 'ROC AUC: {0:.5}'.format(roc_auc_score(train['target'], oof_qda_0)))

QDA ROC AUC: 0.96462


In [5]:
sub['target'] = preds_qda
sub.to_csv('qda.csv', index=False)

In [6]:
for itr in range(10):
    test['target'] = preds_qda
    test.loc[test['target'] > 0.915, 'target'] = 1
    test.loc[test['target'] < 0.085, 'target'] = 0
    pl_test = test[(test['target'] == 1) | (test['target'] == 0)]
    new_train = pd.concat([train, pl_test])
    new_train.reset_index(drop=True, inplace=True)
    new_train.loc[oof_qda_0 > 0.995, 'target'] = 1
    new_train.loc[oof_qda_0 < 0.005, 'target'] = 0
    print(pl_test.shape[0], "Test Records added for iteration number {}".format(itr))
    
    oof_qda = np.zeros(len(new_train))
    preds_qda = np.zeros(len(test))
    
    for i in tqdm_notebook(range(512)):
        train_magic = new_train[new_train['wheezy-copper-turtle-magic']==i]
        test_magic = test[test['wheezy-copper-turtle-magic']==i]
        train_magic_idx = train_magic.index
        test_magic_idx = test_magic.index
        train_magic.reset_index(drop=True,inplace=True)
        
        full_data = pd.concat([pd.DataFrame(train_magic[cols]), pd.DataFrame(test_magic[cols])])
        pipe = Pipeline([('vt', VarianceThreshold(threshold=1.5)), ('scaler', StandardScaler())])
        full_data_vt = pipe.fit_transform(full_data[cols])
        train_magic_vt = full_data_vt[:train_magic.shape[0]]
        test_magic_vt = full_data_vt[train_magic.shape[0]:]

        skf = StratifiedKFold(n_splits=11, random_state=42)
        for train_index, valid_index in skf.split(train_magic, train_magic['target']):
            X_train_2 = train_magic_vt[train_index,:]
            X_valid_2 = train_magic_vt[valid_index,:]
            y_train = train_magic.loc[train_index, 'target']
            
            qda_clf = QuadraticDiscriminantAnalysis(reg_params[i])
            qda_clf.fit(X_train_2, y_train)

            oof_qda[train_magic_idx[valid_index]] = qda_clf.predict_proba(X_valid_2)[:,1]
            preds_qda[test_magic_idx] += qda_clf.predict_proba(test_magic_vt)[:,1] / skf.n_splits
    
    print('QDA', 'ROC AUC: {0:.5}'.format(roc_auc_score(train['target'], oof_qda[:len(train)])))
    
    sub['target'] = preds_qda
    sub.to_csv('qda_{}.csv'.format(itr), index=False)

109666 Test Records added for iteration number 0


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97113
121408 Test Records added for iteration number 1


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97054
122234 Test Records added for iteration number 2


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97048
122333 Test Records added for iteration number 3


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97047
122352 Test Records added for iteration number 4


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97047
122359 Test Records added for iteration number 5


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97047
122360 Test Records added for iteration number 6


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))


QDA ROC AUC: 0.97047
122361 Test Records added for iteration number 7


HBox(children=(IntProgress(value=0, max=512), HTML(value='')))