In [1]:
import numpy as np
import pandas as pd
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn import metrics, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn import svm, neighbors, linear_model, neural_network
from sklearn.svm import NuSVC
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier, RadiusNeighborsRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression, SGDClassifier
import lightgbm as lgb
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
import datetime

from tqdm import tqdm
from sklearn.covariance import GraphicalLasso
from sklearn.mixture import GaussianMixture
import warnings
warnings.filterwarnings('ignore')

In [2]:
# two pre-trained csv file from https://www.kaggle.com/infinitewing/ensemble-v2?scriptVersionId=15845030
oof = pd.read_csv('../input/oof_pred_v3.csv')
oof_v5 = pd.read_csv('../input/oof_pred_v5.csv')
test_preds = pd.read_csv('../input/test_pred_v3.csv')
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
original_target = train['target'].copy().values

In [3]:
oof['target_mean'] = oof['oof_qda']*0.23 + \
                        oof['oof_gmm']*0.23 + \
                        oof['oof_nusvc']*0.23 + \
                        oof['oof_log']*0.23 + \
                        oof_v5['oof_knn']*0.08
print(roc_auc_score(original_target, oof['target_mean']))

0.9698331699467533


In [4]:
def knn_model(x,y,oof_x,test_x):
    model = KNeighborsClassifier(19,p=1.9,n_jobs=-1)
    model.fit(x,y)
    oof = model.predict_proba(oof_x)
    preds = model.predict_proba(test_x)
    return oof, preds
def qda_model(x,y,oof_x,test_x):
    model = QuadraticDiscriminantAnalysis(0.1)
    model.fit(x,y)
    oof = model.predict_proba(oof_x)
    pred = model.predict_proba(test_x)
    return oof, pred
def log_model(x,y,oof_x,test_x):
    model = LogisticRegression(solver='liblinear',penalty='l2',C=0.001,tol=0.0001,random_state=0,max_iter=1000,n_jobs=-1)
    model.fit(x,y)
    oof = model.predict_proba(oof_x)
    pred = model.predict_proba(test_x)
    return oof, pred

def nusvc_model(x,y,oof_x,test_x):
    model = NuSVC(probability=True, kernel='poly', degree=4, gamma='auto', random_state=4, nu=0.7, coef0=0.053)
    model.fit(x,y)
    oof = model.predict_proba(oof_x)
    pred = model.predict_proba(test_x)
    return oof, pred

def gmm_model(x,y,oof_x,test_x):
    def get_mean_cov(x,y):
        model = GraphicalLasso()
        ones = (y==1).astype(bool)
        x2 = x[ones]
        model.fit(x2)
        p1 = model.precision_
        m1 = model.location_
        
        onesb = (y==0).astype(bool)
        x2b = x[onesb]
        model.fit(x2b)
        p2 = model.precision_
        m2 = model.location_
        
        ms = np.stack([m1,m2])
        ps = np.stack([p1,p2])
        return ms,ps
        
    ms, ps = get_mean_cov(x,y)
    model = GaussianMixture(n_components=2, init_params='random', covariance_type='full', tol=0.001,reg_covar=0.001, max_iter=250, n_init=1,means_init=ms, precisions_init=ps)
    model.fit(np.concatenate([x,test_x],axis = 0))
    #model.fit(x)
    oof = model.predict_proba(oof_x)
    pred = model.predict_proba(test_x)
    
    tmp = oof[:,0].copy()
    oof[:,0] = oof[:,1]
    oof[:,1] = tmp
    
    tmp = pred[:,0].copy()
    pred[:,0] = pred[:,1]
    pred[:,1] = tmp
    
    return oof, pred

In [5]:
gmm = oof['oof_gmm'].rank().values / len(oof['oof_gmm'])
qda = oof['oof_qda'].rank().values / len(oof['oof_gmm'])
nusvc = oof['oof_nusvc'].rank().values / len(oof['oof_gmm'])
log = oof['oof_log'].rank().values / len(oof['oof_gmm'])

models = [qda, log, nusvc, gmm]
models_name = ['qda', 'log', 'nusvc', 'gmm']
models_threshold = [0.3, 0.2, 0.3, 0.3]

targets = oof['target'].copy().values
new_preds = np.array([[-1 for _ in range(len(models))] for _ in range(len(targets))]).astype('float')
print(new_preds.shape)
for i, (model, name, threshold) in enumerate(zip(models, models_name, models_threshold)):
    preds = model.copy().astype('float')
    # 把符合比例的preds id 記錄下來，之後比較variance，沒有差異的話就加入flip的行列
    # OOF的時候flip可以達到0.974x的acc，代表幾乎是正確的標籤
    preds[preds > 1-threshold] = 1
    preds[preds < threshold] = 0
    new_preds[:,i] = preds.astype('float')
    sure_idx = np.concatenate((np.where(preds==1)[0], np.where(preds==0)[0]))
    acc = accuracy_score(targets[sure_idx], new_preds[sure_idx,i])
    print('{} {}% acc: {}'.format(name, threshold, acc))
use_idx = []
use_preds = []
for i in range(new_preds.shape[0]):  
    use_preds.append(-1)
    if(i%50000==0): print(i)
    # 檢查不同模型是否對該預測點出現分歧，出現分歧者自然不加入flip的行列
    values = []
    for j in range(new_preds.shape[1]):
        if(new_preds[i,j] == 0 or new_preds[i,j] == 1):
            values.append(new_preds[i,j])
    if(len(values) == 0): continue
    if(np.std(np.array(values)) == 0 and len(values) > 1):
        use_idx.append(i)
        use_preds[i] = values[0]
use_idx = np.array(use_idx)    
use_preds = np.array(use_preds)    
print('Totally has {} ~97.4% accuracy daya'.format(len(use_idx)))


(262144, 4)
qda 0.3% acc: 0.9729475417548812
log 0.2% acc: 0.9727915160647358
nusvc 0.3% acc: 0.9720574491216694
gmm 0.3% acc: 0.9743653321634973
0
50000
100000
150000
200000
250000
Totally has 162860 ~97.4% accuracy daya


In [6]:
base_preds = oof['target_mean'].copy().values
auc = roc_auc_score(original_target, base_preds)
print(auc)
base_preds[use_idx] = use_preds[use_idx]
acc = accuracy_score(original_target[use_idx], base_preds[use_idx])
print(acc)


0.9698331699467533
0.9740390519464571


In [7]:
print('these incorrect label seems to be flipped target, re-fix it!')
not_match_idx = np.where(use_preds[use_idx] != targets[use_idx])[0]
print(len(not_match_idx))

these incorrect label seems to be flipped target, re-fix it!
4228


In [8]:
oof['target'][use_idx[not_match_idx]] = use_preds[use_idx[not_match_idx]]

In [9]:
baseline_qda_preds = oof['oof_qda'].copy().values
baseline_log_preds = oof['oof_log'].copy().values
baseline_nusvc_preds = oof['oof_nusvc'].copy().values
baseline_gmm_preds = oof['oof_gmm'].copy().values
baseline_knn_preds = oof_v5['oof_knn'].copy().values
targets = train['target'].copy().values
# 將應該是錯誤的y labels 重新flip
train['target'][use_idx[not_match_idx]] = use_preds[use_idx[not_match_idx]]
cols = [c for c in train.columns if c not in ['id', 'target','wheezy-copper-turtle-magic']]

m0 = knn_model
m1 = qda_model
m2 = log_model
m3 = nusvc_model
m4 = gmm_model
models_baseline = [baseline_knn_preds,baseline_qda_preds,baseline_log_preds,baseline_nusvc_preds,baseline_gmm_preds]
models = [('knn',m0), ('qda',m1), ('log',m2), ('nusvc',m3), ('gmm',m4)]
oofs = np.zeros((len(oof),len(models)))
preds = np.zeros((len(test),len(models)))
ps_oofs = np.zeros((len(oof),len(models)))
ps_preds = np.zeros((len(test),len(models)))
for a, (model_name, model) in enumerate(models):
    for i in (range(512)):
        #if(i%50 == 0): print(i)
        train2 = train[train['wheezy-copper-turtle-magic']==i].copy()
        test2 = test[test['wheezy-copper-turtle-magic']==i].copy()
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True,inplace=True)
        y = train2['target'].copy().values
        
        
        if(model_name == 'log'):
            poly = PolynomialFeatures(degree=2)
            sc = StandardScaler()
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            data2 = poly.fit_transform(sc.fit_transform(VarianceThreshold(threshold=2).fit_transform(data[cols])))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]

        if(model_name == 'qda' or model_name == 'gmm' or model_name == 'knn'):
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(VarianceThreshold(threshold=2).fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]


        if(model_name == 'nusvc'):
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(PCA(svd_solver='full',n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        
        skf = StratifiedKFold(n_splits=21, random_state=42, shuffle=True)
        for train_index, test_index in skf.split(train3, y):
            _oof, _pred = model(train3[train_index,:],            # x
                                y[train_index],                   # y
                                train3[test_index,:], test3)      # oof_x, test_x
            oofs[idx1[test_index],a] = _oof[:,1]
            preds[idx2,a] += _pred[:,1] / skf.n_splits
    auc = roc_auc_score(original_target, oofs[:,a])
    print('{} with flip-y auc: {}'.format(model_name, round(auc,6)))
    auc = roc_auc_score(original_target, models_baseline[a])
    print('{} original auc: {}'.format(model_name, round(auc,6)))
auc = roc_auc_score(original_target, np.mean(oofs,axis=1))
print('{} with flip-y auc: {}'.format('ensemble', round(auc,6)))

knn with flip-y auc: 0.92269
knn original auc: 0.91923
qda with flip-y auc: 0.967287
qda original auc: 0.965618
log with flip-y auc: 0.956718
log original auc: 0.953721
nusvc with flip-y auc: 0.966879
nusvc original auc: 0.964883
gmm with flip-y auc: 0.969061
gmm original auc: 0.969059
ensemble with flip-y auc: 0.970144


In [10]:
#step1_preds = np.mean(preds, axis=1)
# first one is knn
step1_preds = preds[:,0]*0.08 + \
                 + preds[:,1]*0.23 + preds[:,2]*0.23 + preds[:,3]*0.23 + preds[:,4]*0.23
test['target'] = step1_preds

這裡重複剛才選要flip的步驟，找到合適的test preds label，因為他的accuracy有97.4%以上，所以非常適合！
首先建立DataFrame，因為需要使用到.rank()
models = [('knn',m0), ('qda',m1), ('log',m2), ('nusvc',m3), ('gmm',m4)]

In [11]:
preds_df = pd.DataFrame()
for a, (model_name, model) in enumerate(models):
    print(model_name)
    preds_df['preds_{}'.format(model_name)] = preds[:,a]

knn
qda
log
nusvc
gmm


In [12]:
gmm2 = preds_df['preds_gmm'].rank().values / len(preds_df['preds_gmm'])
qda2 = preds_df['preds_qda'].rank().values / len(preds_df['preds_gmm'])
nusvc2 = preds_df['preds_nusvc'].rank().values / len(preds_df['preds_gmm'])
log2 = preds_df['preds_log'].rank().values / len(preds_df['preds_gmm'])

In [13]:
models = [qda2, log2, nusvc2, gmm2]
models_name = ['qda', 'log', 'nusvc', 'gmm']
models_threshold = [0.3, 0.2, 0.3, 0.3]

new_preds2 = np.array([[-1 for _ in range(len(models))] for _ in range(len(nusvc2))]).astype('float')
print(new_preds.shape)
for i, (model, name, threshold) in enumerate(zip(models, models_name, models_threshold)):
    preds = model.copy().astype('float')
    # 把符合比例的preds id 記錄下來，之後比較variance，沒有差異的話就加入pseudo的行列
    # OOF的時候可以達到0.974x的acc，代表幾乎是正確的標籤
    preds[preds > 1-threshold] = 1
    preds[preds < threshold] = 0
    new_preds2[:,i] = preds.astype('float')
    sure_idx = np.concatenate((np.where(preds==1)[0], np.where(preds==0)[0]))
use_idx2 = []
use_preds2 = []
for i in range(new_preds2.shape[0]):  
    use_preds2.append(-1)
    if(i%50000==0): print(i)
    # 檢查不同模型是否對該預測點出現分歧，出現分歧者自然不加入pseudo的行列
    values = []
    for j in range(new_preds2.shape[1]):
        if(new_preds2[i,j] == 0 or new_preds2[i,j] == 1):
            values.append(new_preds2[i,j])
    if(len(values) == 0): continue
    if(np.std(np.array(values)) == 0 and len(values) > 1):
        use_idx2.append(i)
        use_preds2[i] = values[0]
use_idx2 = np.array(use_idx2)    
use_preds2 = np.array(use_preds2)    
print('Totally has {} ~97.4% accuracy data for pseudo labeling!!!'.format(len(use_idx2)))

(262144, 4)
0
50000
100000
Totally has 81422 ~97.4% accuracy data for pseudo labeling!!!


In [14]:
preds_df['use_target'] = use_preds2.astype('int32')
preds_df['useful'] = np.zeros(len(use_preds2))
preds_df['useful'][use_idx2] = 1
preds_df['wheezy-copper-turtle-magic'] = test['wheezy-copper-turtle-magic'].copy().values
print(preds_df.head(10))

   preds_knn     preds_qda  preds_log  preds_nusvc     preds_gmm  use_target  \
0   0.498747  9.999991e-01   0.571359     0.879075  9.999999e-01           1   
1   0.488722  9.974835e-01   0.537612     0.881171  9.986900e-01          -1   
2   0.533835  1.011638e-07   0.492571     0.093782  2.070371e-10           0   
3   0.403509  1.481468e-01   0.469918     0.633640  4.578538e-03          -1   
4   0.278195  4.960512e-01   0.454024     0.053372  4.776994e-02          -1   
5   0.210526  1.893203e-08   0.266370     0.003395  1.371139e-10           0   
6   0.150376  2.442548e-11   0.274422     0.001483  9.364499e-10           0   
7   0.766917  9.999998e-01   0.655710     0.992062  1.000000e+00           1   
8   0.453634  9.999268e-01   0.555846     0.760626  9.999940e-01           1   
9   0.395990  1.108485e-02   0.463084     0.206428  2.951555e-04          -1   

   useful  wheezy-copper-turtle-magic  
0     1.0                         259  
1     0.0                         252  

In [15]:
m0 = knn_model
m1 = qda_model
m2 = log_model
m3 = nusvc_model
m4 = gmm_model
models_baseline = [baseline_knn_preds,baseline_qda_preds,baseline_log_preds,baseline_nusvc_preds,baseline_gmm_preds]
models = [('knn',m0), ('qda',m1), ('log',m2), ('nusvc',m3), ('gmm',m4)]
# pseudo label
for a, (model_name, model) in enumerate(models):
    for i in (range(512)):
        train2 = train[train['wheezy-copper-turtle-magic']==i].copy()
        test2 = test[test['wheezy-copper-turtle-magic']==i].copy()
        preds_df2 = preds_df[preds_df['wheezy-copper-turtle-magic']==i].copy()
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True,inplace=True)
        test2.reset_index(drop=True,inplace=True)
        preds_df2.reset_index(drop=True,inplace=True)
        y = train2['target'].copy().values
        test_y = preds_df2['use_target'].copy().values
        
        #找到有用的idx，這裡是[0 ~ length]
        pesudo_useful_idx = np.where(preds_df2['useful']==1)[0]
        
        if(model_name == 'log'):
            poly = PolynomialFeatures(degree=2)
            sc = StandardScaler()
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            data2 = poly.fit_transform(sc.fit_transform(VarianceThreshold(threshold=2).fit_transform(data[cols])))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]

        if(model_name == 'qda' or model_name == 'gmm' or model_name == 'knn'):
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(VarianceThreshold(threshold=2).fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]


        if(model_name == 'nusvc'):
            data = pd.concat([pd.DataFrame(train2[cols]), pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(PCA(svd_solver='full',n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
            
        # 如果有可以用的pseudo label,將其加入train data中
        # test_y因為剛才已經有做過0-1二元化，所以直接加入
        if(len(pesudo_useful_idx) > 0):
            train3 = np.concatenate((train3, test3[pesudo_useful_idx,:]))
            y = np.concatenate((y,test_y[pesudo_useful_idx]))
            
        skf = StratifiedKFold(n_splits=21, random_state=42, shuffle=True)
        for train_index, test_index in skf.split(train3, y):
            test_index3 = test_index[test_index<len(train2)]      # ignore pseudo in oof
            _oof, _pred = model(train3[train_index,:],            # x
                                y[train_index],                   # y
                                train3[test_index3,:], test3)      # oof_x, test_x
            ps_oofs[idx1[test_index3],a] = _oof[:,1]
            ps_preds[idx2,a] += _pred[:,1] / skf.n_splits
    auc = roc_auc_score(original_target, ps_oofs[:,a])
    print('{} with flip-y + pseudo label auc: {}'.format(model_name, round(auc,6)))
    auc = roc_auc_score(original_target, models_baseline[a])
    print('{} original auc: {}'.format(model_name, round(auc,6)))
auc = roc_auc_score(original_target, np.mean(ps_oofs,axis=1))
print('{} with flip-y + pseudo label auc: {}'.format('ensemble', round(auc,6)))

KeyboardInterrupt: 

In [None]:
best_ps_oof = ps_oofs[:,0]*0.08 + \
                 + ps_oofs[:,1]*0.23 + ps_oofs[:,2]*0.23 + ps_oofs[:,3]*0.23 + ps_oofs[:,4]*0.23
auc = roc_auc_score(original_target, best_ps_oof)
print('{} with flip-y + pseudo label auc: {}'.format('best weight ensemble', round(auc,6)))

In [None]:
final_preds = ps_preds[:,0]*0.08 + \
                 + ps_preds[:,1]*0.23 + ps_preds[:,2]*0.23 + ps_preds[:,3]*0.23 + ps_preds[:,4]*0.23
sub = pd.DataFrame()
sub['id'] = test['id']
sub['target'] = final_preds
sub.to_csv('submission_flipy_pseudo_5models.csv', index=False)

In [None]:
def lgbm(x, y, oof_x, test_x):
    lgb_params = {
        'learning_rate': 0.01,
        'application': 'binary',
        'max_depth': 6,
        'num_leaves': 64,
        'verbosity': -1,
        'metric': 'auc'
    }
    d_train = lgb.Dataset(x, label=y)
    model = lgb.train(lgb_params, train_set=d_train, num_boost_round=330, verbose_eval=1000)

    oof = model.predict(oof_x)
    preds = model.predict(test_x)
    return oof, preds

In [None]:
oof_df = pd.DataFrame()
test_df = pd.DataFrame()
oof_df['target'] = train['target'].copy().values
for a, (model_name, model) in enumerate(models):
    oof_df['{}'.format(model_name)] = ps_oofs[:,a]
    test_df['{}'.format(model_name)] = ps_preds[:,a]
print(oof_df.head())
print(test_df.head())

In [None]:
cols2 = [c for c in oof_df.columns if c not in ['id', 'target','wheezy-copper-turtle-magic']]
trainX = oof_df[cols2].values
y = oof_df['target'].values
testX = test_df[cols2].values
stack_oofs = np.zeros(len(trainX))
stack_preds = np.zeros(len(testX))

skf = StratifiedKFold(n_splits=11, random_state=3228, shuffle=True)
for train_index, test_index in skf.split(trainX, y):
    _oof, _preds = lgbm(trainX[train_index,:],                # x
                            y[train_index],                   # y
                            trainX[test_index,:], testX)      # oof_x, test_x
    stack_oofs[test_index] = _oof
    stack_preds += _preds / skf.n_splits
auc = roc_auc_score(original_target, stack_oofs)
print('{} oof auc: {}'.format('final stacking', round(auc,6)))

In [None]:
sub = pd.DataFrame()
sub['id'] = test['id']
sub['target'] = stack_preds
sub.to_csv('submission_flipy_pseudo_stacking_5models.csv', index=False)