In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from scipy.optimize import fmin
from functools import partial

In [2]:
y_val = np.load("Preds/True_val.npy")
y_test = pd.read_csv("Data/Sample_Output.csv", names = ['ID', 'Target'], header=None)

In [225]:
lgbm_val = np.load("Preds/LGB_val.npy")
rf_val = np.load("Preds/RF_val.npy")
lr_val = np.load("Preds/LR_val.npy")
nn_val = np.load("Preds/NN_val.npy")
xgb_val = np.load("Preds/XGB_val.npy")
ada_val = np.load("Preds/ADA_val.npy")

In [257]:
val_probs = np.array([lgbm_val, nn_val, xgb_val, ada_val, lr_val, rf_val])

In [35]:
val_ensb = np.sum(val_probs*np.array([1.8,  1.57, 0.95, 1])[:,None, None], axis=0)

In [36]:
accuracy_score(y_val, np.argmax(val_ensb, axis=1))

0.8190653282391737

In [258]:
class OptimizeAcc:
    def __init__(self):
        self.coef_ = 0
        
    def _acc(self, coef, val_probs, y_val):
        val_ensb = np.sum(val_probs*coef[:,None, None], axis=0)
        acc = accuracy_score(y_val, np.argmax(val_ensb, axis=1))
        return -1.0*(acc)
    
    def fit(self, val_probs, y_val):
        partial_loss = partial(self._acc, val_probs=val_probs, y_val=y_val)
        init_coef = np.random.random(val_probs.shape[0])
        self.coef_ = fmin(partial_loss, init_coef, disp=True)

In [265]:
opt = OptimizeAcc()
opt.fit(val_probs, y_val)

Optimization terminated successfully.
         Current function value: -0.819196
         Iterations: 81
         Function evaluations: 170


In [230]:
opt.coef_

array([0.6918508 , 0.40497931, 0.1363975 , 0.79349025])

In [232]:
lgbm_test = np.load("Preds/LGB_test.npy")
rf_test = np.load("Preds/RF_test.npy")
lr_test = np.load("Preds/LR_test.npy")
nn_test = np.load("Preds/NN_test.npy")
xgb_test = np.load("Preds/XGB_test.npy")
ada_test = np.load("Preds/ADA_test.npy")

test_probs = np.array([lgbm_test,nn_test, xgb_test, ada_test])
test_ensb = np.sum(test_probs*opt.coef_[:,None, None], axis=0)

In [233]:
y_test['Target'] = np.argmax(test_ensb, axis=1)

y_test

y_test.to_csv("Submissions/DSC_Nexus_UMPW9415_IITM_ensemble_top3_ADA.csv", index=0, header=None)

# Rank Averaging

In [266]:
val_rank_probs = val_probs.copy()
for ind, arr in enumerate(val_rank_probs):
    val_rank_probs[ind] = pd.DataFrame(arr).rank(axis=1).values

In [272]:
opt_rank = OptimizeAcc()
opt_rank.fit(val_rank_probs, y_val)

Optimization terminated successfully.
         Current function value: -0.817700
         Iterations: 27
         Function evaluations: 101


In [224]:
opt_rank.coef_

array([0.62964064, 0.07667129, 0.29822923, 0.4007404 ])

array([0.16271343, 0.25211302, 0.91008271, 0.90158767])

In [216]:
lgbm_test = np.load("Preds/LGB_test.npy")
rf_test = np.load("Preds/RF_test.npy")
lr_test = np.load("Preds/LR_test.npy")
nn_test = np.load("Preds/NN_test.npy")
xgb_test = np.load("Preds/XGB_test.npy")

test_rank_probs = np.array([lgbm_test, lr_test, nn_test, xgb_test])
for ind, arr in enumerate(test_rank_probs):
    test_rank_probs[ind] = pd.DataFrame(arr).rank(axis=1).values
test_rank_ensb = np.sum(test_rank_probs*opt_rank.coef_[:,None, None], axis=0)

In [217]:
y_test['Target'] = np.argmax(test_rank_ensb, axis=1)

y_test

y_test.to_csv("Submissions/DSC_Nexus_UMPW9415_IITM_ensemble_top4_rank.csv", index=0, header=None)

# Stacking using LR

In [55]:
lgbm_tr = np.load("Preds/LGB_tr.npy")
rf_tr = np.load("Preds/RF_tr.npy")
lr_tr = np.load("Preds/LR_tr.npy")
nn_tr = np.load("Preds/NN_tr.npy")
xgb_tr = np.load("Preds/XGB_tr.npy")

In [66]:
tr_probs = np.array([lgbm_tr, lr_tr, nn_tr, xgb_tr])

In [108]:
train = pd.DataFrame(tr_probs.reshape(lgbm_tr.shape[0], -1, order='F'))#.to_csv("Preds/preds_tr.csv", index=0)

In [109]:
val = pd.DataFrame(val_probs.reshape(len(y_val), -1, order='F'))#.to_csv("Preds/preds_val.csv", index=0)

In [110]:
test = pd.DataFrame(test_probs.reshape(len(y_test), -1, order='F'))#.to_csv("Preds/preds_test.csv", index=0)

In [111]:
for dat in [train, test, val]:
    for i in range(0, 13, 4):
        dat[f"{i}_mean"] = dat[[i, i+1, i+2,i+3]].mean(axis=1)
        dat[f"{i}_std"] = dat[[i, i+1, i+2,i+3]].std(axis=1)
        for pairnum, pair in enumerate([(i, i+1), (i, i+2), (i, i+3), (i+1, i+2),(i+1, i+3), (i+2,i+3)]):
            dat[f"{pair[0]}_{pair[1]}_diff"] = dat[pair[0]] - dat[pair[1]]
        

In [112]:
train.iloc[:,-15:]

Unnamed: 0,8_std,8_9_diff,8_10_diff,8_11_diff,9_10_diff,9_11_diff,10_11_diff,12_mean,12_std,12_13_diff,12_14_diff,12_15_diff,13_14_diff,13_15_diff,14_15_diff
0,0.018960,-0.032084,-0.033314,0.000256,-0.001230,0.032340,0.033570,0.028741,0.040524,-0.085212,-0.020635,0.000757,0.064577,0.085969,0.021392
1,0.029556,-0.062749,-0.021840,-0.000117,0.040909,0.062632,0.021723,0.022300,0.032690,-0.068737,-0.011838,-0.000905,0.056899,0.067832,0.010934
2,0.041410,-0.087118,-0.029074,0.001432,0.058045,0.088550,0.030505,0.039521,0.055776,-0.116807,-0.030087,0.002370,0.086719,0.119176,0.032457
3,0.027564,-0.042397,-0.052136,-0.000045,-0.009739,0.042352,0.052090,0.039694,0.049773,-0.104744,-0.049140,-0.000443,0.055604,0.104301,0.048697
4,0.206431,0.414689,0.414541,0.409265,-0.000147,-0.005424,-0.005276,0.063421,0.116291,0.237035,0.236604,0.223012,-0.000431,-0.014023,-0.013592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390071,0.005097,-0.010785,-0.002171,-0.000177,0.008614,0.010608,0.001993,0.003625,0.004027,-0.008648,-0.003655,-0.000277,0.004993,0.008371,0.003378
390072,0.148033,0.054819,0.030758,-0.264120,-0.024061,-0.318939,-0.294878,0.092758,0.142773,0.029160,0.009287,-0.271691,-0.019873,-0.300851,-0.280978
390073,0.160033,0.140616,0.132232,-0.202056,-0.008384,-0.342671,-0.334288,0.084790,0.123944,0.040105,0.043480,-0.216860,0.003375,-0.256965,-0.260340
390074,0.167668,0.165533,0.136803,-0.201854,-0.028731,-0.367388,-0.338657,0.108573,0.142617,0.072834,0.038446,-0.241865,-0.034388,-0.314700,-0.280312


In [113]:
for name,dat in {'train':train, 'test':test, 'val':val}.items():
    dat.to_csv(f"Preds/preds_{name}.csv", index=0)