In [13]:
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, SelectFwe, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.decomposition import PCA
from concurrent.futures import ProcessPoolExecutor
import pandas as pd
import numpy as np
import warnings

In [40]:
def produce_modifications(X_train, y_train, train_indices, target_indices, X_val, y_val_na):
    """Lazily enumerate candidate single-sample edits of the training set.

    Two candidates are yielded for every entry of ``target_indices``:

    * entry currently in ``train_indices``:
        1. same index collection, label of that sample flipped (``1 - label``);
        2. original labels, that sample removed from a copy of the indices.
    * entry not in ``train_indices``:
        1. original labels, that sample appended to a copy of the indices;
        2. label flipped, using the same extended index list.

    Every candidate is a tuple ``(X_train, labels, indices, X_val, y_val_na)``.
    ``X_train``, ``X_val`` and ``y_val_na`` are passed through untouched, and
    ``y_train`` itself is never modified (flips are applied to copies).
    """
    def flipped_at(position):
        # Copy first so the caller's label array stays intact.
        labels = np.copy(y_train)
        labels[position] = 1 - labels[position]
        return labels

    for position in target_indices:
        if position not in train_indices:
            # Sample is currently excluded: try bringing it back, with and
            # without a flipped label.
            extended = list(train_indices) + [position]
            yield X_train, y_train, extended, X_val, y_val_na
            yield X_train, flipped_at(position), extended, X_val, y_val_na
            continue

        # Sample is currently included: try flipping its label ...
        yield X_train, flipped_at(position), train_indices, X_val, y_val_na

        # ... and try dropping it altogether.
        reduced = list(train_indices)
        reduced.remove(position)
        yield X_train, y_train, reduced, X_val, y_val_na

def test_modification(test):
    """Score one candidate modification by its training residual sum of squares.

    Parameters
    ----------
    test : tuple
        ``(X_train, y_train, train_indices, X_val, y_val_na)``. Only the first
        three entries are used; the validation entries are accepted so the
        tuple layout stays the same but play no part in the score.

    Returns
    -------
    tuple
        ``(new_error, y_train, train_indices)`` where ``new_error`` is the sum
        of squared residuals of a ``LinearRegression`` fitted on the rows
        selected by ``train_indices``.
    """
    X_train, y_train, train_indices, _X_val, _y_val_na = test

    X_sub = X_train[train_indices]
    y_sub = y_train[train_indices]

    clf = LinearRegression(copy_X=True)
    clf.fit(X_sub, y_sub)

    # ``LinearRegression.residues_`` was deprecated in scikit-learn 0.18 and
    # later removed, so the residual sum of squares is computed explicitly.
    # This also always yields a plain float, which keeps ``min(...)``
    # comparisons and string formatting of the error well defined.
    new_error = float(np.sum((y_sub - clf.predict(X_sub)) ** 2))

    return new_error, y_train, train_indices

def ce_squared(T, probs):
    """Return the sum of squared elementwise products ``T * probs`` divided by ``len(probs)``.

    Note that the divisor is ``len(probs)`` (the size of the first axis), not
    the total number of elements.
    """
    weighted = T * probs
    total = (weighted ** 2).sum()
    return total / len(probs)

def process_dataset(X_train, y_train, X_test):
    """Reduce dimensionality and carve out balanced train / validation splits.

    Steps:

    1. Keep the 50 columns of ``X_train`` with the highest mutual information
       with ``y_train``.
    2. Fit a 30-component PCA on those columns and project both ``X_train``
       and ``X_test`` with it.
    3. Undersample: 300 positives + 500 (shuffled) negatives form the training
       split, 100 positives + 300 negatives form the validation split. Both
       splits are then shuffled row-wise.

    Parameters
    ----------
    X_train : ndarray, 2-D feature matrix.
    y_train : ndarray, 1-D labels; samples are selected with ``== 1`` / ``== 0``.
    X_test : ndarray, 2-D feature matrix with the same columns as ``X_train``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test)``. ``X_test`` keeps its
        original row order.

    Notes
    -----
    * The train slices are taken from the front and the validation slices from
      the back of the index arrays, so the two splits overlap when there are
      fewer than 400 positives or fewer than 800 negatives.
    * Positive indices are not shuffled before being split; only negatives are.
    * No random seed is set here, so results vary between runs unless the
      caller seeds NumPy's global RNG.
    """
    # Select top 50 features for mutual info with y
    minfos = mutual_info_classif(X_train, y_train)
    minds = minfos.argsort()[-50:]
    X_train = X_train[:, minds]

    # Map it down to 30 dims (original note claimed ~99% of the variance is
    # preserved -- not verified in this function)
    pca = PCA(n_components=30)
    X_train = pca.fit_transform(X_train)

    # Transform X_test with the same feature selection and projection.
    # The rows are intentionally NOT shuffled: predictions are later written
    # out with positional ids, so shuffling here would detach every prediction
    # from the test row it belongs to.
    X_test = pca.transform(X_test[:, minds])

    # Undersample & separate a validation set
    pinds = np.where(y_train == 1)[0]
    ninds = np.where(y_train == 0)[0]

    np.random.shuffle(ninds)

    train_ninds = ninds[:500]
    val_ninds = ninds[-300:]

    train_pinds = pinds[:300]
    val_pinds = pinds[-100:]

    X_tr = np.vstack((X_train[train_pinds], X_train[train_ninds]))
    y_tr = np.append(y_train[train_pinds], y_train[train_ninds])

    X_va = np.vstack((X_train[val_pinds], X_train[val_ninds]))
    y_va = np.append(y_train[val_pinds], y_train[val_ninds])

    # Shuffle each split so positives and negatives are interleaved.
    tr_inds = list(range(X_tr.shape[0]))
    np.random.shuffle(tr_inds)

    X_train = X_tr[tr_inds]
    y_train = y_tr[tr_inds]

    va_inds = list(range(X_va.shape[0]))
    np.random.shuffle(va_inds)

    X_val = X_va[va_inds]
    y_val = y_va[va_inds]

    return X_train, y_train, X_val, y_val, X_test

def modify_dataset(X_train, y_train, X_val, y_val):
    """Greedily edit the training set to lower the linear-regression training error.

    The samples are visited in consecutive batches of 50. For each batch every
    candidate produced by ``produce_modifications`` (flip a label, drop a
    sample, re-add a sample) is scored with ``test_modification``; the single
    lowest-error candidate of the batch -- or the current state, if nothing
    scores lower -- becomes the new state.

    Parameters
    ----------
    X_train : ndarray, 2-D training features.
    y_train : ndarray, 1-D labels (flipped as ``1 - label``).
    X_val : ndarray, validation features; only forwarded to the candidates.
    y_val : ndarray, 1-D validation labels; expanded to two columns
        ``[y, 1 - y]`` and forwarded to the candidates.

    Returns
    -------
    tuple
        ``(X, y)``: the rows of ``X_train`` selected by the best index list and
        the matching, possibly flipped, labels.
    """
    batch_size = 50
    n_samples = X_train.shape[0]

    y_val_na = y_val[:, np.newaxis]
    y_val_na = np.append(y_val_na, 1 - y_val_na, axis=1)

    clf = LinearRegression(copy_X=True)
    clf.fit(X_train, y_train)

    # ``LinearRegression.residues_`` was deprecated in scikit-learn 0.18 and
    # later removed; compute the residual sum of squares directly instead.
    best_error = float(np.sum((y_train - clf.predict(X_train)) ** 2))
    best_y_train = y_train
    best_train_indices = list(range(n_samples))

    # Stepping with ``range`` and clamping the end also covers a trailing
    # partial batch, which the former ``while end_ind <= n`` loop skipped when
    # the sample count was not a multiple of the batch size.
    for start_ind in range(0, n_samples, batch_size):
        end_ind = min(start_ind + batch_size, n_samples)
        target_indices = range(start_ind, end_ind)
        mods = produce_modifications(X_train, best_y_train, best_train_indices, target_indices, X_val, y_val_na)

        test_results = list(map(test_modification, mods))
        # The current state competes too, so the error never increases.
        test_results.append((best_error, best_y_train, best_train_indices))
        best_error, best_y_train, best_train_indices = min(test_results, key=lambda x: x[0])

        print('Processed: {:5d} samples,\tcurrent error is {:0.4f}'.format(end_ind, best_error))

    return X_train[best_train_indices], best_y_train[best_train_indices]

In [41]:
# Read the header-less CSV files (features, labels, test features) into NumPy
# arrays, keeping the raw arrays under their own names so this cell can be
# re-run without depending on earlier kernel state.
raw_features, raw_labels, raw_test = (
    np.array(pd.read_csv(csv_path, header=None))
    for csv_path in ('train.csv', 'trainlabels.csv', 'test.csv')
)

# Labels are flattened to 1-D before preprocessing.
X_train, y_train, X_val, y_val, X_test = process_dataset(
    raw_features, raw_labels.ravel(), raw_test
)

In [42]:
# Run the greedy training-set clean-up with all warnings silenced for the
# duration of the call; the previous filters are restored on exit.
# (The former ``warnings.warn("deprecated", DeprecationWarning)`` line was a
# leftover from the documentation example: it only emitted a warning that the
# filter above it immediately ignored.)
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    X, y = modify_dataset(X_train, y_train, X_val, y_val)

(800, 30)
Processed:    50 samples,	current error is 74.8882
Processed:   100 samples,	current error is 74.3868
Processed:   150 samples,	current error is 74.0643
Processed:   200 samples,	current error is 73.4349
Processed:   250 samples,	current error is 72.9065
Processed:   300 samples,	current error is 72.4502
Processed:   350 samples,	current error is 68.9609
Processed:   400 samples,	current error is 68.5031
Processed:   450 samples,	current error is 67.9366
Processed:   500 samples,	current error is 67.5156
Processed:   550 samples,	current error is 66.9200
Processed:   600 samples,	current error is 66.3581
Processed:   650 samples,	current error is 65.7237
Processed:   700 samples,	current error is 64.9064
Processed:   750 samples,	current error is 64.2175
Processed:   800 samples,	current error is 63.6037


X: uncorrelated X_train: 0.74
X2: top 50 mi values of X: 0.81
X3: top 30 mi values of X: 0.87
X4: top 20 mi values of X: 0.80
X3: mi values of X_train

In [44]:
# Cleaned training data stacked with the validation split.
# NOTE(review): X_va / y_va are not used by the fit below, which trains on the
# cleaned training data only -- confirm whether that is intended.
X_va = np.vstack((X, X_val))
y_va = np.append(y, y_val)

svm = SVC(class_weight='balanced', cache_size=1000)

# AUC scorer based on continuous decision values. The keyword was previously
# misspelled ``needs_treshold``; make_scorer forwards unknown keywords to the
# metric, so the scorer would have failed inside roc_auc_score when called and
# would have scored hard predictions instead of decision values.
# NOTE(review): recent scikit-learn releases replace ``needs_threshold`` with
# ``response_method`` -- adjust if the environment is upgraded.
auc = make_scorer(roc_auc_score, greater_is_better=True, needs_threshold=True)

svm.fit(X, y)
clf = svm

In [22]:
# NOTE(review): stale exploratory cell. `probs` is not assigned anywhere in the
# visible notebook (its only definition is the commented-out line below), so
# evaluating it on a fresh kernel would raise NameError. The commented-out code
# also indexes X_test with `minds`, which is only a local variable inside
# process_dataset here -- verify or remove this cell before sharing.
#probs = svm.predict_log_proba(X_test[:,minds])[:,1]
#roc_auc_score(y_val, probs)
probs

array([-6.74147731, -9.08298884, -5.77983652, ..., -6.18960352,
       -4.88588154, -2.24850981])

In [45]:
# Score the test rows with the fitted classifier and wrap the decision values
# in a single-column frame.
preds = pd.DataFrame(clf.decision_function(X_test))

# Ids in the output file are 1-based.
preds.index = preds.index + 1

# Write the file with columns `Id` and `Prediction`.
preds.to_csv('out.csv', header=['Prediction'], index_label='Id')