In [1]:
import numpy as np
import pandas as pd

## Loading train\test sets for first of dataframes

In [99]:
train_1 = pd.read_excel("train1.xlsx")
test_1 = pd.read_excel("recog1.xlsx")

In [100]:
train_1.head(5)

Unnamed: 0,noma,im,ecg_id,sys,MB0,MB1,MB2,MSEP,MSRP,MDP,...,msh22,msh23,msh24,msh25,msh26,msh27,msh28,msh29,msh30,msh31
0,26,1,45909,120,0,0,-1,0,0,0,...,6.987262,5.842897,6.949771,5.843638,6.93523,5.821854,6.910798,5.806775,6.897684,5.639776
1,35,1,46125,120,251,176,343,280,373,551,...,6.604633,5.460614,6.555808,5.436133,6.521528,5.41535,6.498449,5.406096,6.486808,5.397264
2,36,1,46153,120,0,0,-1,0,0,0,...,7.617065,6.454835,7.626573,6.351499,7.605313,6.365222,7.579783,6.367527,7.572513,5.750414
3,39,1,46221,118,0,0,-1,0,0,0,...,6.729358,5.666262,6.690083,5.614659,6.655641,5.615052,6.642894,5.576259,6.636607,5.491374
4,43,1,46311,116,232,165,396,263,399,517,...,6.722442,5.772871,6.679197,5.740183,6.646341,5.716681,6.624495,5.703178,6.611261,5.690305


In [101]:
train_1['im'].unique()

array([1, 0])

In [102]:
train_1.dtypes.unique()

array([dtype('int64'), dtype('float64')], dtype=object)

Seems like it consists of exclusively categorical\numeric features, which can be named as "good" - no real preparations would be needed. Let's separate target from df, and check each of features for their type

#### Separation

In [103]:
train_targ = np.array(train_1['im']) * 2 - 1
test_targ = np.array(test_1['im']) * 2 - 1

train_1.drop(columns = 'im', inplace=True)
test_1.drop(columns = 'im', inplace=True)

#### Checking - categorical feature

In [9]:
cat = []

for column in train_1.columns:
    
    sub = train_1[column].unique().__len__() 
    
    if sub < 400:
        cat.append((column, sub))

In [10]:
cat

[('sys', 48),
 ('MB0', 104),
 ('MB1', 84),
 ('MB2', 188),
 ('MSEP', 120),
 ('MSRP', 112),
 ('MDP', 137),
 ('MRR', 212),
 ('MdSEP', 299),
 ('MdSRP', 295),
 ('MdDP', 301),
 ('MS5S2', 10),
 ('BR', 92),
 ('quality', 57),
 ('msh0', 219)]

Whatever, let's call them all numerical

## Algorythm selection

In [16]:
free_cols = train_1.columns

#### Models

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
model_grad = GradientBoostingClassifier()

#### Random subsamples

In [17]:
from sklearn.model_selection import KFold

In [77]:
states = np.concatenate([np.random.rand(3) * 100,
                         np.random.rand(3) * 500,
                         np.random.rand(3) * 1000]).astype(int)

In [78]:
states

array([ 12,  38,  82, 213,  12, 248, 988, 702, 640])

In [81]:
cv_s = []

for state in states:
    
    cv = KFold(n_splits = 4, random_state=state, shuffle=True)
    cv_s.append(cv)

#### Distances Transformation

In [26]:
def build_distances(train_line, test_line):
    
    train_pre_arr = np.absolute(train_line[None, :] - train_line[:, None])
    test_pre_arr = np.absolute(train_line[None, :] - test_line[:, None])
    
    train_max = train_pre_arr.max() + 0.01

    return train_pre_arr / train_max, test_pre_arr / train_max

In [41]:
def build_pairwise(train_set, test_set, used_cols, fill_train = 'mean'):
    
    train_len = train_set.__len__()
    test_len = test_set.__len__()
    
    train_pairwise = np.zeros((train_len, train_len))
    test_pairwise = np.zeros((test_len, train_len))
    
    for column in used_cols:
        
        if column not in train_set.columns:
            raise TypeError
            
        train_line = np.array(train_set[column])
        test_line = np.array(test_set[column])
        
        train_append, test_append = build_distances(train_line, test_line)
        
        if fill_train == 'mean':
            
            np.fill_diagonal(train_append, np.nan)
            mean_val = np.nanmean(train_append)
            np.fill_diagonal(train_append, mean_val)
        
        train_pairwise += train_append
        test_pairwise += test_append
        
    return train_pairwise, test_pairwise

#### RF

In [44]:
from sklearn.metrics import roc_auc_score

In [60]:
model_forest = RandomForestClassifier()

In [116]:
def calculate_cv_roc(dataset, used_cols, model, uses_pairwise=True):
    
    cv_roc = 0
        
    for cv in cv_s:

        for train_indixes, test_indixes in cv.split(train_1):
            
            #print(test_indixes)

            train_subset = train_1.loc[train_indixes]
            test_subset = train_1.loc[test_indixes]

            train_target = train_targ[train_indixes]
            test_target = train_targ[test_indixes]

            if uses_pairwise:
                
                train_pairwise, test_pairwise = build_pairwise(train_subset, test_subset, used_cols)
                model.fit(train_pairwise, train_target)

                preds = model.predict_proba(test_pairwise)[:, 1]
                cv_roc += roc_auc_score(test_target, preds)
            
            else:
                
                model.fit(train_subset, train_target)
                
                preds = model.predict_proba(test_subset)[:, 1]
                cv_roc += roc_auc_score(test_target, preds)

    cv_roc /= cv_s.__len__() * cv_s[0].get_n_splits()
    
    return cv_roc

In [117]:
def optimizer(dataset, model, uses_pairwise=True):
    rf_used = []
    rf_not_used = list(dataset.columns)

    best_result = 0

    while True:

        current_flag = False
        current_append = True
        item = None

        for column in rf_not_used:

            current_cols = rf_used + [column]

            cv_roc = calculate_cv_roc(dataset, current_cols, model, uses_pairwise)

            if cv_roc > best_result:
                current_flag = True
                best_result = cv_roc

                current_append = True
                item = column
        
        if rf_used.__len__() > 1:
            
            for column in rf_used:

                current_cols = rf_used.copy()
                current_cols.remove(column)

                cv_roc = calculate_cv_roc(dataset, current_cols, model, uses_pairwise)

                if cv_roc > best_result:

                    current_flag = True
                    best_result = cv_roc

                    current_append = False
                    item = column

        if current_flag:

            if current_append:

                rf_used.append(item)
                rf_not_used.remove(item)

            else:

                rf_used.remove(item)
                rf_not_used.append(item)

            print(rf_used, best_result)

        else:
            break
            
    return rf_used

In [91]:
v = optimizer(train_1, model_grad)

['sys'] 1.0


KeyboardInterrupt: 

## WTF?

In [118]:
train_1.drop(columns="sys", inplace=True)

KeyError: "['sys'] not found in axis"

In [119]:
v = optimizer(train_1, model_grad)

['spq'] 0.7918214014946081
['spq', 'sqrs'] 0.901642520557596


KeyboardInterrupt: 

In [None]:
v

## Fitting into test set

In [96]:
def predict(train_ds, train_targ, test_ds, test_targ, model, cols):
    
    train_pairwise, test_pairwise = build_pairwise(train_ds, test_ds, cols)
    
    model.fit(train_pairwise, train_targ)
    
    preds = model.predict_proba(test_pairwise)[:, 1]
    
    return roc_auc_score(test_targ, preds)

In [97]:
predict(train_1, train_targ, test_1, test_targ, model_grad, v)

0.7403632320237212

In [104]:
predict(train_1, train_targ, test_1, test_targ, model_grad, train_1.columns)

0.8014733135656041

In [109]:
model_grad.fit(train_1, train_targ)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [110]:
roc_auc_score(test_targ, model_grad.predict_proba(test_1)[:, 1])

0.8680504077094143

In [108]:
train_1.drop(columns='sys', inplace=True)
test_1.drop(columns='sys', inplace=True)