In [98]:
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, SelectFwe, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold, ParameterGrid, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score, make_scorer
import pandas as pd
import numpy as np

In [2]:
# Load the training features, training labels and test features.
# None of the CSV files has a header row, hence header=None.
X_train = np.array(pd.read_csv('train.csv', header=None))
# Flatten the label column to 1-D without hard-coding the number of rows.
y_train = np.array(pd.read_csv('trainlabels.csv', header=None)).ravel()
X_test = np.array(pd.read_csv('test.csv', header=None))

# Fail early if the label file does not hold exactly one label per training row.
assert len(X_train) == len(y_train), "train.csv and trainlabels.csv row counts differ"

In [112]:
# Build a smaller, less imbalanced training subset: every positive row plus a
# random sample of (at most) 1000 negative rows.
pinds = np.where(y_train == 1)[0]  # row indices of the positive class
ninds = np.where(y_train == 0)[0]  # row indices of the negative class
# Seeded generator so the same negatives are drawn on every run.
np.random.RandomState(0).shuffle(ninds)
ninds = ninds[:1000]
# Positives come first, then the sampled negatives; x and y stay row-aligned.
x = np.vstack((X_train[pinds], X_train[ninds]))
y = np.append(y_train[pinds], y_train[ninds])

In [4]:
# Hold out one stratified fold (1 of 10) as a validation set.
# random_state is fixed so the same rows are held out on every run.
# NOTE: this cell rebinds X_train / y_train to the remaining folds, so running
# it again without reloading the data shrinks the training set a second time.
# NOTE(review): in document order the subsample x / y is built from X_train
# before this split, so on a top-to-bottom run it may contain validation rows.
train_inds, val_inds = next(
    StratifiedKFold(n_splits=10, shuffle=True, random_state=0).split(X_train, y_train)
)

X_val, y_val = X_train[val_inds], y_train[val_inds]
X_train, y_train = X_train[train_inds], y_train[train_inds]

In [3]:
# Display the dimensions of X_train as currently bound in the kernel.
X_train.shape

(51147, 117)

In [116]:
# Mutual information between each column of the subsample x and the labels y.
# The estimator is randomised, so the seed is fixed to keep the feature
# ranking derived from these scores stable across runs.
minfos = mutual_info_classif(x, y, random_state=0)

In [126]:
# Show the indices of the 50 largest entries of minfos (argsort is ascending,
# so the last index shown belongs to the highest score).
minfos.argsort()[-50:]

0.23671708120937307

In [123]:
# Univariate feature selection using the ANOVA F-test with a family-wise
# error rate threshold of 0.05.
# alpha is passed by keyword: recent scikit-learn releases make it
# keyword-only, and the keyword form also works on older releases.
a = SelectFwe(f_classif, alpha=0.05)
x2 = a.fit_transform(x, y)
x2.shape

(1400, 82)

In [128]:
# Column indices of the 50 highest mutual-information scores (ascending order),
# and the subsample restricted to those columns.
minds = np.argsort(minfos)[-50:]
x3 = x[:, minds]
x3.shape

(1400, 50)

In [131]:
# Fit a 30-component PCA on the MI-selected columns and project them.
# NOTE: x is rebound to the PCA output here; the earlier cells that index x
# by column expect the pre-PCA array.
pca = PCA(n_components=30)
x = pca.fit_transform(X=x3)

In [132]:
# Sum of the explained-variance ratios of the fitted PCA components.
pca.explained_variance_ratio_.sum()

0.99525682859808473

X: uncorrelated X_train: 0.74

X2: top 50 mi values of X: 0.81

X3: top 30 mi values of X: 0.87

X4: top 20 mi values of X: 0.80

X5: 20 pca on x4: 0.90, 0.74

X6: 30 pca on top 50 mi of reduced x: 0.889

In [133]:
# Search grid for the RBF SVM: six powers of ten, 1e-3 through 1e2, for both
# the regularisation strength C and the kernel width gamma.
# GridSearchCV accepts this plain dict directly as its parameter grid.
c = np.power(10.0, np.arange(-3, 3, dtype=float))
g = c.copy()
params = dict(C=c, gamma=g)

In [134]:
# ROC AUC computed from continuous decision values.
# The built-in 'roc_auc' scoring name replaces
# make_scorer(roc_auc_score, needs_threshold=True): the needs_threshold
# argument is deprecated in recent scikit-learn releases, while the string
# form is accepted by GridSearchCV(scoring=...) in old and new releases alike.
auc = 'roc_auc'
# probability=True enables predict_proba / predict_log_proba on the fitted SVM;
# class_weight='balanced' reweights the classes by inverse frequency.
svm = SVC(probability=True, class_weight='balanced', cache_size=1000)

In [138]:
# Exhaustive search over the C / gamma grid, ranking candidates with `auc`.
clf = GridSearchCV(estimator=svm, param_grid=params, scoring=auc)

In [139]:
# Run the grid search on the PCA-reduced subsample (x, y). fit() is the last
# expression, so its return value is what the cell displays.
clf.fit(x, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=1000, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02]), 'gamma': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
         1.00000e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(roc_auc_score, needs_threshold=True), verbose=0)

In [143]:
# Column-wise maxima of the cross-validation results table.
# numeric_only=True skips the non-numeric columns (the per-candidate `params`
# dicts and the object-typed param_* columns), which current pandas cannot
# order and would otherwise raise on.
pd.DataFrame(clf.cv_results_).max(numeric_only=True)

mean_fit_time          0.440193
mean_score_time        0.029438
mean_test_score        0.925692
mean_train_score       1.000000
rank_test_score       31.000000
split0_test_score      0.942488
split0_train_score     1.000000
split1_test_score      0.941588
split1_train_score     1.000000
split2_test_score      0.920454
split2_train_score     1.000000
std_fit_time           0.022838
std_score_time         0.004346
std_test_score         0.022124
std_train_score        0.007979
dtype: float64

In [149]:
# Project the test set the same way as the training subsample (same
# MI-selected columns, same fitted PCA). It is computed here because the cell
# that originally defined `xt` appears later in the notebook, so a
# top-to-bottom run would otherwise fail with a NameError.
xt = pca.transform(X_test[:, minds])
# Signed decision values from the refit best SVM, one row per test sample.
preds = pd.DataFrame(clf.decision_function(xt))

In [151]:
# Write the decision scores as a submission file with 1-based ids.
# The index is assigned outright (rather than shifted with `+= 1`) so that
# re-running this cell cannot push the ids further along, and it is named
# 'Id' to match the other submission cell in this notebook.
preds.index = pd.RangeIndex(start=1, stop=len(preds) + 1, name='Id')
preds.columns = ['Prediction']
preds.to_csv('out.csv')

In [147]:
# Apply the training-time reduction to the test set: first keep the
# MI-selected columns, then project them with the already-fitted PCA.
xt = X_test[:, minds]
xt = pca.transform(xt)

In [40]:
# Log-probabilities from the refit best estimator; column 1 is kept as the
# per-sample score (the second entry of the estimator's class ordering).
probs = clf.predict_log_proba(xt)
probs = probs[:, 1]
#roc_auc_score(y_val, probs)

In [41]:
# Wrap the scores in a submission table: 1-based 'Id' index and a single
# 'Prediction' column. The frame is built in one step from the flattened
# values, so re-running the cell (when `probs` is already this DataFrame)
# yields the same table instead of shifting the ids again.
probs = pd.DataFrame(
    {'Prediction': np.asarray(probs).ravel()},
    index=pd.RangeIndex(start=1, stop=len(probs) + 1, name='Id'),
)
# NOTE: this writes to the same 'out.csv' as the decision-function cell and
# therefore replaces that file.
probs.to_csv('out.csv')
# Preview only the first rows rather than dumping the whole table.
probs.head(10)

Unnamed: 0,Prediction
1,-5.209228
2,-5.536764
3,-4.879517
4,-5.544248
5,-5.755104
6,-6.241813
7,-6.111410
8,-5.164875
9,-6.054682
10,-5.314105
