In [1]:
%matplotlib inline
import scipy as sc
import pandas as pd
import ggplot as gg
from sklearn import datasets, model_selection, decomposition, ensemble, neural_network, svm,\
preprocessing, pipeline, linear_model
from tensorflow.contrib import learn as ln
import tensorflow as tf

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels

class ELMClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, h=10, af=sc.special.expit):
        self.h = h
        self.af = af
        self.binarizer = preprocessing.LabelBinarizer()

    def fit(self, X, y):
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Store the classes seen during fit
        self.classes_ = unique_labels(y)
        y_bin = self.binarizer.fit_transform(y)
        self.W0=sc.stats.norm().rvs([X.shape[1],self.h])
        X0=self.af(self.W0.T@X.T)
        self.W=sc.linalg.pinv(X0.T)@y_bin
        return self

    def predict(self, X):
        # Check is fit had been called
        check_is_fitted(self, ['W',])
        # Input validation
        X = check_array(X)
        raw_predictions = self.W.T@self.af(self.W0.T@X.T)
        class_predictions = self.binarizer.inverse_transform(raw_predictions.T)
        return class_predictions

In [3]:
Xo,ro=datasets.load_digits(return_X_y=True)
Xo.shape

(1797, 64)

In [4]:
Xo,ro=datasets.load_iris(return_X_y=True)
Xo.shape

(150, 4)

In [3]:
data=pd.read_csv('transfusion.data')
ro = data[data.columns[4]]
Xo = data[data.columns[:4]]
Xo.shape

(748, 4)

In [4]:
test_data = pd.read_csv('5c9fa979-5a84-45d6-93b9-543d1a0efc41.csv')
data = pd.read_csv('9db113a1-cdbe-4b1c-98c2-11590f124dd8.csv')
ro = data[data.columns[5]]
Xo = data[data.columns[:5]]
Xot = test_data

In [5]:
ro[0]

1

In [6]:
X,Xt,r,rt = model_selection.train_test_split(Xo,ro,test_size=.2)

In [24]:
feature_columns = [tf.contrib.layers.real_valued_column("", dimension=5)]

classifier = tf.contrib.learn.DNNClassifier(feature_columns=feature_columns,
                                            hidden_units=[5, 5, 5,],
                                            n_classes=2,
                                            model_dir="/tmp/blood_model2",
                                            config=tf.contrib.learn.RunConfig(save_checkpoints_secs=1),
                                           )

validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
    Xt,
    rt,
    every_n_steps=50,
    early_stopping_rounds=200,
)



classifier.fit(x=X,
               y=r,
               steps=200000,
               monitors=[validation_monitor]
              )

accuracy_score = classifier.evaluate(x=Xt,
                                         y=rt)["accuracy"]

print('Accuracy: {0:f}'.format(accuracy_score))

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fefb6a5f940>, '_master': '', '_num_ps_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 1, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000}
Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))
Instructions for updating:
Estimator is decoupled from Scikit Learn interface by movi

rscv=model_selection.RandomizedSearchCV(
    pipeline.make_pipeline(
        ensemble.RandomForestClassifier(),
    ),
    dict(
        randomforestclassifier__n_estimators=sc.stats.randint(1,100),
        #randomforestclassifier__min_samples_split=sc.stats.randint(2,10),
        randomforestclassifier__max_features=sc.stats.uniform(0,1),
    ),
    n_iter=30,
    n_jobs=-1,
    cv=10,
    verbose=True,
)
rscv.fit(X,r)
rscv.score(Xt,rt)

In [31]:
rscv=ensemble.GradientBoostingClassifier(n_estimators=100)
rscv.fit(X,r)
rscv.score(Xt,rt)

0.85344827586206895

In [143]:
res=rscv.predict_proba(Xot).T[1]
subm=pd.read_csv('BloodDonationSubmissionFormat.csv')
subm[subm.columns[1]] = res

In [151]:
subm=subm.set_index(subm.columns[0])

In [152]:
subm.to_csv('subm.csv')

In [128]:
rscv=ensemble.GradientBoostingClassifier(n_estimators=300)
rscv.fit(X,r)
rscv.score(Xt,rt)

0.7068965517241379

In [117]:
rscv=neural_network.MLPClassifier()#solver='adam', hidden_layer_sizes=[1000,])
rscv.fit(X,r)
rscv.score(Xt,rt)

0.74137931034482762

In [127]:
rscv=pipeline.make_pipeline(preprocessing.StandardScaler(),neural_network.MLPClassifier([100,100],max_iter=10000))
rscv.fit(X,r)
rscv.score(Xt,rt)

0.77586206896551724

In [126]:
rscv=pipeline.make_pipeline(preprocessing.StandardScaler(),ELMClassifier(100))
rscv.fit(X,r)
rscv.score(Xt,rt)

0.71551724137931039

In [120]:
rscv=pipeline.make_pipeline(preprocessing.StandardScaler(),svm.SVC(kernel='rbf'))
rscv.fit(X,r)
rscv.score(Xt,rt)

0.75862068965517238

In [121]:
rscv=pipeline.make_pipeline(preprocessing.StandardScaler(),svm.SVC(kernel='poly'))
rscv.fit(X,r)
rscv.score(Xt,rt)

0.73275862068965514

In [122]:
rscv=pipeline.make_pipeline(preprocessing.StandardScaler(),svm.SVC(kernel='linear'))
rscv.fit(X,r)
rscv.score(Xt,rt)

0.74137931034482762

In [82]:
rscv=model_selection.RandomizedSearchCV(
    pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        neural_network.MLPClassifier(max_iter=10000),
    ),
    dict(
        mlpclassifier__solver=['lbfgs', 'adam'],
        mlpclassifier__hidden_layer_sizes=[[x,] for x in range(1,10+1)],
    ),
    n_iter=5,
    n_jobs=-1,
    cv=10,
    verbose=True,
)
rscv.fit(X,r)
rscv.score(Xt,rt)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


0.29310344827586204

In [33]:
rscv=model_selection.RandomizedSearchCV(
    ensemble.GradientBoostingClassifier(),
    dict(
        n_estimators=sc.stats.randint(10,200),
    ),
    n_iter=50,
    n_jobs=-1,
    cv=10,
    verbose=True,
)
rscv.fit(X,r)
rscv.score(Xt,rt)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 376 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   16.8s finished


0.85344827586206895

In [36]:
rscv.best_estimator_
res=rscv.predict_proba(Xot).T[1]
subm=pd.read_csv('BloodDonationSubmissionFormat.csv')
subm[subm.columns[1]] = res

In [34]:
b = rscv.best_estimator_
b.fit(Xo,ro)
b.predict_proba(Xot)

array([[ 0.62254911,  0.37745089],
       [ 0.86480561,  0.13519439],
       [ 0.81282191,  0.18717809],
       [ 0.71282796,  0.28717204],
       [ 0.46083394,  0.53916606],
       [ 0.18081765,  0.81918235],
       [ 0.72417429,  0.27582571],
       [ 0.93043319,  0.06956681],
       [ 0.96724053,  0.03275947],
       [ 0.9597238 ,  0.0402762 ],
       [ 0.88154948,  0.11845052],
       [ 0.85006784,  0.14993216],
       [ 0.59568099,  0.40431901],
       [ 0.80166423,  0.19833577],
       [ 0.85407378,  0.14592622],
       [ 0.89636462,  0.10363538],
       [ 0.86770609,  0.13229391],
       [ 0.67701374,  0.32298626],
       [ 0.91136885,  0.08863115],
       [ 0.76066304,  0.23933696],
       [ 0.36772679,  0.63227321],
       [ 0.54944599,  0.45055401],
       [ 0.48929831,  0.51070169],
       [ 0.84504764,  0.15495236],
       [ 0.76399224,  0.23600776],
       [ 0.69032499,  0.30967501],
       [ 0.68030616,  0.31969384],
       [ 0.89237922,  0.10762078],
       [ 0.79299157,

rscv=svm.SVC(kernel='poly')
rscv.fit(X,r)
rscv.score(Xt,rt)

res = model_selection.cross_val_score(
    svm.SVC(kernel='linear'),
    Xo,
    ro,
    cv=10
)
sc.mean(res)

In [None]:
res = model_selection.cross_val_score(
    ensemble.RandomForestClassifier(2000),
    Xo,
    ro,
    cv=10
)
sc.mean(res)



In [51]:
res = model_selection.cross_val_score(
    ensemble.GradientBoostingClassifier(n_estimators=20),
    Xo,
    ro,
    cv=10
)
sc.mean(res)



0.3058525868184323

In [27]:
res = model_selection.cross_val_score(
    neural_network.MLPClassifier(max_iter=10000, hidden_layer_sizes=[200,]),
    Xo,
    ro,
    cv=10
)
sc.mean(res)



0.073761666416201688

In [23]:
res = model_selection.cross_val_score(
    ELMClassifier(h=10),
    Xo,
    ro,
    cv=10
)
sc.mean(res)



0.082561805060685237

In [24]:
res = model_selection.cross_val_score(
    linear_model.LogisticRegression(),
    Xo,
    ro,
    cv=10
)
sc.mean(res)



0.21057404391055007

rscv=model_selection.RandomizedSearchCV(
    pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        svm.SVC(),
    ),
    dict(
        svc__C=sc.logspace(-5,5,100),#sc.stats.uniform(0,1),
        svc__kernel=['linear','rbf', 'poly'],
        svc__gamma=sc.logspace(-5,5,100),
        svc__degree=sc.stats.randint(1,10)
    ),
    n_iter=5,
    n_jobs=-1,
    cv=10,
    verbose=True,
)
rscv.fit(X,r)
rscv.score(Xt,rt)

In [None]:
rscv=ELMClassifier(h=1000)
rscv.fit(X,r)
rscv.score(Xt,rt)

rscv.best_params_, rscv.best_score_

In [None]:
n=10
xs = [100*x//n for x in range(1,1+n)]
tra,tes=model_selection.validation_curve(
    ensemble.RandomForestClassifier(), 
    X, 
    r, 
    'n_estimators', 
    xs)
da=pd.DataFrame(sc.array([xs, sc.mean(tes,axis=1), sc.mean(tra,axis=1)]).T, columns=['x','tes','tra'])
gg.ggplot(da,gg.aes('x','tes'))+gg.geom_point()+gg.stat_smooth(method='loess')

In [None]:
n=10
xs = [3000*x//n for x in range(1,1+n)]
tra,tes=model_selection.validation_curve(
    ELMClassifier(), 
    X, 
    r, 
    'h', 
    xs)
da=pd.DataFrame(sc.array([xs, sc.mean(tes,axis=1), sc.mean(tra,axis=1)]).T, columns=['x','tes','tra'])
gg.ggplot(da,gg.aes('x','tes'))+gg.geom_point()+gg.stat_smooth(method='loess')

In [None]:
n=10
sca=30
xs = [[sca*x//n,] for x in range(1,1+n)]
tra,tes=model_selection.validation_curve(
    neural_network.MLPClassifier(solver='lbfgs', max_iter=10000), 
    X, 
    r, 
    'hidden_layer_sizes', 
    xs)
xs = [sca*x//n for x in range(1,1+n)]
da=pd.DataFrame(sc.array([xs, sc.mean(tes,axis=1), sc.mean(tra,axis=1)]).T, columns=['x','tes','tra'])
gg.ggplot(da,gg.aes('x','tes'))+gg.geom_point()+gg.stat_smooth(method='loess')

In [None]:
rscv=ELMClassifier(h=10, af=sc.special.expit)#lambda x: x)

In [None]:
rscv.fit(X,r)

In [None]:
rscv.score(Xt,rt)

In [None]:
W0=sc.stats.norm().rvs([X.shape[1],100])
X0=sc.tanh(W0.T@X.T)
W=sc.linalg.pinv(X0.T)@r

In [None]:
sc.sum(abs((W@sc.tanh(W0.T@X.T))==r))