In [1]:
import scipy.sparse
%cd "~/moses-incons-pen-xp"
%load_ext autoreload
%autoreload 2

/home/xabush/moses-incons-pen-xp


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, recall_score, f1_score
from notebooks.manifold_reg.MosesEstimator import *
from notebooks.manifold_reg.util import *

In [3]:
df_train, df_test = pd.read_csv("data/bmm/data_train.csv"), pd.read_csv("data/bmm/data_test.csv")

In [4]:
X_train, y_train = df_train[df_train.columns.difference(["y"])].to_numpy(), df_train["y"].to_numpy()
X_test, y_test = df_test[df_test.columns.difference(["y"])].to_numpy(), df_test["y"].to_numpy()
X_train.shape

(70, 1000)

In [5]:
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

In [6]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([49, 21]))

In [7]:
kcv = KFold(n_splits=5)

In [48]:
alphas = np.logspace(-3, 3, 10)
alphas = np.concatenate((np.array([0.0]) ,alphas))
alphas

array([0.00000000e+00, 1.00000000e-03, 4.64158883e-03, 2.15443469e-02,
       1.00000000e-01, 4.64158883e-01, 2.15443469e+00, 1.00000000e+01,
       4.64158883e+01, 2.15443469e+02, 1.00000000e+03])

In [8]:
cmplx_ratios = np.logspace(0, 3, 10)
cmplx_ratios

array([   1.        ,    2.15443469,    4.64158883,   10.        ,
         21.5443469 ,   46.41588834,  100.        ,  215.443469  ,
        464.15888336, 1000.        ])

### Grid search using 5-fold cross-validation for $\alpha$ values and default complexity ratio (3.5) - using both BP and MI scorers

In [112]:
from notebooks.manifold_reg.MosesEstimator import *
cv_1 = GridSearchCV(MosesEstimator(scorers=["bp", "mi"], assoc_mat="data/bmm/feat_net.npy", threshold=0.01, fs_algo="random"), {"alpha": alphas}, cv = kcv, verbose=1, n_jobs=10).fit(X_train, y_train)
sc_1 = np.mean(cross_val_score(cv_1.best_estimator_, X_train, y_train))
cv_1.best_estimator_.fit(X_train, y_train)
print(f"Best params: {cv_1.best_params_}, cv score: {sc_1}, test score: {cv_1.best_estimator_.score(X_test, y_test)}")

Fitting 5 folds for each of 11 candidates, totalling 55 fits
/tmp/tmpyza4gv82
100 parsed!
/tmp/tmpkuomzf45
100 parsed!
/tmp/tmpjhjxrrpc
100 parsed!
/tmp/tmpvnkks5e8
100 parsed!
/tmp/tmpdyglc7ht
100 parsed!
/tmp/tmpunhjnl54
100 parsed!
/tmp/tmp9yypgspt
100 parsed!
Best params: {'alpha': 10.0}, cv score: 0.5555808080808081, test score: 0.46031746031746035


### Grid search using 5-fold cross-validation for $\alpha$ values and complexity ratio ($C$) values - using both BP and MI scorers

In [113]:
from notebooks.manifold_reg.MosesEstimator import *
cv_2 = GridSearchCV(MosesEstimator(scorers=["bp", "mi"], assoc_mat="data/bmm/feat_net.npy", fs_algo="random"), {"alpha": alphas, "complexity_ratio": cmplx_ratios}, cv = kcv, verbose=1, n_jobs=10).fit(X_train, y_train)
sc_2 = np.mean(cross_val_score(cv_2.best_estimator_, X_train, y_train))
cv_2.best_estimator_.fit(X_train, y_train)
print(f"Best params: {cv_2.best_params_}, cv score: {sc_2}, test score: {cv_2.best_estimator_.score(X_test, y_test)}")

Fitting 5 folds for each of 110 candidates, totalling 550 fits
/tmp/tmpjrb3ezz2
100 parsed!
/tmp/tmp5d6l5t_m
100 parsed!
/tmp/tmp16t5lyaa
100 parsed!
/tmp/tmpqblurasa
100 parsed!
/tmp/tmpfq33z_rm
100 parsed!
/tmp/tmp4n1pxlxl
100 parsed!
/tmp/tmpl6o80hl1
100 parsed!
Best params: {'alpha': 46.41588833612773, 'complexity_ratio': 2.154434690031884}, cv score: 0.6455808080808081, test score: 0.5714285714285714


In [32]:
cv_2.best_estimator_.models_[0].model

'and(or(and(or(!$f157 $f639) $f912) and(or($f228 $f509 !$f583 !$f694 !$f917) !$f382) and(!$f157 $f912) $f164 $f719) or(and($f164 !$f762) and($f228 !$f382) $f79 $f217))'

In [None]:
cv_2.best_score_

### Grid search using 5-fold cross-validation for complexity ratio values - using MI scorer

In [114]:
from notebooks.manifold_reg.MosesEstimator import *
cv_3 = GridSearchCV(MosesEstimator(scorers=["mi"], assoc_mat="data/bmm/feat_net.npy", fs_algo="random"), {"complexity_ratio": cmplx_ratios}, cv = kcv, verbose=1, n_jobs=10).fit(X_train, y_train)
sc_3 = np.mean(cross_val_score(cv_3.best_estimator_, X_train, y_train))
cv_3.best_estimator_.fit(X_train, y_train)
print(f"Best params: {cv_3.best_params_}, cv score: {sc_3}, test score: {cv_3.best_estimator_.score(X_test, y_test)}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
/tmp/tmp7ntx76f_
100 parsed!
/tmp/tmpwzkety_z
100 parsed!
/tmp/tmp3sej_q1h
100 parsed!
/tmp/tmppqo1_dy9
100 parsed!
/tmp/tmpsd3hfhu_
100 parsed!
/tmp/tmpvzq1iyoh
100 parsed!
/tmp/tmpe3l259mg
100 parsed!
Best params: {'complexity_ratio': 464.15888336127773}, cv score: 0.5701515151515151, test score: 0.4841269841269842


In [31]:
cv_3.best_estimator_.models_[0].model

'or(and(or(and(or(and(!$f270 $f972) !$f476 $f883 !$f909) $f350) and(or($f465 !$f765 !$f909) $f317) and($f151 $f726) $f295) $f720) and(!$f909 $f972))'

### Grid search using 5-fold cross-validation for $\alpha$ values and complexity ratio values - using BP scorer

In [None]:
from notebooks.manifold_reg.MosesEstimator import *
cv_4 = GridSearchCV(MosesEstimator(scorers=["mi"], assoc_mat="data/bmm/feat_net.npy", fs_algo="random"), {"alpha": alphas, "complexity_ratio": cmplx_ratios}, cv = kcv, verbose=1, n_jobs=10).fit(X_train, y_train)
sc_4 = np.mean(cross_val_score(cv_4.best_estimator_, X_train, y_train))
cv_4.best_estimator_.fit(X_train, y_train)
print(f"Best params: {cv_4.best_params_}, cv score: {sc_4}, test score: {cv_4.best_estimator_.score(X_test, y_test)}")

### Grid search using 5-fold cross-validation for $\alpha$ values, complexity ratio values and feature target size - using both BP and MI scorers

In [117]:
from notebooks.manifold_reg.MosesEstimator import *
cv_5 = GridSearchCV(MosesEstimator(scorers=["mi", "bp"], assoc_mat="data/bmm/feat_net.npy", fs_algo="random"), {"alpha": alphas, "complexity_ratio": cmplx_ratios, "fs_size": [5, 10, 15, 20]}, cv = kcv, verbose=1, n_jobs=10).fit(X_train, y_train)
sc_5 = np.mean(cross_val_score(cv_5.best_estimator_, X_train, y_train))
cv_5.best_estimator_.fit(X_train, y_train)
print(f"Best params: {cv_5.best_params_}, cv score: {sc_5}, test score: {cv_5.best_estimator_.score(X_test, y_test)}")

Fitting 5 folds for each of 440 candidates, totalling 2200 fits
/tmp/tmpmng8u1wf
100 parsed!
[CV] END ........alpha=0.0, complexity_ratio=1.0, fs_size=10; total time=  27.4s
/tmp/tmpi0n2vhu9
100 parsed!
[CV] END ........alpha=0.0, complexity_ratio=1.0, fs_size=15; total time=  24.5s
/tmp/tmp6x_idgri
100 parsed!
[CV] END ........alpha=0.0, complexity_ratio=1.0, fs_size=20; total time=  29.0s
/tmp/tmphy6d8r8v
100 parsed!
[CV] END alpha=0.0, complexity_ratio=2.154434690031884, fs_size=10; total time= 1.2min
/tmp/tmp1xmru9z4
100 parsed!
[CV] END alpha=0.0, complexity_ratio=2.154434690031884, fs_size=20; total time=  47.2s
/tmp/tmpqvlokvlk
100 parsed!
[CV] END alpha=0.0, complexity_ratio=4.641588833612778, fs_size=10; total time=  38.0s
/tmp/tmpb2tqhoq_
100 parsed!
[CV] END alpha=0.0, complexity_ratio=4.641588833612778, fs_size=15; total time=  53.1s
/tmp/tmpw956lsis
100 parsed!
[CV] END ........alpha=0.0, complexity_ratio=10.0, fs_size=5; total time= 1.1min
/tmp/tmp8p53wy9m
100 parsed!
[CV

Traceback (most recent call last):
  File "/home/xabush/miniconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 762, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/xabush/miniconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 418, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
  File "/home/xabush/moses-incons-pen-xp/notebooks/manifold_reg/MosesEstimator.py", line 92, in score
    y_pred = self.predict(X)
  File "/home/xabush/moses-incons-pen-xp/notebooks/manifold_reg/MosesEstimator.py", line 84, in predict
    res = self._eval_models(self.models_, X_df)
  File "/home/xabush/moses-incons-pen-xp/notebooks/manifold_reg/MosesEstimator.py", line 165, in _eval_models
    raise e
  File "/home/xabush/moses-incons-pen-xp/notebooks/manifold_reg/MosesEstimator.py", line 158, in _eval_models
    model_outs[i] = y_pred
ValueError: could not broadcast input array from shape (15) into shape (14)



/tmp/tmpph8ht569
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=10.0, fs_size=5; total time=  17.3s
/tmp/tmp16_lxaev
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=10.0, fs_size=10; total time=  20.0s
/tmp/tmpm5xjcf24
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=10.0, fs_size=20; total time=  18.6s
/tmp/tmpmfbbd5fx
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=21.544346900318832, fs_size=5; total time=  12.3s
/tmp/tmpbgaby1w4
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=21.544346900318832, fs_size=15; total time=  17.9s
/tmp/tmprgl_a1m7
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=21.544346900318832, fs_size=20; total time=  22.2s
/tmp/tmpn3wqohku
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=46.41588833612777, fs_size=5; total time=  22.5s
/tmp/tmp9m6heje7
100 parsed!
[CV] END alpha=0.46415888336127775, complexity_ratio=46.41588833612777, fs_size=15; to

 0.55027056 0.55681818 0.52451299 0.50844156 0.49117965 0.59875541
 0.46417749 0.50519481 0.50784632 0.51390693 0.49772727 0.55308442
 0.49686147 0.54686147 0.51049784 0.53955628 0.50833333 0.57034632
 0.47451299 0.53701299 0.56872294 0.5961039  0.5137987  0.53398268
 0.57364719 0.61201299 0.46277056 0.58311688 0.58452381 0.62943723
 0.53300866 0.57554113 0.58376623 0.62943723 0.55102814 0.44025974
 0.4530303  0.52840909 0.55027056 0.44659091 0.5469697  0.45898268
 0.57619048 0.48203463 0.58701299 0.4155303  0.53095238 0.50465368
 0.49458874 0.44734848 0.5896645  0.53208874 0.4517316  0.4107684
 0.54431818 0.45708874 0.42510823 0.39334416 0.47689394 0.55470779
 0.45719697 0.43652597 0.48598485 0.49307359 0.46628788 0.45243506
 0.53804113 0.45064935 0.48474026 0.45243506 0.5262987  0.45719697
 0.5650974  0.46152597 0.55102814 0.44025974 0.4530303  0.52840909
 0.55027056 0.44659091 0.5469697  0.45898268 0.57619048 0.48203463
 0.58701299 0.4155303  0.53095238 0.50465368 0.49458874 0.44734

/tmp/tmp7rtcn07p
100 parsed!
/tmp/tmpovape3mf
100 parsed!
/tmp/tmpbcb1uitq
100 parsed!
/tmp/tmpgn_x6f35
100 parsed!
/tmp/tmp91j_kg5_
100 parsed!
/tmp/tmpru8uoo_o
100 parsed!
/tmp/tmp9tgsb247
100 parsed!
Best params: {'alpha': 46.41588833612773, 'complexity_ratio': 2.154434690031884, 'fs_size': 10}, cv score: 0.6455808080808081, test score: 0.5714285714285714


In [None]:
from notebooks.manifold_reg.MosesEstimator import *
cv_6 = GridSearchCV(MosesEstimator(scorers=["mi", "bp"], assoc_mat="data/bmm/feat_net.npy", fs_algo="smd", threshold=1e-3), {"alpha": alphas, "complexity_ratio": cmplx_ratios, "fs_size": [5, 10, 15, 20]}, cv = kcv, verbose=1, n_jobs=10).fit(X_train, y_train)
sc_6 = np.mean(cross_val_score(cv_6.best_estimator_, X_train, y_train))
cv_6.best_estimator_.fit(X_train, y_train)
print(f"Best params: {cv_6.best_params_}, cv score: {sc_6}, test score: {cv_6.best_estimator_.score(X_test, y_test)}")

Fitting 5 folds for each of 440 candidates, totalling 2200 fits


In [128]:
moses_est.score(X_train, y_train)

/tmp/tmptu329ty7
/tmp/tmplhst64g1
/tmp/tmpdrgcsl16
/tmp/tmpq5kgt1ec
/tmp/tmp50wquuk3
/tmp/tmpmfvhxycy
/tmp/tmpmh0nxak3
/tmp/tmpzqd055ce
/tmp/tmp0sjvazwg
/tmp/tmp5lghx6hf
/tmp/tmphiwe9_t2
/tmp/tmp77kdy6tt
/tmp/tmpuqcpr14f
/tmp/tmpy1cte0gf
/tmp/tmp3yn8xr_c
/tmp/tmp28t26myl
/tmp/tmpvkzev4cc
/tmp/tmpx9i7hg12
/tmp/tmpridtr116
/tmp/tmprxw5gk9b
/tmp/tmpnmc4bkwx
/tmp/tmpghgae0o9
/tmp/tmp33jz94rj
/tmp/tmpt2gfhcc5
/tmp/tmpptqafecv
/tmp/tmpflqnmljb
/tmp/tmp64ljv46g
/tmp/tmpzey958fr
/tmp/tmp39itpjrm
/tmp/tmpsvp60ys6
/tmp/tmp3dipq8i2
/tmp/tmp6czn2q0u
/tmp/tmph3_870w_
/tmp/tmpk8nou3r3
/tmp/tmp8qaxq934
/tmp/tmpqy3fhxge
/tmp/tmp8rxxxwrz
/tmp/tmpf0m66ku6
/tmp/tmphpuj4cqr
/tmp/tmpf5nwq0x3
/tmp/tmp56ygy9b_
/tmp/tmpo7bp6bsd
/tmp/tmpswqy0lgi
/tmp/tmptheu1_i5
/tmp/tmp7ulb0iqc
/tmp/tmpbycbsol8
/tmp/tmphvslo90a
/tmp/tmpt7kuohw0
/tmp/tmp393jnv9g
/tmp/tmpb7jmfh82
/tmp/tmp0rve1g3t
/tmp/tmp9k1e2e1t
/tmp/tmpbyc1dhl6
/tmp/tmp60oj33c9
/tmp/tmp6jw60l08
/tmp/tmpnd8p104t
/tmp/tmp15k7lz8f
/tmp/tmpy8yt5y4k
/tmp/tmpog719x

0.8095238095238095

In [116]:
moses_est.models_[0].model

'and(or(and(or(and(or($f117 $f217) $f514) $f509 !$f694) !$f382) and(or(!$f622 !$f917) !$f509 $f514 !$f991)) !$f817 !$f957)'

In [56]:
assoc_mat = np.load("data/bmm/feat_net.npy")
# assoc_mat = np.abs(assoc_mat)
assoc_mat

array([[1.     , 0.     , 1.3997 , ..., 1.     , 0.     , 0.     ],
       [0.     , 1.     , 0.     , ..., 0.     , 1.4212 , 0.     ],
       [1.3997 , 0.     , 1.     , ..., 0.     , 0.     , 0.17373],
       ...,
       [1.     , 0.     , 0.     , ..., 1.     , 0.     , 0.     ],
       [0.     , 1.4212 , 0.     , ..., 0.     , 1.     , 1.1482 ],
       [0.     , 0.     , 0.17373, ..., 0.     , 1.1482 , 1.     ]])

In [60]:
dist_mat, precd = scipy.sparse.csgraph.shortest_path(scipy.sparse.csc_matrix(assoc_mat), directed=False, method="J", return_predecessors=True)
dist_mat[1, np.array([79, 260, 293, 637, 839])]

array([0.043899 , 0.0427118, 0.0473448, 0.0342591, 0.0396434])

In [119]:
from notebooks.manifold_reg.MosesEstimator import *

feature_count(cv_5.best_estimator_.models_, [79, 260, 293, 637, 839])

{0: {'count': 8, 'causal': 'No'},
 514: {'count': 100, 'causal': 'No'},
 622: {'count': 100, 'causal': 'No'},
 944: {'count': 100, 'causal': 'No'},
 917: {'count': 100, 'causal': 'No'},
 694: {'count': 100, 'causal': 'No'},
 855: {'count': 100, 'causal': 'No'},
 118: {'count': 100, 'causal': 'No'},
 509: {'count': 100, 'causal': 'No'},
 382: {'count': 100, 'causal': 'No'},
 991: {'count': 100, 'causal': 'No'},
 26: {'count': 8, 'causal': 'No'},
 840: {'count': 7, 'causal': 'No'},
 12: {'count': 2, 'causal': 'No'},
 24: {'count': 8, 'causal': 'No'},
 48: {'count': 8, 'causal': 'No'},
 152: {'count': 6, 'causal': 'No'},
 189: {'count': 7, 'causal': 'No'},
 310: {'count': 4, 'causal': 'No'},
 339: {'count': 8, 'causal': 'No'},
 395: {'count': 1, 'causal': 'No'},
 459: {'count': 5, 'causal': 'No'},
 584: {'count': 8, 'causal': 'No'},
 496: {'count': 6, 'causal': 'No'},
 937: {'count': 5, 'causal': 'No'},
 491: {'count': 2, 'causal': 'No'},
 921: {'count': 1, 'causal': 'No'}}